diff --git a/.github/filters.yaml b/.github/filters.yaml index 7ba093ed889..8e728173048 100644 --- a/.github/filters.yaml +++ b/.github/filters.yaml @@ -17,6 +17,7 @@ all: docs: - 'docs/**' + - 'fern/**' - '**/*.md' - '**/*.rst' - '**/*.txt' diff --git a/docs/kubernetes/dynamo_operator.md b/docs/kubernetes/dynamo_operator.md index 4e926841333..43e1de8d8fe 100644 --- a/docs/kubernetes/dynamo_operator.md +++ b/docs/kubernetes/dynamo_operator.md @@ -86,6 +86,7 @@ helm install dynamo-test dynamo-platform-${RELEASE_VERSION}.tgz \ --create-namespace \ --set dynamo-operator.namespaceRestriction.enabled=true \ --set dynamo-operator.controllerManager.manager.image.tag=v2.0.0-beta +``` **Observability:** diff --git a/fern/.gitignore b/fern/.gitignore new file mode 100644 index 00000000000..3dc347093d9 --- /dev/null +++ b/fern/.gitignore @@ -0,0 +1,5 @@ +**/.preview +**/.definition + +# Include logos +!*.svg diff --git a/fern/assets/img/architecture.png b/fern/assets/img/architecture.png new file mode 100644 index 00000000000..c285b70aa6f Binary files /dev/null and b/fern/assets/img/architecture.png differ diff --git a/fern/assets/img/disagg-perf-benefit.png b/fern/assets/img/disagg-perf-benefit.png new file mode 100644 index 00000000000..099a0e8bc54 Binary files /dev/null and b/fern/assets/img/disagg-perf-benefit.png differ diff --git a/fern/assets/img/dynamo-deploy.png b/fern/assets/img/dynamo-deploy.png new file mode 100644 index 00000000000..aaea1906806 Binary files /dev/null and b/fern/assets/img/dynamo-deploy.png differ diff --git a/fern/assets/img/dynamo-flow.png b/fern/assets/img/dynamo-flow.png new file mode 100644 index 00000000000..672bc6309d9 Binary files /dev/null and b/fern/assets/img/dynamo-flow.png differ diff --git a/fern/assets/img/favicon.png b/fern/assets/img/favicon.png new file mode 100644 index 00000000000..308a5e6e209 Binary files /dev/null and b/fern/assets/img/favicon.png differ diff --git a/fern/assets/img/frontpage-architecture.png 
b/fern/assets/img/frontpage-architecture.png new file mode 100644 index 00000000000..1c8c8d42da5 Binary files /dev/null and b/fern/assets/img/frontpage-architecture.png differ diff --git a/fern/assets/img/frontpage-banner.png b/fern/assets/img/frontpage-banner.png new file mode 100644 index 00000000000..e06c52ba609 Binary files /dev/null and b/fern/assets/img/frontpage-banner.png differ diff --git a/fern/assets/img/frontpage-gpu-evolution.png b/fern/assets/img/frontpage-gpu-evolution.png new file mode 100644 index 00000000000..8c483620c1a Binary files /dev/null and b/fern/assets/img/frontpage-gpu-evolution.png differ diff --git a/fern/assets/img/frontpage-gpu-vertical.png b/fern/assets/img/frontpage-gpu-vertical.png new file mode 100644 index 00000000000..169beee9af6 Binary files /dev/null and b/fern/assets/img/frontpage-gpu-vertical.png differ diff --git a/fern/assets/img/grafana-disagg-trace.png b/fern/assets/img/grafana-disagg-trace.png new file mode 100644 index 00000000000..1e41bc4d4ec Binary files /dev/null and b/fern/assets/img/grafana-disagg-trace.png differ diff --git a/fern/assets/img/grafana-dynamo-composite.png b/fern/assets/img/grafana-dynamo-composite.png new file mode 100644 index 00000000000..eba18e0b06d Binary files /dev/null and b/fern/assets/img/grafana-dynamo-composite.png differ diff --git a/fern/assets/img/grafana-k8s.png b/fern/assets/img/grafana-k8s.png new file mode 100644 index 00000000000..2c9ea3018b4 Binary files /dev/null and b/fern/assets/img/grafana-k8s.png differ diff --git a/fern/assets/img/h100-decode-performance.png b/fern/assets/img/h100-decode-performance.png new file mode 100644 index 00000000000..dfc2c7a2e7a Binary files /dev/null and b/fern/assets/img/h100-decode-performance.png differ diff --git a/fern/assets/img/h100-prefill-performance.png b/fern/assets/img/h100-prefill-performance.png new file mode 100644 index 00000000000..0d5b499403f Binary files /dev/null and b/fern/assets/img/h100-prefill-performance.png differ diff 
--git a/fern/assets/img/itl-interpolation.png b/fern/assets/img/itl-interpolation.png new file mode 100644 index 00000000000..356c986555b Binary files /dev/null and b/fern/assets/img/itl-interpolation.png differ diff --git a/fern/assets/img/kv-cache-mgr-design.png b/fern/assets/img/kv-cache-mgr-design.png new file mode 100644 index 00000000000..18cae8e0c4e Binary files /dev/null and b/fern/assets/img/kv-cache-mgr-design.png differ diff --git a/fern/assets/img/kv-cache-mgr.png b/fern/assets/img/kv-cache-mgr.png new file mode 100644 index 00000000000..a8bec363586 Binary files /dev/null and b/fern/assets/img/kv-cache-mgr.png differ diff --git a/fern/assets/img/kv-routing.png b/fern/assets/img/kv-routing.png new file mode 100644 index 00000000000..a24d38de810 Binary files /dev/null and b/fern/assets/img/kv-routing.png differ diff --git a/fern/assets/img/kvbm-agg-performance.png b/fern/assets/img/kvbm-agg-performance.png new file mode 100644 index 00000000000..3d2863cc2d1 Binary files /dev/null and b/fern/assets/img/kvbm-agg-performance.png differ diff --git a/fern/assets/img/kvbm-architecture.png b/fern/assets/img/kvbm-architecture.png new file mode 100644 index 00000000000..90ae3ac1ef1 Binary files /dev/null and b/fern/assets/img/kvbm-architecture.png differ diff --git a/fern/assets/img/kvbm-components.png b/fern/assets/img/kvbm-components.png new file mode 100644 index 00000000000..afa9f2dd25a Binary files /dev/null and b/fern/assets/img/kvbm-components.png differ diff --git a/fern/assets/img/kvbm-data-flows.png b/fern/assets/img/kvbm-data-flows.png new file mode 100644 index 00000000000..2358da2a2d7 Binary files /dev/null and b/fern/assets/img/kvbm-data-flows.png differ diff --git a/fern/assets/img/kvbm-integrations.png b/fern/assets/img/kvbm-integrations.png new file mode 100644 index 00000000000..ffb64f3874e Binary files /dev/null and b/fern/assets/img/kvbm-integrations.png differ diff --git a/fern/assets/img/kvbm-internal-arch.png 
b/fern/assets/img/kvbm-internal-arch.png new file mode 100644 index 00000000000..3e5c9c76e34 Binary files /dev/null and b/fern/assets/img/kvbm-internal-arch.png differ diff --git a/fern/assets/img/kvbm-metrics-grafana.png b/fern/assets/img/kvbm-metrics-grafana.png new file mode 100644 index 00000000000..b68b707ab06 Binary files /dev/null and b/fern/assets/img/kvbm-metrics-grafana.png differ diff --git a/fern/assets/img/kvbm-offload.png b/fern/assets/img/kvbm-offload.png new file mode 100644 index 00000000000..0da6af2a1d9 Binary files /dev/null and b/fern/assets/img/kvbm-offload.png differ diff --git a/fern/assets/img/kvbm-onboard-disk2device.png b/fern/assets/img/kvbm-onboard-disk2device.png new file mode 100644 index 00000000000..2354e7e2bac Binary files /dev/null and b/fern/assets/img/kvbm-onboard-disk2device.png differ diff --git a/fern/assets/img/kvbm-onboard-host2device.png b/fern/assets/img/kvbm-onboard-host2device.png new file mode 100644 index 00000000000..fe8ad9f575d Binary files /dev/null and b/fern/assets/img/kvbm-onboard-host2device.png differ diff --git a/fern/assets/img/nvidia-logo-dark.svg b/fern/assets/img/nvidia-logo-dark.svg new file mode 100644 index 00000000000..6798f1596ae --- /dev/null +++ b/fern/assets/img/nvidia-logo-dark.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/fern/assets/img/nvidia-logo.svg b/fern/assets/img/nvidia-logo.svg new file mode 100644 index 00000000000..08844b77b9e --- /dev/null +++ b/fern/assets/img/nvidia-logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/fern/assets/img/pd-interpolation.png b/fern/assets/img/pd-interpolation.png new file mode 100644 index 00000000000..68c3e49fee0 Binary files /dev/null and b/fern/assets/img/pd-interpolation.png differ diff --git a/fern/assets/img/planner-perf.png b/fern/assets/img/planner-perf.png new file mode 100644 index 00000000000..9a8e8cf6271 Binary files /dev/null and b/fern/assets/img/planner-perf.png differ diff --git 
a/fern/assets/img/planner-tensorboard.png b/fern/assets/img/planner-tensorboard.png new file mode 100644 index 00000000000..a42127a9e4f Binary files /dev/null and b/fern/assets/img/planner-tensorboard.png differ diff --git a/fern/assets/img/prefill-time.png b/fern/assets/img/prefill-time.png new file mode 100644 index 00000000000..25630974de4 Binary files /dev/null and b/fern/assets/img/prefill-time.png differ diff --git a/fern/assets/img/prometheus-k8s.png b/fern/assets/img/prometheus-k8s.png new file mode 100644 index 00000000000..e754f3d5d1d Binary files /dev/null and b/fern/assets/img/prometheus-k8s.png differ diff --git a/fern/assets/img/trace.png b/fern/assets/img/trace.png new file mode 100644 index 00000000000..7cc6eb09b19 Binary files /dev/null and b/fern/assets/img/trace.png differ diff --git a/fern/docs.yml b/fern/docs.yml new file mode 100644 index 00000000000..ba8c319e4e6 --- /dev/null +++ b/fern/docs.yml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +instances: + - url: ai-dynamo.docs.buildwithfern.com + +title: NVIDIA Dynamo Documentation + +# Version configuration +versions: + - display-name: Next + path: ./versions/next.yml + +# GitHub repository link in navbar +navbar-links: + - type: github + value: https://github.com/ai-dynamo/dynamo + +# NVIDIA branding colors +colors: + accent-primary: + dark: "#76B900" + light: "#4A7300" + background: + dark: "#1A1A1A" + light: "#FFFFFF" + +# Logo and favicon +logo: + href: / + light: ./assets/img/nvidia-logo.svg + dark: ./assets/img/nvidia-logo-dark.svg + height: 50 + +favicon: ./assets/img/favicon.png diff --git a/fern/fern.config.json b/fern/fern.config.json new file mode 100644 index 00000000000..be7914486ab --- /dev/null +++ b/fern/fern.config.json @@ -0,0 +1,4 @@ +{ + "organization": "ai-dynamo", + "version": "3.29.1" +} diff --git a/fern/pages/agents/tool-calling.md b/fern/pages/agents/tool-calling.md new file mode 100644 index 00000000000..142c88dec18 --- /dev/null +++ b/fern/pages/agents/tool-calling.md @@ -0,0 +1,189 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Tool Calling with Dynamo" +--- + +You can connect Dynamo to external tools and services using function calling (also known as tool calling). By providing a list of available functions, Dynamo can choose +to output function arguments for the relevant function(s) which you can execute to augment the prompt with relevant external information. + +Tool calling (AKA function calling) is controlled using the `tool_choice` and `tools` request parameters. + + +## Prerequisites + +To enable this feature, you should set the following flag while launching the backend worker + +- `--dyn-tool-call-parser` : select the parser from the available parsers list using the below command + +```bash +# can be vllm, sglang, trtllm, etc. based on your installation +python -m dynamo. 
--help" +``` + + +If no tool call parser is provided by the user, Dynamo will try to use default tool call parsing based on `` and `<|python_tag|>` tool tags. + + + +If your model's default chat template doesn't support tool calling, but the model itself does, you can specify a custom chat template per worker +with `python -m dynamo. --custom-jinja-template `. + + + +Parser to Model Mapping + +| Parser Name | Supported Models | +|-------------|-----------------------------------------------------------------------| +| hermes | Qwen/Qwen2.5-*, Qwen/QwQ-32B, NousResearch/Hermes-2-Pro-*, NousResearch/Hermes-2-Theta-*, NousResearch/Hermes-3-* | +| mistral | mistralai/Mistral-7B-Instruct-v0.3, Additional mistral function-calling models are compatible as well.| +| llama3_json | meta-llama/Llama-3.1-*, meta-llama/Llama-3.2-* | +| harmony | openai/gpt-oss-* | +| nemotron_deci | nvidia/nemotron-* | +| phi4 | Phi-4-* | +| deepseek_v3 | deepseek-ai/DeepSeek-V3, deepseek-ai/DeepSeek-R1, deepseek-ai/DeepSeek-R1-0528 | +| deepseek_v3_1 | deepseek-ai/DeepSeek-V3.1 | +| pythonic | meta-llama/Llama-4-* | +| jamba | ai21labs/AI21-Jamba-*-1.5, ai21labs/AI21-Jamba-*-1.6, ai21labs/AI21-Jamba-*-1.7, | + + +## Examples + +### Launch Dynamo Frontend and Backend + +```bash +# launch backend worker +python -m dynamo.vllm --model openai/gpt-oss-20b --dyn-tool-call-parser harmony + +# launch frontend worker +python -m dynamo.frontend +``` + +### Tool Calling Request Examples + +- Example 1 +```python +from openai import OpenAI +import json + +client = OpenAI(base_url="http://localhost:8081/v1", api_key="dummy") + +def get_weather(location: str, unit: str): + return f"Getting the weather for {location} in {unit}..." 
+tool_functions = {"get_weather": get_weather} + +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + }, + "required": ["location", "unit"] + } + } +}] + +response = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=[{"role": "user", "content": "What's the weather like in San Francisco in Celsius?"}], + tools=tools, + tool_choice="auto", + max_tokens=10000 +) +print(f"{response}") +tool_call = response.choices[0].message.tool_calls[0].function +print(f"Function called: {tool_call.name}") +print(f"Arguments: {tool_call.arguments}") +print(f"Result: {tool_functions[tool_call.name](**json.loads(tool_call.arguments))}") +``` + +- Example 2 +```python + +# Use tools defined in example 1 + +time_tool = { + "type": "function", + "function": { + "name": "get_current_time_nyc", + "description": "Get the current time in NYC.", + "parameters": {} + } +} + + +tools.append(time_tool) + +messages = [ + {"role": "user", "content": "What's the current time in New York?"} +] + + +response = client.chat.completions.create( + model="openai/gpt-oss-20b", #client.models.list().data[1].id, + messages=messages, + tools=tools, + tool_choice="auto", + max_tokens=100, +) +print(f"{response}") +tool_call = response.choices[0].message.tool_calls[0].function +print(f"Function called: {tool_call.name}") +print(f"Arguments: {tool_call.arguments}") +``` + +- Example 3 + + +```python + +tools = [ + { + "type": "function", + "function": { + "name": "get_tourist_attractions", + "description": "Get a list of top tourist attractions for a given city.", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The name of the city to 
find attractions for.", + } + }, + "required": ["city"], + }, + }, + }, +] + +def get_messages(): + return [ + { + "role": "user", + "content": ( + "I'm planning a trip to Tokyo next week. what are some top tourist attractions in Tokyo? " + ), + }, + ] + + +messages = get_messages() + +response = client.chat.completions.create( + model="openai/gpt-oss-20b", + messages=messages, + tools=tools, + tool_choice="auto", + max_tokens=100, +) +print(f"{response}") +tool_call = response.choices[0].message.tool_calls[0].function +print(f"Function called: {tool_call.name}") +print(f"Arguments: {tool_call.arguments}") +``` diff --git a/fern/pages/api/nixl-connect/README.md b/fern/pages/api/nixl-connect/README.md new file mode 100644 index 00000000000..15e77d8d4b3 --- /dev/null +++ b/fern/pages/api/nixl-connect/README.md @@ -0,0 +1,171 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo NIXL Connect" +--- + +Dynamo NIXL Connect specializes in moving data between models/workers in a Dynamo Graph, and for the use cases where registration and memory regions need to be dynamic. +Dynamo connect provides utilities for such use cases, using the NIXL-based I/O subsystem via a set of Python classes. +The relaxed registration comes with some performance overheads, but simplifies the integration process. +Especially for larger data transfer operations, such as between models in a multi-model graph, the overhead would be marginal. +The `dynamo.nixl_connect` library can be imported by any Dynamo container hosted application. + + +Dynamo NIXL Connect will pick the best available method of data transfer available to it. +The available methods depend on the hardware and software configuration of the machines and network running the graph. 
+GPU Direct RDMA operations require that both ends of the operation have: +- NIC and GPU capable of performing RDMA operations +- Device drivers that support GPU-NIC direct interactions (aka "zero copy") and RDMA operations +- Network that supports InfiniBand or RoCE +With any of the above not satisfied, GPU Direct RDMA will not be available to the graph's workers, and less-optimal methods will be utilized to ensure basic functionality. +For additional information, please read this [GPUDirect RDMA](https://docs.nvidia.com/cuda/pdf/GPUDirect_RDMA.pdf) document. + + +```python +import dynamo.nixl_connect +``` + +All operations using the NIXL Connect library begin with the [`Connector`](connector.md) class and the type of operation required. +There are four types of supported operations: + + 1. **Register local readable memory**: + + Register local memory buffer(s) with the NIXL subsystem to enable a remote worker to read from. + + 2. **Register local writable memory**: + + Register local memory buffer(s) with the NIXL subsystem to enable a remote worker to write to. + + 3. **Read from registered, remote memory**: + + Read remote memory buffer(s), registered by a remote worker to be readable, into local memory buffer(s). + + 4. **Write to registered, remote memory**: + + Write local memory buffer(s) to remote memory buffer(s) registered by a remote worker to writable. + +When available, by connecting correctly paired operations, high-throughput GPU Direct RDMA data transfers can be completed. +Given the list above, the correct pairing of operations would be 1 & 3 or 2 & 4. +Where one side is a "(read|write)-able operation" and the other is its correctly paired "(read|write) operation". +Specifically, a read operation must be paired with a readable operation, and a write operation must be paired with a writable operation. 
+ +```mermaid +sequenceDiagram + participant LocalWorker + participant RemoteWorker + participant NIXL + + LocalWorker ->> NIXL: Register memory (Descriptor) + RemoteWorker ->> NIXL: Register memory (Descriptor) + LocalWorker ->> LocalWorker: Create Readable/WritableOperation + LocalWorker ->> RemoteWorker: Send NIXL metadata (via HTTP/TCP+NATS) + RemoteWorker ->> NIXL: Begin Read/WriteOperation with metadata + NIXL -->> RemoteWorker: Data transfer + RemoteWorker -->> LocalWorker: Notify completion (unblock awaiter) +``` + +## Examples + +### Generic Example + +In the diagram below, Local creates a [`WritableOperation`](writable-operation.md) intended to receive data from Remote. +Local then sends metadata about the requested operation to Remote. +Remote then uses the metadata to create a [`WriteOperation`](write-operation.md) which will perform the GPU Direct RDMA memory transfer, when available, from Remote's GPU memory to Local's GPU memory. + +```mermaid +--- +title: Write Operation Between Two Workers (RDMA available) +--- +flowchart LR + c1[Remote] --"3: .begin_write()"--- WriteOperation + WriteOperation e1@=="4: GPU Direct RDMA"==> WritableOperation + WritableOperation --"1: .create_writable()"--- c2[Local] + c2 e2@--"2: RDMA Metadata via HTTP"--> c1 + e1@{ animate: true; } + e2@{ animate: true; } +``` + + +When RDMA isn't available, the NIXL data transfer will still complete using non-accelerated methods. + + +### Multimodal Example + +In the case of the [Dynamo Multimodal Disaggregated Example](../../multimodal/vllm.md): + + 1. The HTTP frontend accepts a text prompt and a URL to an image. + + 2. The prompt and URL are then enqueued with the Processor before being dispatched to the first available Decode Worker. + + 3. Decode Worker then requests a Prefill Worker to provide key-value data for the LLM powering the Decode Worker. + + 4. Prefill Worker then requests that the image be processed and provided as embeddings by the Encode Worker. + + 5. 
Encode Worker acquires the image, processes it, performs inference on the image using a specialized vision model, and finally provides the embeddings to Prefill Worker. + + 6. Prefill Worker receives the embeddings from Encode Worker and generates a key-value cache (KV$) update for Decode Worker's LLM and writes the update directly to the GPU memory reserved for the data. + + 7. Finally, Decode Worker performs the requested inference. + +```mermaid +--- +title: Multimodal Disaggregated Workflow +--- +flowchart LR + p0[HTTP Frontend] i0@--"text prompt"-->p1[Processor] + p0 i1@--"url"-->p1 + p1 i2@--"prompt"-->dw[Decode Worker] + p1 i3@--"url"-->dw + dw i4@--"prompt"-->pw[Prefill Worker] + dw i5@--"url"-->pw + pw i6@--"url"-->ew[Encode Worker] + ew o0@=="image embeddings"==>pw + pw o1@=="kv_cache updates"==>dw + dw o2@--"inference results"-->p0 + + i0@{ animate: true; } + i1@{ animate: true; } + i2@{ animate: true; } + i3@{ animate: true; } + i4@{ animate: true; } + i5@{ animate: true; } + i6@{ animate: true; } + o0@{ animate: true; } + o1@{ animate: true; } + o2@{ animate: true; } +``` + + +In this example, it is the data transfer between the Prefill Worker and the Encode Worker that utilizes the Dynamo NIXL Connect library. +The KV Cache transfer between Decode Worker and Prefill Worker utilizes a different connector that also uses the NIXL-based I/O subsystem underneath. 
+ + +#### Code Examples + +See [MultimodalPDWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) or [MultimodalDecodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) from our Multimodal example, +for how they coordinate directly with the Encode Worker by creating a [`WritableOperation`](writable-operation.md), +sending the operation's metadata via Dynamo's round-robin dispatcher, and awaiting the operation for completion before making use of the transferred data. + +See [MultimodalEncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) from our Multimodal example, +for how the resulting embeddings are registered with the NIXL subsystem by creating a [`Descriptor`](descriptor.md), +a [`WriteOperation`](write-operation.md) is created using the metadata provided by the requesting worker, +and the worker awaits for the data transfer to complete for yielding a response. 
+ + +## Python Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) + + +## References + + - [NVIDIA Dynamo](https://developer.nvidia.com/dynamo) @ [GitHub](https://github.com/ai-dynamo/dynamo) + - [NVIDIA Inference Transfer Library (NIXL)](https://developer.nvidia.com/blog/introducing-nvidia-dynamo-a-low-latency-distributed-inference-framework-for-scaling-reasoning-ai-models/#nvidia_inference_transfer_library_nixl_low-latency_hardware-agnostic_communication%C2%A0) @ [GitHub](https://github.com/ai-dynamo/nixl) + - [Dynamo Multimodal Example](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal) + - [NVIDIA GPU Direct](https://developer.nvidia.com/gpudirect) diff --git a/fern/pages/api/nixl-connect/connector.md b/fern/pages/api/nixl-connect/connector.md new file mode 100644 index 00000000000..28714eb2a20 --- /dev/null +++ b/fern/pages/api/nixl-connect/connector.md @@ -0,0 +1,179 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.Connector" +--- + +Core class for managing the connection between workers in a distributed environment. +Use this class to create readable and writable operations, or read and write data to remote workers. + +This class provides a "pythonic" interface using NIXL library to utilize GPU Direct RDMA accelerated, when available, data transfers between models hosted by different workers in a Dynamo graph. +The connector provides two methods of moving data between workers: + + - Preparing local memory to be written to by a remote worker. + + - Preparing local memory to be read by a remote worker. 
+ +In both cases, local memory is registered with the NIXL-based I/O subsystem via the [`Descriptor`](descriptor.md) class and provided to the connector. +When RDMA is available, the connector then configures the RDMA subsystem to expose the memory for the requested operation and returns an operation control object; +otherwise the connector will select the best available RDMA alternative. +The operation control object, either a [`ReadableOperation`](readable-operation.md) or a [`WritableOperation`](writable-operation.md), +provides NIXL metadata ([RdmaMetadata](rdma-metadata.md)) via its `.metadata()` method, functionality to query the operation's current state, as well as the ability to cancel the operation prior to its completion. + +The NIXL metadata must be provided to the remote worker expected to complete the operation. +The metadata contains required information (identifiers, keys, etc.) which enables the remote worker to interact with the provided memory. + + +NIXL metadata contains a worker's address as well as security keys to access specific registered memory descriptors. +This data provides direct memory access between workers, and should be considered sensitive and therefore handled accordingly. + + + +## Example Usage + +```python + @async_on_start + async def async_init(self): + self.connector = dynamo.nixl_connect.Connector() +``` + + +See [`ReadOperation`](read-operation.md#example-usage), [`ReadableOperation`](readable-operation.md#example-usage), +[`WritableOperation`](writable-operation.md#example-usage), and [`WriteOperation`](write-operation.md#example-usage) +for additional examples. + + + +## Methods + +### `begin_read` + +```python +async def begin_read( + self, + remote_metadata: RdmaMetadata, + local_descriptors: Descriptor | list[Descriptor], +) -> ReadOperation: +``` + +Creates a [`ReadOperation`](read-operation.md) for transferring data from a remote worker. 
+ +To create the operation, the serialized request from a remote worker's [`ReadableOperation`](readable-operation.md) +along with a matching set of local memory descriptors which reference memory intended to receive data from the remote worker +must be provided. +The serialized request must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS. + +Once created, data transfer will begin immediately. + +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. + +Use [`.wait_for_completion()`](read-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error. + +### `begin_write` + +```python +async def begin_write( + self, + local_descriptors: Descriptor | list[Descriptor], + remote_metadata: RdmaMetadata, +) -> WriteOperation: +``` + +Creates a [`WriteOperation`](write-operation.md) for transferring data to a remote worker. + +To create the operation, the serialized request from a remote worker's [`WritableOperation`](writable-operation.md) +along with a matching set of local memory descriptors which reference memory to be transferred to the remote worker +must be provided. +The serialized request must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS. + +Once created, data transfer will begin immediately. + +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. + +Use [`.wait_for_completion()`](write-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error. 
+ +### `create_readable` + +```python +async def create_readable( + self, + local_descriptors: Descriptor | list[Descriptor], +) -> ReadableOperation: +``` + +Creates a [`ReadableOperation`](readable-operation.md) for transferring data to a remote worker. + +To create the operation, a set of local memory descriptors must be provided that reference memory intended to be transferred to a remote worker. +Once created, the memory referenced by the provided descriptors becomes immediately readable by a remote worker with the necessary metadata. +The metadata required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method. +Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS. + +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. + +Use [`.wait_for_completion()`](readable-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error. + +### `create_writable` + +```python +async def create_writable( + self, + local_descriptors: Descriptor | list[Descriptor], +) -> WritableOperation: +``` + +Creates a [`WritableOperation`](writable-operation.md) for transferring data from a remote worker. + +To create the operation, a set of local memory descriptors must be provided which reference memory intended to receive data from a remote worker. +Once created, the memory referenced by the provided descriptors becomes immediately writable by a remote worker with the necessary metadata. +The metadata required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method. +Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS. 
+ +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. + +Use [`.wait_for_completion()`](writable-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error. + + +## Properties + +### `hostname` + +```python +@property +def hostname(self) -> str: +``` + +Gets the name of the current worker's host. + +### `is_cuda_available` + +```python +@cached_property +def is_cuda_available(self) -> bool: +``` + +Gets `True` when CUDA is available for the selected array module (most likely CuPy); otherwise `False`. + +### `name` + +```python +@property +def name(self) -> str | None: +``` + +Gets the Dynamo component name used by the connector. + + +## Related Classes + + - [Descriptor](descriptor.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/descriptor.md b/fern/pages/api/nixl-connect/descriptor.md new file mode 100644 index 00000000000..adcd7a08a38 --- /dev/null +++ b/fern/pages/api/nixl-connect/descriptor.md @@ -0,0 +1,68 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.Descriptor" +--- + +Memory descriptor that ensures memory is registered with the NIXL-base I/O subsystem. +Memory must be registered with the NIXL subsystem to enable interaction with the memory. + +Descriptor objects are administrative and do not copy, move, or otherwise modify the registered memory. + +There are four ways to create a descriptor: + + 1. From a `torch.Tensor` object. 
Device information will be derived from the provided object. + + 2. From a `tuple` containing either a NumPy or CuPy `ndarray` and information describing where the memory resides (Host/CPU vs GPU). + + 3. From a Python `bytes` object. Memory is assumed to reside in CPU addressable host memory. + + 4. From a `tuple` comprised of the address of the memory, its size in bytes, and device information. + An optional reference to a Python object can be provided to avoid garbage collection issues. + + +## Methods + +### `register_memory` + +```python +def register_memory(self, connector: Connector) -> None: +``` + +Instructs the descriptor to register its memory buffer with the NIXL-based I/O subsystem. + +Calling this method more than once on the same descriptor has no effect. + +When the descriptor is assigned to a NIXL operation, it will be automatically registered if it was not explicitly registered. + + +## Properties + +### `device` + +```python +@property +def device(self) -> Device: +``` + +Gets a reference to the [`Device`](device.md) that contains the buffer the descriptor represents. + +### `size` + +```python +@property +def size(self) -> int: +``` + +Gets the size of the memory allocation the descriptor represents. + +## Related Classes + + - [Connector](connector.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/device-kind.md b/fern/pages/api/nixl-connect/device-kind.md new file mode 100644 index 00000000000..c76eb4e77c7 --- /dev/null +++ b/fern/pages/api/nixl-connect/device-kind.md @@ -0,0 +1,30 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.DeviceKind(IntEnum)" +--- + +Represents the kind of device a [`Device`](device.md) object represents. + + +## Values + +### `CUDA` + +CUDA addressable device (GPU) memory. + +### `HOST` + +System (CPU) memory. + + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadOperation](read-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/device.md b/fern/pages/api/nixl-connect/device.md new file mode 100644 index 00000000000..a1754ac6323 --- /dev/null +++ b/fern/pages/api/nixl-connect/device.md @@ -0,0 +1,50 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.Device" +--- + +`Device` class describes the device a given allocation resides in. +Usually host (`"cpu"`) or GPU (`"cuda"`) memory. + +When a system contains multiple GPU devices, specific GPU devices can be identified by including their ordinal index number. +For example, to reference the second GPU in a system `"cuda:1"` can be used. + +By default, when `"cuda"` is provided, it is assumed to be `"cuda:0"` or the first GPU enumerated by the system. + + +## Properties + +### `id` + +```python +@property +def id(self) -> int: +``` + +Gets the identity, or ordinal, of the device. + +When the device is the [`HOST`](device-kind.md#host), this value is always `0`. + +When the device is a [`GPU`](device-kind.md#cuda), this value identifies a specific GPU. + +### `kind` + +```python +@property +def kind(self) -> DeviceKind: +``` + +Gets the [`DeviceKind`](device-kind.md) of device the instance references. 
+ + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [OperationStatus](operation-status.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [RdmaMetadata](rdma-metadata.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/operation-status.md b/fern/pages/api/nixl-connect/operation-status.md new file mode 100644 index 00000000000..a966fb2323d --- /dev/null +++ b/fern/pages/api/nixl-connect/operation-status.md @@ -0,0 +1,46 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.OperationStatus(IntEnum)" +--- + +Represents the current state or status of an operation. + + +## Values + +### `CANCELLED` + +The operation has been cancelled by the user or system. + +### `COMPLETE` + +The operation has been completed successfully. + +### `ERRORED` + +The operation has encountered an error and cannot be completed. + +### `IN_PROGRESS` + +The operation has been initialized and is in-progress (not completed, errored, or cancelled). + +### `INITIALIZED` + +The operation has been initialized and is ready to be processed. + +### `UNINITIALIZED` + +The operation has not been initialized yet and is not in a valid state. 
+ + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/rdma-metadata.md b/fern/pages/api/nixl-connect/rdma-metadata.md new file mode 100644 index 00000000000..e909dc2b070 --- /dev/null +++ b/fern/pages/api/nixl-connect/rdma-metadata.md @@ -0,0 +1,35 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.RdmaMetadata" +--- + +A Pydantic type intended to provide JSON serialized NIXL metadata about a [`ReadableOperation`](readable-operation.md) or [`WritableOperation`](writable-operation.md) object. +NIXL metadata contains detailed information about a worker process and how to access memory regions registered with the corresponding agent. +This data is required to perform data transfers using the NIXL-based I/O subsystem. + + +NIXL metadata contains information to connect corresponding backends across agents, as well as identification keys to access specific registered memory regions. +This data provides direct memory access between workers, and should be considered sensitive and therefore handled accordingly. + + +Use the respective class's `.metadata()` method to generate an `RdmaMetadata` object for an operation. + + +Classes using `RdmaMetadata` objects must be paired correctly. +[`ReadableOperation`](readable-operation.md) with [`ReadOperation`](read-operation.md), and +[`WritableOperation`](writable-operation.md) with [`WriteOperation`](write-operation.md). +Incorrect pairing will result in an error being raised.
+ + + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/read-operation.md b/fern/pages/api/nixl-connect/read-operation.md new file mode 100644 index 00000000000..8dcce65cede --- /dev/null +++ b/fern/pages/api/nixl-connect/read-operation.md @@ -0,0 +1,75 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.ReadOperation" +--- + +An operation which transfers data from a remote worker to the local worker. + +To create the operation, NIXL metadata ([RdmaMetadata](rdma-metadata.md)) from a remote worker's [`ReadableOperation`](readable-operation.md) +along with a matching set of local [`Descriptor`](descriptor.md) objects which reference memory intended to receive data from the remote worker must be provided. +The NIXL metadata must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS. + +Once created, data transfer will begin immediately. +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. + + +## Example Usage + +```python + async def read_from_remote( + self, + remote_metadata: dynamo.nixl_connect.RdmaMetadata, + local_tensor: torch.Tensor + ) -> None: + descriptor = dynamo.nixl_connect.Descriptor(local_tensor) + + with await self.connector.begin_read(remote_metadata, descriptor) as read_op: + # Wait for the operation to complete writing data from the remote worker to local_tensor. 
+ await read_op.wait_for_completion() +``` + + +## Methods + +### `cancel` + +```python +def cancel(self) -> None: +``` + +Instructs the NIXL subsystem to cancel the operation. +Completed operations cannot be cancelled. + +### `wait_for_completion` + +```python +async def wait_for_completion(self) -> None: +``` + +Blocks the caller until the memory from the remote worker has been transferred to the provided buffers. + + +## Properties + +### `status` + +```python +@property +def status(self) -> OperationStatus: +``` + +Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation. + + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadableOperation](readable-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/readable-operation.md b/fern/pages/api/nixl-connect/readable-operation.md new file mode 100644 index 00000000000..30c2d691dd2 --- /dev/null +++ b/fern/pages/api/nixl-connect/readable-operation.md @@ -0,0 +1,79 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.ReadableOperation" +--- + +An operation which enables a remote worker to read data from the local worker. + +To create the operation, a set of local [`Descriptor`](descriptor.md) objects must be provided that reference memory intended to be transferred to a remote worker. +Once created, the memory referenced by the provided descriptors becomes immediately readable by a remote worker with the necessary metadata. +The NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method.
+Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS. + +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. + + +## Example Usage + +```python + async def send_data( + self, + local_tensor: torch.Tensor + ) -> None: + descriptor = dynamo.nixl_connect.Descriptor(local_tensor) + + with await self.connector.create_readable(descriptor) as read_op: + op_metadata = read_op.metadata() + + # Send the metadata to the remote worker via sideband communication. + await self.notify_remote_data(op_metadata) + # Wait for the remote worker to complete its read operation of local_tensor. + # AKA send data to remote worker. + await read_op.wait_for_completion() +``` + + +## Methods + +### `metadata` + +```python +def metadata(self) -> RdmaMetadata: +``` + +Generates and returns the NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required for a remote worker to read from the operation. +Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS. + +### `wait_for_completion` + +```python +async def wait_for_completion(self) -> None: +``` + +Blocks the caller until the operation has received a completion signal from a remote worker. + + +## Properties + +### `status` + +```python +@property +def status(self) -> OperationStatus: +``` + +Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation. 
+ + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadOperation](read-operation.md) + - [WritableOperation](writable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/writable-operation.md b/fern/pages/api/nixl-connect/writable-operation.md new file mode 100644 index 00000000000..765869a775b --- /dev/null +++ b/fern/pages/api/nixl-connect/writable-operation.md @@ -0,0 +1,80 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.WritableOperation" +--- + +An operation which enables a remote worker to write data to the local worker. + +To create the operation, a set of local [`Descriptor`](descriptor.md) objects must be provided which reference memory intended to receive data from a remote worker. +Once created, the memory referenced by the provided descriptors becomes immediately writable by a remote worker with the necessary metadata. +The NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method. +Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS. + +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. +Cancellation is handled asynchronously.
+ + +## Example Usage + +```python + async def recv_data( + self, + local_tensor: torch.Tensor + ) -> None: + descriptor = dynamo.nixl_connect.Descriptor(local_tensor) + + with await self.connector.create_writable(descriptor) as write_op: + op_metadata = write_op.metadata() + + # Send the metadata to the remote worker via sideband communication. + await self.request_remote_data(op_metadata) + # Wait for the remote worker to complete its write operation to local_tensor. + # AKA receive data from remote worker. + await write_op.wait_for_completion() +``` + + +## Methods + +### `metadata` + +```python +def metadata(self) -> RdmaMetadata: +``` + +Generates and returns the NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required for a remote worker to write to the operation. +Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS. + +### `wait_for_completion` + +```python +async def wait_for_completion(self) -> None: +``` + +Blocks the caller until the operation has received a completion signal from a remote worker. + + +## Properties + +### `status` + +```python +@property +def status(self) -> OperationStatus: +``` + +Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation. + + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [WriteOperation](write-operation.md) diff --git a/fern/pages/api/nixl-connect/write-operation.md b/fern/pages/api/nixl-connect/write-operation.md new file mode 100644 index 00000000000..ba6a4b4ae2c --- /dev/null +++ b/fern/pages/api/nixl-connect/write-operation.md @@ -0,0 +1,76 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +title: "dynamo.nixl_connect.WriteOperation" +--- + +An operation which transfers data from the local worker to a remote worker. + +To create the operation, NIXL metadata ([RdmaMetadata](rdma-metadata.md)) from a remote worker's [`WritableOperation`](writable-operation.md) +along with a matching set of local [`Descriptor`](descriptor.md) objects which reference memory to be transferred to the remote worker must be provided. +The NIXL metadata must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS. + +Once created, data transfer will begin immediately. +Disposal of the object will instruct the NIXL subsystem to cancel the operation, +therefore the operation should be awaited until completed unless cancellation is intended. +Cancellation is handled asynchronously. + + +## Example Usage + +```python + async def write_to_remote( + self, + remote_metadata: dynamo.nixl_connect.RdmaMetadata, + local_tensor: torch.Tensor + ) -> None: + descriptor = dynamo.nixl_connect.Descriptor(local_tensor) + + with await self.connector.begin_write(descriptor, remote_metadata) as write_op: + # Wait for the operation to complete writing local_tensor to the remote worker. + await write_op.wait_for_completion() +``` + + +## Methods + +### `cancel` + +```python +def cancel(self) -> None: +``` + +Instructs the NIXL subsystem to cancel the operation. +Completed operations cannot be cancelled. + +### `wait_for_completion` + +```python +async def wait_for_completion(self) -> None: +``` + +Blocks the caller until all provided buffers have been transferred to the remote worker. + + +## Properties + +### `status` + +```python +@property +def status(self) -> OperationStatus: +``` + +Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation. 
+ + +## Related Classes + + - [Connector](connector.md) + - [Descriptor](descriptor.md) + - [Device](device.md) + - [OperationStatus](operation-status.md) + - [RdmaMetadata](rdma-metadata.md) + - [ReadOperation](read-operation.md) + - [ReadableOperation](readable-operation.md) + - [WritableOperation](writable-operation.md) diff --git a/fern/pages/backends/sglang/README.md b/fern/pages/backends/sglang/README.md new file mode 100644 index 00000000000..273bccf3f46 --- /dev/null +++ b/fern/pages/backends/sglang/README.md @@ -0,0 +1,273 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Running SGLang with Dynamo" +--- + +## Use the Latest Release + +We recommend using the latest stable release of dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +--- + +## Table of Contents +- [Feature Support Matrix](#feature-support-matrix) +- [Dynamo SGLang Integration](#dynamo-sglang-integration) +- [Installation](#installation) +- [Quick Start](#quick-start) +- [Aggregated Serving](#aggregated-serving) +- [Disaggregated Serving](#disaggregated-serving) +- [Deploy on SLURM or Kubernetes](#deployment) + +## Feature Support Matrix + +### Core Dynamo Features + +| Feature | SGLang | Notes | +|---------|--------|-------| +| [**Disaggregated Serving**](../../design-docs/disagg-serving.md) | ✅ | | +| [**Conditional Disaggregation**](../../design-docs/disagg-serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) | +| [**KV-Aware Routing**](../../router/kv-cache-routing.md) | ✅ | | +| 
[**SLA-Based Planner**](../../planner/sla-planner.md) | ✅ | | +| [**Multimodal Support**](../../multimodal/sglang.md) | ✅ | | +| [**KVBM**](../../kvbm/kvbm-architecture.md) | ❌ | Planned | + + +## Dynamo SGLang Integration + +Dynamo SGLang integrates SGLang engines into Dynamo's distributed runtime, enabling advanced features like disaggregated serving, KV-aware routing, and request migration while maintaining full compatibility with SGLang's engine arguments. + +### Argument Handling + +Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine arguments work identically**. You can pass any SGLang argument (like `--model-path`, `--tp`, `--trust-remote-code`) directly to `dynamo.sglang`. + +#### Dynamo-Specific Arguments + +| Argument | Description | Default | SGLang Equivalent | +|----------|-------------|---------|-------------------| +| `--endpoint` | Dynamo endpoint in `dyn://namespace.component.endpoint` format | Auto-generated based on mode | N/A | +| `--migration-limit` | Max times a request can migrate between workers for fault tolerance. See [Request Migration Architecture](../../fault-tolerance/request-migration.md). 
| `0` (disabled) | N/A | +| `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` | +| `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` | +| `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A | +| `--custom-jinja-template` | Use custom chat template for that model (takes precedence over default chat template in model repo) | `None` | `--chat-template` | + +#### Tokenizer Behavior + +- **Default (`--use-sglang-tokenizer` not set)**: Dynamo handles tokenization/detokenization via our blazing fast frontend and passes `input_ids` to SGLang +- **With `--use-sglang-tokenizer`**: SGLang handles tokenization/detokenization, Dynamo passes raw prompts + + +When using `--use-sglang-tokenizer`, only `v1/chat/completions` is available through Dynamo's frontend. + + +### Request Cancellation + +When a user cancels a request (e.g., by disconnecting from the frontend), the request is automatically cancelled across all workers, freeing compute resources for other requests. + +#### Cancellation Support Matrix + +| | Prefill | Decode | +|-|---------|--------| +| **Aggregated** | ✅ | ✅ | +| **Disaggregated** | ⚠️ | ✅ | + + +⚠️ SGLang backend currently does not support cancellation during remote prefill phase in disaggregated mode. + + +For more details, see the [Request Cancellation Architecture](../../fault-tolerance/request-cancellation.md) documentation. + +## Installation + +### Install latest release +We suggest using uv to install the latest release of ai-dynamo[sglang]. You can install it with `curl -LsSf https://astral.sh/uv/install.sh | sh` + +
+Expand for instructions + +```bash +# create a virtual env +uv venv --python 3.12 --seed +# install the latest release (which comes bundled with a stable sglang version) +uv pip install "ai-dynamo[sglang]" +``` + +
+ +### Install editable version for development + +
+Expand for instructions + +This requires having rust installed. We also recommend having a proper installation of the cuda toolkit as sglang requires `nvcc` to be available. + +```bash +# create a virtual env +uv venv --python 3.12 --seed +# build dynamo runtime bindings +uv pip install maturin +cd $DYNAMO_HOME/lib/bindings/python +maturin develop --uv +cd $DYNAMO_HOME +# installs sglang supported version along with dynamo +# include the prerelease flag to install flashinfer rc versions +uv pip install -e . +# install any sglang version >= 0.5.3.post2 +uv pip install "sglang[all]==0.5.3.post2" +``` + +
+ +### Using docker containers + +
+Expand for instructions + +We are in the process of shipping pre-built docker containers that contain installations of DeepEP, DeepGEMM, and NVSHMEM in order to support WideEP and P/D. For now, you can quickly build the container from source with the following command. + +```bash +cd $DYNAMO_ROOT +./container/build.sh \ + --framework SGLANG \ + --tag dynamo-sglang:latest \ +``` + +And then run it using + +```bash +docker run \ + --gpus all \ + -it \ + --rm \ + --network host \ + --shm-size=10G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + --cap-add CAP_SYS_PTRACE \ + --ipc host \ + dynamo-sglang:latest +``` + +
+ +## Quick Start + +Below we provide a guide that lets you run all of our common deployment patterns on a single node. + +### Start NATS and ETCD in the background + +Start using [Docker Compose](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml) + +```bash +docker compose -f deploy/docker-compose.yml up -d +``` + + +Each example corresponds to a simple bash script that runs the OpenAI compatible server, processor, and optional router (written in Rust) and LLM engine (written in Python) in a single terminal. You can easily take each command and run them in separate terminals. +Additionally - because we use sglang's argument parser, you can pass in any argument that sglang supports to the worker! + + + +### Aggregated Serving + +```bash +cd $DYNAMO_HOME/examples/backends/sglang +./launch/agg.sh +``` + +### Aggregated Serving with KV Routing + +```bash +cd $DYNAMO_HOME/examples/backends/sglang +./launch/agg_router.sh +``` + +### Aggregated Serving for Embedding Models + +Here's an example that uses the [Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B) model. + +```bash +cd $DYNAMO_HOME/examples/backends/sglang +./launch/agg_embed.sh +``` + +
+Send the following request to verify your deployment: + +```bash +curl localhost:8000/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-Embedding-4B", + "input": "Hello, world!" + }' +``` + +
+ +### Disaggregated serving + +See [SGLang Disaggregation](sglang-disaggregation.md) to learn more about how sglang and dynamo handle disaggregated serving. + + +```bash +cd $DYNAMO_HOME/examples/backends/sglang +./launch/disagg.sh +``` + +### Disaggregated Serving with KV Aware Prefill Routing + +```bash +cd $DYNAMO_HOME/examples/backends/sglang +./launch/disagg_router.sh +``` + +### Disaggregated Serving with Mixture-of-Experts (MoE) models and DP attention + +You can use this configuration to test out disaggregated serving with dp attention and expert parallelism on a single node before scaling to the full DeepSeek-R1 model across multiple nodes. + +```bash +# note this will require 4 GPUs +cd $DYNAMO_HOME/examples/backends/sglang +./launch/disagg_dp_attn.sh +``` + +### Testing the Deployment + +Send a test request to verify your deployment: + +```bash +curl localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" + } + ], + "stream": true, + "max_tokens": 30 + }' +``` + +## Deployment + +We currently provide deployment examples for Kubernetes and SLURM. + +## Kubernetes +- **[Deploying Dynamo with SGLang on Kubernetes](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/deploy/README.md)** + +## SLURM +- **[Deploying Dynamo with SGLang on SLURM](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/slurm_jobs/README.md)** diff --git a/fern/pages/backends/sglang/expert-distribution-eplb.md b/fern/pages/backends/sglang/expert-distribution-eplb.md new file mode 100644 index 00000000000..507fd3edd2a --- /dev/null +++ b/fern/pages/backends/sglang/expert-distribution-eplb.md @@ -0,0 +1,60 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Expert Parallelism Load Balancer (EPLB) in SGLang" +--- + +Mixture-of-Experts (MoE) models utilize a technique called Expert Parallelism (EP), where experts are distributed across multiple GPUs. While this allows for much larger and more powerful models, it can lead to an uneven workload distribution. Because the load on different experts may vary depending on the workload, some GPUs can become bottlenecks, forcing the entire system to wait. This imbalance leads to wasted compute cycles and increased memory usage. + +To address this, SGLang implements an Expert Parallelism Load Balancer (EPLB) inspired by the work in the DeepSeek-V3 paper. EPLB analyzes expert usage patterns and dynamically re-arranges the experts across the available GPUs to ensure a more balanced workload. + +## The EPLB Algorithm: Core Concepts + +The load balancing algorithm revolves around a few key ideas to achieve an optimal distribution of work. + +### Redundant Experts for Flexibility + +The core strategy is to create **redundant experts**. Instead of being limited to the model's original number of experts, EPLB can create duplicates of heavily-loaded experts. For example, if a model has 256 experts, you can configure EPLB to create an additional 32 "redundant" experts, bringing the total to 288. This pool of replicated experts is then strategically packed onto the available GPUs. A popular expert might be duplicated multiple times, while a moderately used expert might be grouped with several rarely used ones on a single GPU. + +### Group-Limited Routing for Efficiency + +Modern MoE models like DeepSeek-V3 use **group-limited expert routing**. In this design, experts are organized into groups, and routing decisions are constrained within these groups. EPLB can take advantage of this structure to reduce inter-node data traffic by attempting to place all experts from the same group onto the same node whenever possible. 
+ +### Load Balancing Policies + +The algorithm comes with two policies for different scenarios: + +1. **Hierarchical Load Balancing**: This policy is used when the number of server nodes evenly divides the number of expert groups. It first harnesses the group-limited routing by packing expert groups onto nodes to balance the load between nodes. Then, within each node, it replicates and packs the experts onto individual GPUs to balance the load locally. This is often used during prefill where the expert-parallel size might be smaller. + +2. **Global Load Balancing**: In all other cases, a global policy is used. It replicates experts globally without regard to their group affiliation and packs them onto individual GPUs. This policy is more general and can be adopted during the decoding stage with a larger expert-parallel size. + +## How SGLang Implements EPLB + +SGLang provides a robust implementation of EPLB, allowing for dynamic, online rebalancing of expert locations based on real-world traffic. + +### Dynamic Rebalancing + +You can enable dynamic rebalancing by setting the `--enable-eplb` flag. When enabled, the `EPLBManager` runs in the background. It periodically triggers a rebalance after a certain number of requests, configured with `--eplb-rebalance-num-iterations`. At each rebalance, it computes a new expert placement plan based on the latest usage statistics and updates the model's expert locations on the fly. + +### Expert Usage Recording + +To make intelligent balancing decisions, SGLang needs to collect data on expert usage. The `ExpertDistributionRecorder` is responsible for this, and its behavior is controlled by the `--expert-distribution-recorder-mode` flag. This flag determines the granularity of the collected data. When `enable_eplb` is on, this mode defaults to `stat` to gather statistics for rebalancing. The available modes are: + +- **`per_token`**: This is the most detailed mode. 
It records the specific expert choices for every single token processed by the model. While it provides the richest data, it also has the highest performance overhead. The raw, unaggregated data for each forward pass is stored. + +- **`per_pass`**: In this mode, SGLang records the aggregated expert usage counts for each individual forward pass. The data is not aggregated across different passes, giving you a snapshot of expert popularity for each batch of requests. + +- **`stat`**: This mode also records the exact expert usage counts for each forward pass, but it then aggregates these counts across multiple passes (the number of passes is determined by `--expert-distribution-recorder-buffer-size`). This provides a moving average of expert usage statistics and is the default when EPLB is enabled. + +- **`stat_approx`**: This mode is similar to `stat` but gathers _approximate_ statistics, usually from the DeepEP dispatcher. This method has lower overhead than `stat` but is less precise, especially for small batch sizes. It is a good choice when performance is critical. + +The collected statistics are then fed into the rebalancing algorithm to generate a new expert placement plan. + +### Initializing with a Pre-computed Distribution + +While SGLang can start with a simple default layout and learn a better one over time, you can also provide it with a pre-computed expert distribution to start with. The `--init-expert-location` flag allows you to specify a file path (`.pt` or `.json`) or a JSON string containing an expert layout. This is useful if you have already analyzed a representative workload offline and want the server to start immediately with a balanced configuration. If this flag is not set, it defaults to a `trivial` sequential layout. 
+ +### References and further reading + +- [SGLang Large Scale P/D + WideEP Deployment](https://lmsys.org/blog/2025-05-05-large-scale-ep/#expert-parallelism-load-balancer) +- [Deepseek's EPLB repository](https://github.com/deepseek-ai/EPLB) diff --git a/fern/pages/backends/sglang/gpt-oss.md b/fern/pages/backends/sglang/gpt-oss.md new file mode 100644 index 00000000000..441ac804ae8 --- /dev/null +++ b/fern/pages/backends/sglang/gpt-oss.md @@ -0,0 +1,47 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Running gpt-oss-120b Disaggregated with SGLang" +--- + +The gpt-oss-120b guide for SGLang is largely identical to the [guide for vLLM](../vllm/gpt-oss.md); +please use the vLLM guide as a reference, with the differing deployment steps highlighted below: + +# Launch the Deployment + +Note that GPT-OSS is a reasoning model with tool calling support. To +ensure the response is being processed correctly, the worker should be +launched with proper `--dyn-reasoning-parser` and `--dyn-tool-call-parser`.
+ +**Start frontend** +```bash +python3 -m dynamo.frontend --http-port 8000 & +``` + +**Run decode worker** +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.sglang \ + --model-path openai/gpt-oss-120b \ + --served-model-name openai/gpt-oss-120b \ + --tp 4 \ + --trust-remote-code \ + --skip-tokenizer-init \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --dyn-reasoning-parser gpt_oss \ + --dyn-tool-call-parser harmony +``` + +**Run prefill workers** +```bash +CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.sglang \ + --model-path openai/gpt-oss-120b \ + --served-model-name openai/gpt-oss-120b \ + --tp 4 \ + --trust-remote-code \ + --skip-tokenizer-init \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --dyn-reasoning-parser gpt_oss \ + --dyn-tool-call-parser harmony +``` diff --git a/fern/pages/backends/sglang/profiling.md b/fern/pages/backends/sglang/profiling.md new file mode 100644 index 00000000000..515551136dd --- /dev/null +++ b/fern/pages/backends/sglang/profiling.md @@ -0,0 +1,43 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Profiling SGLang Workers in Dynamo" +--- + +Dynamo exposes profiling endpoints for SGLang workers via the system server's `/engine/*` routes. This allows you to start and stop PyTorch profiling on running inference workers without restarting them. + +These endpoints wrap SGLang's internal `TokenizerManager.start_profile()` and `stop_profile()` methods. See SGLang's documentation for the full list of supported parameters. + +## Quick Start + +1. **Start profiling:** + +```bash +curl -X POST http://localhost:9090/engine/start_profile \ + -H "Content-Type: application/json" \ + -d '{"output_dir": "/tmp/profiler_output"}' +``` + +2. **Run some inference requests to generate profiling data** + +3. 
**Stop profiling:** + +```bash +curl -X POST http://localhost:9090/engine/stop_profile +``` + +4. **View the traces:** + +The profiler outputs Chrome trace files in the specified `output_dir`. You can view them using: +- Chrome's `chrome://tracing` +- [Perfetto UI](https://ui.perfetto.dev/) +- TensorBoard with the PyTorch Profiler plugin + +## Test Script + +A test script is provided at [`examples/backends/sglang/test_sglang_profile.py`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/test_sglang_profile.py) that demonstrates the full profiling workflow: + +```bash +python examples/backends/sglang/test_sglang_profile.py +``` + diff --git a/fern/pages/backends/sglang/prometheus.md b/fern/pages/backends/sglang/prometheus.md new file mode 100644 index 00000000000..c29858ea2e7 --- /dev/null +++ b/fern/pages/backends/sglang/prometheus.md @@ -0,0 +1,122 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "SGLang Prometheus Metrics" +--- + +## Overview + +When running SGLang through Dynamo, SGLang engine metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both SGLang engine metrics (prefixed with `sglang:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint. + +**For the complete and authoritative list of all SGLang metrics**, always refer to the [official SGLang Production Metrics documentation](https://docs.sglang.ai/references/production_metrics.html). + +**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md). + +**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md). 
+ +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` | + +## Getting Started Quickly + +This is a single machine example. + +### Start Observability Stack + +For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](../../observability/README.md#getting-started-quickly) for instructions. + +### Launch Dynamo Components + +Launch a frontend and SGLang backend to test metrics: + +```bash +# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +$ python -m dynamo.frontend + +# Enable system metrics server on port 8081 +$ DYN_SYSTEM_PORT=8081 python -m dynamo.sglang --model --enable-metrics +``` + +Wait for the SGLang worker to start, then send requests and check metrics: + +```bash +# Send a request +curl -H 'Content-Type: application/json' \ +-d '{ + "model": "", + "max_completion_tokens": 100, + "messages": [{"role": "user", "content": "Hello"}] +}' \ +http://localhost:8000/v1/chat/completions + +# Check metrics from the worker +curl -s localhost:8081/metrics | grep "^sglang:" +``` + +## Exposed Metrics + +SGLang exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All SGLang engine metrics use the `sglang:` prefix and include labels (e.g., `model_name`, `engine_type`, `tp_rank`, `pp_rank`) to identify the source. + +**Example Prometheus Exposition Format text:** + +``` +# HELP sglang:prompt_tokens_total Number of prefill tokens processed. +# TYPE sglang:prompt_tokens_total counter +sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8128902.0 + +# HELP sglang:generation_tokens_total Number of generation tokens processed. 
+# TYPE sglang:generation_tokens_total counter +sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7557572.0 + +# HELP sglang:cache_hit_rate The cache hit rate +# TYPE sglang:cache_hit_rate gauge +sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075 +``` + +**Note:** The specific metrics shown above are examples and may vary depending on your SGLang version. Always inspect your actual `/metrics` endpoint or refer to the [official documentation](https://docs.sglang.ai/references/production_metrics.html) for the current list. + +### Metric Categories + +SGLang provides metrics in the following categories (all prefixed with `sglang:`): + +- **Throughput metrics** - Token processing rates +- **Resource usage** - System resource consumption +- **Latency metrics** - Request and token latency measurements +- **Disaggregation metrics** - Metrics specific to disaggregated deployments (when enabled) + +**Note:** Specific metrics are subject to change between SGLang versions. Always refer to the [official documentation](https://docs.sglang.ai/references/production_metrics.html) or inspect the `/metrics` endpoint for your SGLang version. + +## Available Metrics + +The official SGLang documentation includes complete metric definitions with: +- HELP and TYPE descriptions +- Counter, Gauge, and Histogram metric types +- Metric labels (e.g., `model_name`, `engine_type`, `tp_rank`, `pp_rank`) +- Setup guide for Prometheus + Grafana monitoring +- Troubleshooting tips and configuration examples + +For the complete and authoritative list of all SGLang metrics, see the [official SGLang Production Metrics documentation](https://docs.sglang.ai/references/production_metrics.html). 
+ +## Implementation Details + +- SGLang uses multiprocess metrics collection via `prometheus_client.multiprocess.MultiProcessCollector` +- Metrics are filtered by the `sglang:` prefix before being exposed +- The integration uses Dynamo's `register_engine_metrics_callback()` function +- Metrics appear after SGLang engine initialization completes + +## Related Documentation + +### SGLang Metrics +- [Official SGLang Production Metrics](https://docs.sglang.ai/references/production_metrics.html) +- [SGLang GitHub - Metrics Collector](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/metrics/collector.py) + +### Dynamo Metrics +- [Dynamo Metrics Guide](../../observability/metrics.md) - Complete documentation on Dynamo runtime metrics +- [Prometheus and Grafana Setup](../../observability/prometheus-grafana.md) - Visualization setup instructions +- Dynamo runtime metrics (prefixed with `dynamo_*`) are available at the same `/metrics` endpoint alongside SGLang metrics + - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics) + - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants) + - Integration code: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration diff --git a/fern/pages/backends/sglang/sgl-hicache-example.md b/fern/pages/backends/sglang/sgl-hicache-example.md new file mode 100644 index 00000000000..d4ef20a2fe3 --- /dev/null +++ b/fern/pages/backends/sglang/sgl-hicache-example.md @@ -0,0 +1,64 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Enable SGLang Hierarchical Cache (HiCache)" +--- + +This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dynamo. 
+ +## 1) Start the SGLang worker with HiCache enabled + +```bash +python -m dynamo.sglang \ + --model-path Qwen/Qwen3-0.6B \ + --host 0.0.0.0 --port 8000 \ + --page-size 64 \ + --enable-hierarchical-cache \ + --hicache-ratio 2 \ + --hicache-write-policy write_through \ + --hicache-storage-backend nixl \ + --log-level debug \ + --skip-tokenizer-init +``` + +- **--enable-hierarchical-cache**: Enables hierarchical KV cache/offload +- **--hicache-ratio**: The ratio of the size of host KV cache memory pool to the size of device pool. Lower this number if your machine has less CPU memory. +- **--hicache-write-policy**: Write policy (e.g., `write_through` for synchronous host writes) +- **--hicache-storage-backend**: Host storage backend for HiCache (e.g., `nixl`). NIXL selects the concrete store automatically; see [PR #8488](https://github.com/sgl-project/sglang/pull/8488) + + +Then, start the frontend: +```bash +python -m dynamo.frontend --http-port 8000 +``` + +## 2) Send a single request + +```bash +curl localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" + } + ], + "stream": false, + "max_tokens": 30 + }' +``` + +## 3) (Optional) Benchmarking + +Run the perf script: +```bash +bash -x $DYNAMO_ROOT/benchmarks/llm/perf.sh \ + --model Qwen/Qwen3-0.6B \ + --tensor-parallelism 1 \ + --data-parallelism 1 \ + --concurrency "2,4,8" \ + --input-sequence-length 2048 \ + --output-sequence-length 256 +``` diff --git a/fern/pages/backends/sglang/sglang-disaggregation.md b/fern/pages/backends/sglang/sglang-disaggregation.md new file mode 100644 index 00000000000..e0be2afd126 --- /dev/null +++ b/fern/pages/backends/sglang/sglang-disaggregation.md @@ -0,0 +1,88 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "SGLang Disaggregated Serving" +--- + +This document explains how SGLang's disaggregated prefill-decode architecture works, both standalone and within Dynamo. + +## Overview + +Disaggregated serving separates the prefill and decode phases of LLM inference into different workers. This architecture allows for: +- Independent scaling of prefill and decode resources +- Better resource utilization (prefill is compute-bound, decode is memory-bound) +- Efficient KV cache transfer between workers using RDMA + +## How Dynamo Integrates with SGLang Disaggregation + +**SGLang's standalone approach:** +1. The load balancer receives a request from the client +2. A random `(prefill, decode)` pair is selected from the pool of available workers +3. Request is sent to both `prefill` and `decode` workers via asyncio tasks +4. Internally disaggregation is done from prefill → decode + +**Dynamo's approach:** + +Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead: +1. Route to a decode worker first +2. Choose a prefill worker via round-robin or KV-aware selection +3. Send the request to both workers +4. SGLang's bootstrap server (part of the `tokenizer_manager`) is used in conjunction with NIXL/Mooncake to handle the KV transfer + +## Disaggregation Flow + +The following diagram shows the complete request flow for disaggregated serving: + +```mermaid +sequenceDiagram + participant Client + participant Decode + participant Prefill + + Note over Decode,Prefill: 0. Setup Phase (One-Time) + Decode->>Prefill: Register RDMA connection info (base GPU memory pointers) + Note over Client,Prefill: Per-Request Phase + Client->>Decode: 1. Send request + Decode->>Prefill: 2. Forward request + get bootstrap_room + Prefill-->>Decode: Return bootstrap_room ID + Note over Decode: 3. Allocate GPU memory for KV cache + Decode->>Prefill: Send allocation info (page indices, metadata buffer) + Note over Prefill: 4. 
Prefill forward pass + par Decode polls + loop Poll transfer + Note over Decode: 5. Poll for KV arrival + end + and Prefill transfers + Note over Prefill: 6. RDMA write KV to decode + Prefill->>Decode: Transfer KV cache + metadata + end + Note over Prefill: 7. Poll RDMA handles + Note over Prefill: Transfer complete, deallocate metadata + Note over Decode: 8. KV received, start decode + loop Generate tokens + Note over Decode: Decode forward pass + Decode-->>Client: Stream output token + end +``` + +### Key Steps Explained + +**Setup Phase (One-Time)** +- Decode workers register their RDMA connection information with prefill workers +- This includes base GPU memory pointers for direct memory access + +**Per-Request Flow** +1. **Request initiation**: Client sends request to decode worker +2. **Bootstrap room allocation**: Decode forwards to prefill and receives a bootstrap_room ID for coordination +3. **Memory allocation**: Decode allocates GPU memory pages for incoming KV cache +4. **Prefill execution**: Prefill worker processes the prompt and generates KV cache +5. **KV transfer**: Prefill uses RDMA to write KV cache directly to decode's GPU memory (while decode polls for completion) +6. **Cleanup**: Prefill deallocates transfer metadata after confirming completion +7. **Decode phase**: Decode worker generates tokens using the transferred KV cache +8. 
**Streaming**: Tokens are streamed back to the client as they're generated + +### Performance Characteristics + +- **RDMA transfer**: Zero-copy GPU-to-GPU transfer with minimal CPU involvement +- **Parallel operations**: Decode can poll while prefill transfers data +- **One-time setup**: RDMA connections established once, reused for all requests \ No newline at end of file diff --git a/fern/pages/backends/trtllm/README.md b/fern/pages/backends/trtllm/README.md new file mode 100644 index 00000000000..06b3fdb910e --- /dev/null +++ b/fern/pages/backends/trtllm/README.md @@ -0,0 +1,284 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "LLM Deployment using TensorRT-LLM" +--- + +This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using TensorRT-LLM. + +## Use the Latest Release + +We recommend using the latest stable release of dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +--- + +## Table of Contents +- [Feature Support Matrix](#feature-support-matrix) +- [Quick Start](#tensorrt-llm-quick-start) +- [Single Node Examples](#single-node-examples) +- [Advanced Examples](#advanced-examples) +- [KV Cache Transfer](#kv-cache-transfer-in-disaggregated-serving) +- [Client](#client) +- [Benchmarking](#benchmarking) +- [Multimodal Support](#multimodal-support) +- [Logits Processing](#logits-processing) +- [Performance Sweep](#performance-sweep) + +## Feature Support Matrix + +### Core Dynamo Features + +| Feature | TensorRT-LLM | Notes | 
+|---------|--------------|-------| +| [**Disaggregated Serving**](../../design-docs/disagg-serving.md) | ✅ | | +| [**Conditional Disaggregation**](../../design-docs/disagg-serving.md#conditional-disaggregation) | 🚧 | Not supported yet | +| [**KV-Aware Routing**](../../router/kv-cache-routing.md) | ✅ | | +| [**SLA-Based Planner**](../../planner/sla-planner.md) | ✅ | | +| [**Load Based Planner**](../../planner/load-planner.md) | 🚧 | Planned | +| [**KVBM**](../../kvbm/kvbm-architecture.md) | ✅ | | + +### Large Scale P/D and WideEP Features + +| Feature | TensorRT-LLM | Notes | +|--------------------|--------------|-----------------------------------------------------------------| +| **WideEP** | ✅ | | +| **DP Rank Routing**| ✅ | | +| **GB200 Support** | ✅ | | + +## TensorRT-LLM Quick Start + +Below we provide a guide that lets you run all of the common deployment patterns on a single node. + +### Start NATS and ETCD in the background + +Start using [Docker Compose](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml) + +```bash +docker compose -f deploy/docker-compose.yml up -d +``` + +### Build container + +```bash +# TensorRT-LLM uses git-lfs, which needs to be installed in advance. +apt-get update && apt-get -y install git git-lfs + +# On an x86 machine: +./container/build.sh --framework trtllm + +# On an ARM machine: +./container/build.sh --framework trtllm --platform linux/arm64 + +# Build the container with the default experimental TensorRT-LLM commit +# WARNING: This is for experimental feature testing only. +# The container should not be used in a production environment. +./container/build.sh --framework trtllm --tensorrtllm-git-url https://github.com/NVIDIA/TensorRT-LLM.git --tensorrtllm-commit main +``` + +### Run container + +```bash +./container/run.sh --framework trtllm -it +``` + +## Single Node Examples + + +Below we provide some simple shell scripts that run the components for each configuration.
Each shell script is simply running the `python3 -m dynamo.frontend ` to start up the ingress and using `python3 -m dynamo.trtllm ` to start up the workers. You can easily take each command and run them in separate terminals. + + +For detailed information about the architecture and how KV-aware routing works, see the [KV Cache Routing documentation](../../router/kv-cache-routing.md). + +### Aggregated +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +./launch/agg.sh +``` + +### Aggregated with KV Routing +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +./launch/agg_router.sh +``` + +### Disaggregated + +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +./launch/disagg.sh +``` + +### Disaggregated with KV Routing + + +In disaggregated workflow, requests are routed to the prefill worker to maximize KV cache reuse. + + +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +./launch/disagg_router.sh +``` + +### Aggregated with Multi-Token Prediction (MTP) and DeepSeek R1 +```bash +cd $DYNAMO_HOME/examples/backends/trtllm + +export AGG_ENGINE_ARGS=./engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml +export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" +# nvidia/DeepSeek-R1-FP4 is a large model +export MODEL_PATH="nvidia/DeepSeek-R1-FP4" +./launch/agg.sh +``` + +Notes: +- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark. +- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates. + +## Advanced Examples + +Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example! 
+ +### Multinode Deployment + +For comprehensive instructions on multinode serving, see the [multinode-examples.md](multinode/multinode-examples.md) guide. It provides step-by-step deployment examples and configuration tips for running Dynamo with TensorRT-LLM across multiple nodes. While the walkthrough uses DeepSeek-R1 as the model, you can easily adapt the process for any supported model by updating the relevant configuration files. You can see [Llama4+eagle](llama4-plus-eagle.md) guide to learn how to use these scripts when a single worker fits on the single node. + +### Speculative Decoding +- **[Llama 4 Maverick Instruct + Eagle Speculative Decoding](llama4-plus-eagle.md)** + +### Kubernetes Deployment + +For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [TensorRT-LLM Kubernetes Deployment Guide](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/deploy/README.md). + +### Client + +See [client](../sglang/README.md#testing-the-deployment) section to learn how to send request to the deployment. + +NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `. + +### Benchmarking + +To benchmark your deployment with AIPerf, see this utility script, configuring the +`model` name and `host` based on your deployment: [perf.sh](https://github.com/ai-dynamo/dynamo/tree/main/benchmarks/llm/perf.sh) + +## KV Cache Transfer in Disaggregated Serving + +Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](kv-cache-transfer.md). + + +## Request Migration + +You can enable [request migration](../../fault-tolerance/request-migration.md) to handle worker failures gracefully. 
Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker: + +```bash +# For decode and aggregated workers +python3 -m dynamo.trtllm ... --migration-limit=3 +``` + + +**Prefill workers do not support request migration** and must use `--migration-limit=0` (the default). Prefill workers only process prompts and return KV cache state - they don't maintain long-running generation requests that would benefit from migration. + + +See the [Request Migration Architecture](../../fault-tolerance/request-migration.md) documentation for details on how this works. + +## Request Cancellation + +When a user cancels a request (e.g., by disconnecting from the frontend), the request is automatically cancelled across all workers, freeing compute resources for other requests. + +### Cancellation Support Matrix + +| | Prefill | Decode | +|-|---------|--------| +| **Aggregated** | ✅ | ✅ | +| **Disaggregated** | ✅ | ✅ | + +For more details, see the [Request Cancellation Architecture](../../fault-tolerance/request-cancellation.md) documentation. + +## Client + +See [client](../sglang/README.md#testing-the-deployment) section to learn how to send request to the deployment. + +NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `. + +## Benchmarking + +To benchmark your deployment with AIPerf, see this utility script, configuring the +`model` name and `host` based on your deployment: [perf.sh](https://github.com/ai-dynamo/dynamo/tree/main/benchmarks/llm/perf.sh) + +## Multimodal support + +Dynamo with the TensorRT-LLM backend supports multimodal models, enabling you to process both text and images (or pre-computed embeddings) in a single request. For detailed setup instructions, example requests, and best practices, see the [TensorRT-LLM Multimodal Guide](../../multimodal/trtllm.md). 
+ +## Logits Processing + +Logits processors let you modify the next-token logits at every decoding step (e.g., to apply custom constraints or sampling transforms). Dynamo provides a backend-agnostic interface and an adapter for TensorRT-LLM so you can plug in custom processors. + +### How it works +- **Interface**: Implement `dynamo.logits_processing.BaseLogitsProcessor` which defines `__call__(input_ids, logits)` and modifies `logits` in-place. +- **TRT-LLM adapter**: Use `dynamo.trtllm.logits_processing.adapter.create_trtllm_adapters(...)` to convert Dynamo processors into TRT-LLM-compatible processors and assign them to `SamplingParams.logits_processor`. +- **Examples**: See example processors in `lib/bindings/python/src/dynamo/logits_processing/examples/` ([temperature](https://github.com/ai-dynamo/dynamo/tree/main/lib/bindings/python/src/dynamo/logits_processing/examples/temperature.py), [hello_world](https://github.com/ai-dynamo/dynamo/tree/main/lib/bindings/python/src/dynamo/logits_processing/examples/hello_world.py)). + +### Quick test: HelloWorld processor +You can enable a test-only processor that forces the model to respond with "Hello world!". This is useful to verify the wiring without modifying your model or engine code. + +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +export DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR=1 +./launch/agg.sh +``` + +Notes: +- When enabled, Dynamo initializes the tokenizer so the HelloWorld processor can map text to token IDs. +- Expected chat response contains "Hello world". + +### Bring your own processor +Implement a processor by conforming to `BaseLogitsProcessor` and modify logits in-place. 
For example, temperature scaling: + +```python +from typing import Sequence +import torch +from dynamo.logits_processing import BaseLogitsProcessor + +class TemperatureProcessor(BaseLogitsProcessor): + def __init__(self, temperature: float = 1.0): + if temperature <= 0: + raise ValueError("Temperature must be positive") + self.temperature = temperature + + def __call__(self, input_ids: Sequence[int], logits: torch.Tensor): + if self.temperature == 1.0: + return + logits.div_(self.temperature) +``` + +Wire it into TRT-LLM by adapting and attaching to `SamplingParams`: + +```python +from dynamo.trtllm.logits_processing.adapter import create_trtllm_adapters +from dynamo.logits_processing.examples import TemperatureProcessor + +processors = [TemperatureProcessor(temperature=0.7)] +sampling_params.logits_processor = create_trtllm_adapters(processors) +``` + +### Current limitations +- Per-request processing only (batch size must be 1); beam width > 1 is not supported. +- Processors must modify logits in-place and not return a new tensor. +- If your processor needs tokenization, ensure the tokenizer is initialized (do not skip tokenizer init). + +## Performance Sweep + +For detailed instructions on running comprehensive performance sweeps across both aggregated and disaggregated serving configurations, see the [TensorRT-LLM Benchmark Scripts for DeepSeek R1 model](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/performance_sweeps/README.md). This guide covers recommended benchmarking setups, usage of provided scripts, and best practices for evaluating system performance. + +## Dynamo KV Block Manager Integration + +Dynamo with TensorRT-LLM currently supports integration with the Dynamo KV Block Manager. This integration can significantly reduce time-to-first-token (TTFT) latency, particularly in usage patterns such as multi-turn conversations and repeated long-context requests. 
+ +For setup instructions, see [Running KVBM in TensorRT-LLM](../../kvbm/trtllm-setup.md). diff --git a/fern/pages/backends/trtllm/gemma3-sliding-window-attention.md b/fern/pages/backends/trtllm/gemma3-sliding-window-attention.md new file mode 100644 index 00000000000..5adde1b9b46 --- /dev/null +++ b/fern/pages/backends/trtllm/gemma3-sliding-window-attention.md @@ -0,0 +1,52 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Gemma 3 with Variable Sliding Window Attention" +--- + +This guide demonstrates how to deploy google/gemma-3-1b-it with Variable Sliding Window Attention (VSWA) using Dynamo. Since google/gemma-3-1b-it is a small model, each aggregated, decode, or prefill worker only requires one H100 GPU or one GB200 GPU. +VSWA is a mechanism in which a model’s layers alternate between multiple sliding window sizes. An example of this is Gemma 3, which incorporates both global attention layers and sliding window layers. + + +- Ensure that required services such as `nats` and `etcd` are running before starting. +- Request access to `google/gemma-3-1b-it` on Hugging Face and set your `HF_TOKEN` environment variable for authentication. +- It's recommended to continue using the VSWA feature with the Dynamo 0.5.0 release and the TensorRT-LLM dynamo runtime image nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.0. The 0.5.1 release bundles TensorRT-LLM v1.1.0rc5, which has a regression that breaks VSWA.
+ + +## Aggregated Serving +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +export MODEL_PATH=google/gemma-3-1b-it +export SERVED_MODEL_NAME=$MODEL_PATH +export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml +./launch/agg.sh +``` + +## Aggregated Serving with KV Routing +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +export MODEL_PATH=google/gemma-3-1b-it +export SERVED_MODEL_NAME=$MODEL_PATH +export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml +./launch/agg_router.sh +``` + +## Disaggregated Serving +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +export MODEL_PATH=google/gemma-3-1b-it +export SERVED_MODEL_NAME=$MODEL_PATH +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml +./launch/disagg.sh +``` + +## Disaggregated Serving with KV Routing +```bash +cd $DYNAMO_HOME/examples/backends/trtllm +export MODEL_PATH=google/gemma-3-1b-it +export SERVED_MODEL_NAME=$MODEL_PATH +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml +./launch/disagg_router.sh +``` diff --git a/fern/pages/backends/trtllm/gpt-oss.md b/fern/pages/backends/trtllm/gpt-oss.md new file mode 100644 index 00000000000..4b5d4c7ebb2 --- /dev/null +++ b/fern/pages/backends/trtllm/gpt-oss.md @@ -0,0 +1,515 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Running gpt-oss-120b Disaggregated with TensorRT-LLM" +--- + +Dynamo supports disaggregated serving of gpt-oss-120b with TensorRT-LLM. 
This guide demonstrates how to deploy gpt-oss-120b using disaggregated prefill/decode serving on a single B200 node with 8 GPUs, running 1 prefill worker on 4 GPUs and 1 decode worker on 4 GPUs. + +## Overview + +This deployment uses disaggregated serving in TensorRT-LLM where: +- **Prefill Worker**: Processes input prompts efficiently using 4 GPUs with tensor parallelism +- **Decode Worker**: Generates output tokens using 4 GPUs, optimized for token generation throughput +- **Frontend**: Provides OpenAI-compatible API endpoint with round-robin routing + +The disaggregated approach optimizes for both low-latency (maximizing tokens per second per user) and high-throughput (maximizing total tokens per GPU per second) use cases by separating the compute-intensive prefill phase from the memory-bound decode phase. + +## Prerequisites + +- 1x NVIDIA B200 node with 8 GPUs (this guide focuses on single-node B200 deployment) +- CUDA Toolkit 12.8 or later +- Docker with [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed +- Fast SSD storage for model weights (~240GB required) +- HuggingFace account and [access token](https://huggingface.co/settings/tokens) +- [HuggingFace CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli) + + +Ensure that the `etcd` and `nats` services are running with the following command: + +```bash +docker compose -f deploy/docker-compose.yml up +``` + +## Instructions + +### 1. Download the Model + +```bash +export MODEL_PATH= +export HF_TOKEN= + +pip install -U "huggingface_hub[cli]" + +huggingface-cli download openai/gpt-oss-120b --exclude "original/*" --exclude "metal/*" --local-dir $MODEL_PATH +``` + +### 2. 
Run the Container + +Set the container image: +```bash +export DYNAMO_CONTAINER_IMAGE=nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag +``` + +Launch the Dynamo TensorRT-LLM container with the necessary configurations: + +```bash +docker run \ + --gpus all \ + -it \ + --rm \ + --network host \ + --volume $MODEL_PATH:/model \ + --volume $PWD:/workspace \ + --shm-size=10G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + --cap-add CAP_SYS_PTRACE \ + --ipc host \ + -e HF_TOKEN=$HF_TOKEN \ + -e TRTLLM_ENABLE_PDL=1 \ + -e TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True \ + $DYNAMO_CONTAINER_IMAGE +``` + +This command: +- Automatically removes the container when stopped (`--rm`) +- Allows container to interact with host's IPC resources for optimal performance (`--ipc=host`) +- Runs the container in interactive mode (`-it`) +- Sets up shared memory and stack limits for optimal performance +- Mounts your model directory into the container at `/model` +- Mounts the current Dynamo workspace into the container at `/workspace/dynamo` +- Enables [PDL](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization) and disables parallel weight loading +- Sets HuggingFace token as environment variable in the container + +### 3. 
Understanding the Configuration + +The deployment uses configuration files and command-line arguments to control behavior: + +#### Configuration Files + +**Prefill Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml`)**: +- `enable_attention_dp: false` - Attention data parallelism disabled for prefill +- `enable_chunked_prefill: true` - Enables efficient chunked prefill processing +- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers +- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer +- `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs + +**Decode Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml`)**: +- `enable_attention_dp: true` - Attention data parallelism enabled for decode +- `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency +- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers +- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer +- `cuda_graph_config.max_batch_size: 128` - Maximum batch size for CUDA graphs + +#### Command-Line Arguments + +Both workers receive these key arguments: +- `--tensor-parallel-size 4` - Uses 4 GPUs for tensor parallelism +- `--expert-parallel-size 4` - Expert parallelism across 4 GPUs +- `--free-gpu-memory-fraction 0.9` - Allocates 90% of GPU memory + +Prefill-specific arguments: +- `--max-num-tokens 20000` - Maximum tokens for prefill processing +- `--max-batch-size 32` - Maximum batch size for prefill + +Decode-specific arguments: +- `--max-num-tokens 16384` - Maximum tokens for decode processing +- `--max-batch-size 128` - Maximum batch size for decode + +### 4. Launch the Deployment + +Note that GPT-OSS is a reasoning model with tool calling support. 
To ensure the response is being processed correctly, the worker should be launched with proper ```--dyn-reasoning-parser``` and ```--dyn-tool-call-parser```. + +You can use the provided launch script or run the components manually: + +#### Option A: Using the Launch Script + +```bash +cd /workspace/examples/backends/trtllm +./launch/gpt_oss_disagg.sh +``` + +#### Option B: Manual Launch + +1. **Start frontend**: +```bash +# Start frontend with round-robin routing +python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & +``` + +2. **Launch prefill worker**: +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ + --model-path /model \ + --served-model-name openai/gpt-oss-120b \ + --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml \ + --dyn-reasoning-parser gpt_oss \ + --dyn-tool-call-parser harmony \ + --disaggregation-mode prefill \ + --max-num-tokens 20000 \ + --max-batch-size 32 \ + --free-gpu-memory-fraction 0.9 \ + --tensor-parallel-size 4 \ + --expert-parallel-size 4 & +``` + +3. **Launch decode worker**: +```bash +CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \ + --model-path /model \ + --served-model-name openai/gpt-oss-120b \ + --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml \ + --dyn-reasoning-parser gpt_oss \ + --dyn-tool-call-parser harmony \ + --disaggregation-mode decode \ + --max-num-tokens 16384 \ + --free-gpu-memory-fraction 0.9 \ + --tensor-parallel-size 4 \ + --expert-parallel-size 4 +``` + +### 6. 
Verify the Deployment is Ready + +Poll the `/health` endpoint to verify that both the prefill and decode worker endpoints have started: +``` +curl http://localhost:8000/health +``` + +Make sure that both of the endpoints are available before sending an inference request: +``` +{ + "endpoints": [ + "dyn://dynamo.tensorrt_llm.generate", + "dyn://dynamo.prefill.generate" + ], + "status": "healthy" +} +``` + +If only one worker endpoint is listed, the other may still be starting up. Monitor the worker logs to track startup progress. + +### 7. Test the Deployment + +Send a test request to verify the deployment: + +```bash +curl -X POST http://localhost:8000/v1/responses \ + -H "Content-Type: application/json" \ + -d '{ + "model": "openai/gpt-oss-120b", + "input": "Explain the concept of disaggregated serving in LLM inference in 3 sentences.", + "max_output_tokens": 200, + "stream": false + }' +``` + +The server exposes a standard OpenAI-compatible API endpoint that accepts JSON requests. You can adjust parameters like `max_tokens`, `temperature`, and others according to your needs. + +### 8. Reasoning and Tool Calling + +Dynamo supports reasoning and tool calling in the OpenAI Chat Completions endpoint. A typical workflow for an application built on top of Dynamo +is that the application has a set of tools to help the assistant provide accurate answers, and it is usually +multi-turn as it involves tool selection and generation based on the tool result. + +In addition, the reasoning effort can be configured through ```chat_template_args```. Increasing the reasoning effort makes the model more accurate but also slower. It supports three levels: ```low```, ```medium```, and ```high```. 
+ +Below is an example of sending multi-round requests to complete a user query with reasoning and tool calling: +**Application setup (pseudocode)** +```Python +# The tool defined by the application +def get_system_health(): + for component in system.components: + if not component.health(): + return False + return True + +# The JSON representation of the declaration in ChatCompletion tool style +tool_choice = '{ + "type": "function", + "function": { + "name": "get_system_health", + "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.", + "parameters": { + "type": "object", + "properties": {} + } + } +}' + +# On user query, perform below workflow. +def user_query(app_request): + # first round + # create chat completion with prompt and tool choice + request = ... + response = send(request) + + if response["finish_reason"] == "tool_calls": + # second round + function, params = parse_tool_call(response) + function_result = function(params) + # create request with prompt, assistant response, and function result + request = ... + response = send(request) + return app_response(response) +``` + + +**First request with tools** + + +```bash +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d ' +{ + "model": "openai/gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": "Hey, quick check: is everything up and running?" 
+ } + ], + "chat_template_args": { + "reasoning_effort": "low" + }, + "tools": [ + { + "type": "function", + "function": { + "name": "get_system_health", + "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.", + "parameters": { + "type": "object", + "properties": {} + } + } + } + ], + "response_format": { + "type": "text" + }, + "stream": false, + "max_tokens": 300 +}' +``` +**First response with tool choice** +```JSON +{ + "id": "chatcmpl-d1c12219-6298-4c83-a6e3-4e7cef16e1a9", + "choices": [ + { + "index": 0, + "message": { + "tool_calls": [ + { + "id": "call-1", + "type": "function", + "function": { + "name": "get_system_health", + "arguments": "{}" + } + } + ], + "role": "assistant", + "reasoning_content": "We need to check system health. Use function." + }, + "finish_reason": "tool_calls" + } + ], + "created": 1758758741, + "model": "openai/gpt-oss-120b", + "object": "chat.completion", + "usage": null +} +``` +**Second request with tool calling result** +```bash +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d ' +{ + "model": "openai/gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": "Hey, quick check: is everything up and running?" 
+ }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call-1", + "type": "function", + "function": { + "name": "get_system_health", + "arguments": "{}" + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "call-1", + "content": "{\"status\":\"ok\",\"uptime_seconds\":372045}" + } + ], + "chat_template_args": { + "reasoning_effort": "low" + }, + "tools": [ + { + "type": "function", + "function": { + "name": "get_system_health", + "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.", + "parameters": { + "type": "object", + "properties": {} + } + } + } + ], + "response_format": { + "type": "text" + }, + "stream": false, + "max_tokens": 300 +}' +``` +**Second response with final message** +```JSON +{ + "id": "chatcmpl-9ebfe64a-68b9-4c1d-9742-644cf770ad0e", + "choices": [ + { + "index": 0, + "message": { + "content": "All systems are green—everything’s up and running smoothly! 🚀 Let me know if you need anything else.", + "role": "assistant", + "reasoning_content": "The user asks: \"Hey, quick check: is everything up and running?\" We have just checked system health, it's ok. Provide friendly response confirming everything's up." + }, + "finish_reason": "stop" + } + ], + "created": 1758758853, + "model": "openai/gpt-oss-120b", + "object": "chat.completion", + "usage": null +} +``` +## Benchmarking + +### Performance Testing with AIPerf + +The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. 
+ +**Run the following benchmark from inside the container** (after completing the deployment steps above): + +```bash +# Create a directory for benchmark results +mkdir -p /tmp/benchmark-results + +# Run the benchmark - this command tests the deployment with high-concurrency synthetic workload +aiperf profile \ + --model openai/gpt-oss-120b \ + --tokenizer /model \ + --endpoint-type chat \ + --endpoint /v1/chat/completions \ + --streaming \ + --url localhost:8000 \ + --synthetic-input-tokens-mean 32000 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 256 \ + --output-tokens-stddev 0 \ + --extra-inputs max_tokens:256 \ + --extra-inputs min_tokens:256 \ + --extra-inputs ignore_eos:true \ + --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ + --concurrency 256 \ + --request-count 6144 \ + --warmup-request-count 1000 \ + --num-dataset-entries 8000 \ + --random-seed 100 \ + --artifact-dir /tmp/benchmark-results \ + -H 'Authorization: Bearer NOT USED' \ + -H 'Accept: text/event-stream' +``` + +### What This Benchmark Does + +This command: +- **Tests chat completions** with streaming responses against the disaggregated deployment +- **Simulates high load** with 256 concurrent requests and 6144 total requests +- **Uses long context inputs** (32K tokens) to test prefill performance +- **Generates consistent outputs** (256 tokens) to measure decode throughput +- **Includes warmup period** (1000 requests) to stabilize performance metrics +- **Saves detailed results** to `/tmp/benchmark-results` for analysis + +Key parameters you can adjust: +- `--concurrency`: Number of simultaneous requests (impacts GPU utilization) +- `--synthetic-input-tokens-mean`: Average input length (tests prefill capacity) +- `--output-tokens-mean`: Average output length (tests decode throughput) +- `--request-count`: Total number of requests for the benchmark + +### Installing AIPerf Outside the Container + +If you prefer to run benchmarks from outside the container: + +```bash +# 
Install AIPerf +pip install aiperf + +# Then run the same benchmark command, adjusting the tokenizer path if needed +``` + +## Architecture Overview + +The disaggregated architecture separates prefill and decode phases: + +```mermaid +flowchart TD + Client["Users/Clients
(HTTP)"] --> Frontend["Frontend
Round-Robin Router"] + Frontend --> Prefill["Prefill Worker
(GPUs 0-3)"] + Frontend --> Decode["Decode Worker
(GPUs 4-7)"] + + Prefill -.->|KV Cache Transfer
via UCX| Decode +``` + +## Key Features + +1. **Disaggregated Serving**: Separates compute-intensive prefill from memory-bound decode operations +2. **Optimized Resource Usage**: Different parallelism strategies for prefill vs decode +3. **Scalable Architecture**: Easy to adjust worker counts based on workload +4. **TensorRT-LLM Optimizations**: Leverages TensorRT-LLM's efficient kernels and memory management + +## Troubleshooting + +### Common Issues + +1. **CUDA Out-of-Memory Errors** + - Reduce `--max-num-tokens` in the launch commands (currently 20000 for prefill, 16384 for decode) + - Lower `--free-gpu-memory-fraction` from 0.9 to 0.8 or 0.7 + - Ensure model checkpoints are compatible with the expected format + +2. **Workers Not Connecting** + - Ensure etcd and NATS services are running: `docker ps | grep -E "(etcd|nats)"` + - Check network connectivity between containers + - Verify CUDA_VISIBLE_DEVICES settings match your GPU configuration + - Check that no other processes are using the assigned GPUs + +3. **Performance Issues** + - Monitor GPU utilization with `nvidia-smi` while the deployment is running + - Check worker logs for bottlenecks or errors + - Ensure that batch sizes in manual commands match those in configuration files + - Adjust chunked prefill settings based on your workload + - For connection issues, ensure port 8000 is not being used by another application + +4. 
**Container Startup Issues** + - Verify that the NVIDIA Container Toolkit is properly installed + - Check Docker daemon is running with GPU support + - Ensure sufficient disk space for model weights and container images + +## Next Steps + +- **Production Deployment**: For multi-node deployments, see the [Multi-node Guide](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/README.md) +- **Advanced Configuration**: Explore TensorRT-LLM engine building options for further optimization +- **Monitoring**: Set up Prometheus and Grafana for production monitoring +- **Performance Benchmarking**: Use AIPerf to measure and optimize your deployment performance diff --git a/fern/pages/backends/trtllm/kv-cache-transfer.md b/fern/pages/backends/trtllm/kv-cache-transfer.md new file mode 100644 index 00000000000..40fddc6f478 --- /dev/null +++ b/fern/pages/backends/trtllm/kv-cache-transfer.md @@ -0,0 +1,23 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KV Cache Transfer in Disaggregated Serving" +--- + +In disaggregated serving architectures, KV cache must be transferred between prefill and decode workers. TensorRT-LLM supports two methods for this transfer: + +## Default Method: NIXL +By default, TensorRT-LLM uses **NIXL** (NVIDIA Inference Xfer Library) with UCX (Unified Communication X) as backend for KV cache transfer between prefill and decode workers. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments. + +### Specify Backends for NIXL + +TODO: Add instructions for how to specify different backends for NIXL. + +## Alternative Method: UCX + +TensorRT-LLM can also leverage **UCX** (Unified Communication X) directly for KV cache transfer between prefill and decode workers. 
There are two ways to enable UCX as the KV cache transfer backend: + +1. **Recommended:** Set `cache_transceiver_config.backend: UCX` in your engine configuration YAML file. +2. Alternatively, set the environment variable `TRTLLM_USE_UCX_KV_CACHE=1` and configure `cache_transceiver_config.backend: DEFAULT` in the engine configuration YAML. + +This flexibility allows users to choose the most suitable method for their deployment and compatibility requirements. diff --git a/fern/pages/backends/trtllm/llama4-plus-eagle.md b/fern/pages/backends/trtllm/llama4-plus-eagle.md new file mode 100644 index 00000000000..e2ae9ec7efa --- /dev/null +++ b/fern/pages/backends/trtllm/llama4-plus-eagle.md @@ -0,0 +1,72 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Llama 4 Maverick Instruct with Eagle Speculative Decoding on SLURM" +--- + +This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Speculative Decoding on GB200x4 nodes. We will be following the [multi-node deployment instructions](multinode/multinode-examples.md) to set up the environment for the following scenarios: + +- **Aggregated Serving:** + Deploy the entire Llama 4 model on a single GB200x4 node for end-to-end serving. + +- **Disaggregated Serving:** + Distribute the workload across two GB200x4 nodes: + - One node runs the decode worker. + - The other node runs the prefill worker. + +## Notes +* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `examples/backends/trtllm/engine_configs/llama4/eagle` folder. 
+ +## Setup + +Assuming you have already allocated your nodes via `salloc`, and are +inside an interactive shell on one of the allocated nodes, set the +following environment variables based: + +```bash +cd $DYNAMO_HOME/examples/backends/trtllm + +export IMAGE="" +# export MOUNTS="${PWD}/:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}/:/mnt" +export MODEL_PATH="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" +export SERVED_MODEL_NAME="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8" +``` + +See [this](multinode/multinode-examples.md#setup) section from multinode guide to learn more about the above options. + + +## Aggregated Serving +```bash +export NUM_NODES=1 +export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml" +./multinode/srun_aggregated.sh +``` + +## Disaggregated Serving + +```bash +export NUM_PREFILL_NODES=1 +export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yml" +export NUM_DECODE_NODES=1 +export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yml" +./multinode/srun_disaggregated.sh +``` + +## Example Request + +See [here](multinode/multinode-examples.md#example-request) to learn how to send a request to the deployment. + +``` +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [{"role": "user", "content": "Why is NVIDIA a great company?"}], + "max_tokens": 1024 + }' -w "\n" + + +# output: +{"id":"cmpl-3e87ea5c-010e-4dd2-bcc4-3298ebd845a8","choices":[{"text":"NVIDIA is considered a great company for several reasons:\n\n1. **Technological Innovation**: NVIDIA is a leader in the field of graphics processing units (GPUs) and has been at the forefront of technological innovation. +... 
+and the broader tech industry.\n\nThese factors combined have contributed to NVIDIA's status as a great company in the technology sector.","index":0,"logprobs":null,"finish_reason":"stop"}],"created":1753329671,"model":"nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8","system_fingerprint":null,"object":"text_completion","usage":{"prompt_tokens":16,"completion_tokens":562,"total_tokens":578,"prompt_tokens_details":null,"completion_tokens_details":null}} +``` diff --git a/fern/pages/backends/trtllm/multinode/multinode-examples.md b/fern/pages/backends/trtllm/multinode/multinode-examples.md new file mode 100644 index 00000000000..99e0919b379 --- /dev/null +++ b/fern/pages/backends/trtllm/multinode/multinode-examples.md @@ -0,0 +1,280 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Example: Multi-node TRTLLM Workers with Dynamo on Slurm" +--- + +> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/). + +To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16), +the set of nodes need to be launched together in the same MPI world, such as +via `mpirun` or `srun`. This is true regardless of whether the worker is +aggregated, prefill-only, or decode-only. + +In this document we will demonstrate two examples launching multinode workers +on a slurm cluster with `srun`: +1. Deploying an aggregated nvidia/DeepSeek-R1 model as a multi-node TP16/EP16 + worker across 4 GB200 nodes +2. Deploying a disaggregated nvidia/DeepSeek-R1 model with a multi-node + TP16/EP16 prefill worker (4 nodes) and a multi-node TP16/EP16 decode + worker (4 nodes) across a total of 8 GB200 nodes. 
+ +NOTE: Some of the scripts used in this example like `start_frontend_services.sh` and +`start_trtllm_worker.sh` should be translatable to other environments like Kubernetes, or +using `mpirun` directly, with relative ease. + +## Setup + +For simplicity of the example, we will make some assumptions about your slurm cluster: +1. First, we assume you have access to a slurm cluster with multiple GPU nodes + available. For functional testing, most setups should be fine. For performance + testing, you should aim to allocate groups of nodes that are performantly + inter-connected, such as those in an NVL72 setup. +2. Second, we assume this slurm cluster has the [Pyxis](https://github.com/NVIDIA/pyxis) + SPANK plugin setup. In particular, the `srun_aggregated.sh` script in this + example will use `srun` arguments like `--container-image`, + `--container-mounts`, and `--container-env` that are added to `srun` by Pyxis. + If your cluster supports similar container based plugins, you may be able to + modify the script to use that instead. +3. Third, we assume you have already built a recent Dynamo+TRTLLM container image as + described [here](https://github.com/ai-dynamo/dynamo/tree/main/docs/backends/trtllm/README.md#build-container). + This is the image that can be set to the `IMAGE` environment variable in later steps. +4. Fourth, we assume you pre-allocate a group of nodes using `salloc`. We + will allocate 8 nodes below as a reference command to have enough capacity + to run both examples. If you plan to only run the aggregated example, you + will only need 4 nodes. If you customize the configurations to require a + different number of nodes, you can adjust the number of allocated nodes + accordingly. Pre-allocating nodes is technically not a requirement, + but it makes iterations of testing/experimenting easier. 
+ + Make sure to set your `PARTITION` and `ACCOUNT` according to your slurm cluster setup: + ```bash + # Set partition manually based on your slurm cluster's partition names + PARTITION="" + # Set account manually if this command doesn't work on your cluster + ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" + salloc \ + --partition="${PARTITION}" \ + --account="${ACCOUNT}" \ + --job-name="${ACCOUNT}-dynamo.trtllm" \ + -t 05:00:00 \ + --nodes 8 + ``` +5. Lastly, we will assume you are inside an interactive shell on one of your allocated + nodes, which may be the default behavior after executing the `salloc` command above + depending on the cluster setup. If not, then you should SSH into one of the allocated nodes. + +### Environment Variable Setup + +This example aims to automate as much of the environment setup as possible, +but all slurm clusters and environments are different, and you may need to +dive into the scripts to make modifications based on your specific environment. + +Assuming you have already allocated your nodes via `salloc`, and are +inside an interactive shell on one of the allocated nodes, set the +following environment variables based on your setup: +```bash +# NOTE: IMAGE must be set manually for now +# To build an image, see the steps here: +# https://github.com/ai-dynamo/dynamo/tree/main/docs/backends/trtllm/README.md#build-container +export IMAGE="" + +# MOUNTS are the host:container path pairs that are mounted into the containers +# launched by each `srun` command. +# +# If you want to reference files, such as $MODEL_PATH below, in a +# different location, you can customize MOUNTS or specify additional +# comma-separated mount pairs here. +# +# NOTE: Currently, this example assumes that the local bash scripts and configs +# referenced are mounted into /mnt inside the container. 
If you want to +# customize the location of the scripts, make sure to modify `srun_aggregated.sh` +# accordingly for the new locations of `start_frontend_services.sh` and +# `start_trtllm_worker.sh`. +# +# For example, assuming your cluster had a `/lustre` directory on the host, you +# could add that as a mount like so: +# +# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}/../../../../:/mnt" + +# NOTE: In general, Deepseek R1 is very large, so it is recommended to +# pre-download the model weights and save them in some shared location, +# NFS storage, HF_HOME, etc. and modify the `--model-path` below +# to reuse the pre-downloaded weights instead. +# +# On Blackwell systems (ex: GB200), it is recommended to use the FP4 weights: +# https://huggingface.co/nvidia/DeepSeek-R1-FP4 +# +# On Hopper systems, FP4 isn't supported so you'll need to use the default weights: +# https://huggingface.co/deepseek-ai/DeepSeek-R1 +export MODEL_PATH="nvidia/DeepSeek-R1-FP4" + +# The name the model will be served/queried under, matching what's +# returned by the /v1/models endpoint. +# +# By default this is inferred from MODEL_PATH, but when using locally downloaded +# model weights, it can be nice to have explicit control over the name. +export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" +``` + +## Aggregated WideEP + +Assuming you have at least 4 nodes allocated following the setup steps above, +follow these steps below to launch an **aggregated** deployment across 4 nodes: + +```bash +# Default set in srun_aggregated.sh, but can customize here. +# export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml" + +# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG +# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of +# total GPUs necessary to satisfy the requested parallelism. For example, +# 4 nodes x 4 gpus/node = 16 gpus total for TP16/EP16. 
+# export NUM_NODES=4 + +# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this. +# export NUM_GPUS_PER_NODE=4 + +# Launches: +# - frontend + etcd/nats on current (head) node +# - one large aggregated trtllm worker across multiple nodes via MPI tasks +./srun_aggregated.sh +``` + +## Disaggregated WideEP + +Assuming you have at least 8 nodes allocated (4 for prefill, 4 for decode) +following the setup above, follow these steps below to launch a **disaggregated** +deployment across 8 nodes: + + +Make sure you have a fresh environment and don't still have the aggregated +example above still deployed on the same set of nodes. + + +```bash +# Defaults set in srun_disaggregated.sh, but can customize here. +# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml" + +# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG +# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG +# The products of NUM_PREFILL_NODES*NUM_GPUS_PER_NODE and +# NUM_DECODE_NODES*NUM_GPUS_PER_NODE should match the respective number of +# GPUs necessary to satisfy the requested parallelism in each config. +# export NUM_PREFILL_NODES=4 +# export NUM_DECODE_NODES=4 + +# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this. +# export NUM_GPUS_PER_NODE=4 + +# Launches: +# - frontend + etcd/nats on current (head) node. +# - one large prefill trtllm worker across multiple nodes via MPI tasks +# - one large decode trtllm worker across multiple nodes via MPI tasks +./srun_disaggregated.sh +``` + + +To launch multiple replicas of the configured prefill/decode workers, you can set +NUM_PREFILL_WORKERS and NUM_DECODE_WORKERS respectively (default: 1). + + +## Understanding the Output + +1. 
The `srun_aggregated.sh` launches two `srun` jobs. The first launches + etcd, NATS, and the OpenAI frontend on the head node only, + called "node1" in the example output below. The second launches + a single TP16 Dynamo+TRTLLM worker spread across 4 nodes, each node + using 4 GPUs. + ``` + # Frontend/etcd/nats services + srun: launching StepId=453374.17 on host node1, 1 tasks: 0 + ... + # TP16 TRTLLM worker split across 4 nodes with 4 gpus each + srun: launching StepId=453374.18 on host node1, 4 tasks: [0-3] + srun: launching StepId=453374.18 on host node2, 4 tasks: [4-7] + srun: launching StepId=453374.18 on host node3, 4 tasks: [8-11] + srun: launching StepId=453374.18 on host node4, 4 tasks: [12-15] + ``` +2. The OpenAI frontend will listen for and dynamically discover workers as + they register themselves with Dynamo's distributed runtime: + ``` + 0: 2025-06-13T02:36:48.160Z INFO dynamo_run::input::http: Watching for remote model at models + 0: 2025-06-13T02:36:48.161Z INFO dynamo_llm::http::service::service_v2: Starting HTTP service on: 0.0.0.0:8000 address="0.0.0.0:8000" + ``` +3. The TRTLLM worker will consist of N (N=16 for TP16) MPI ranks, 1 rank on each + GPU on each node, which will each output their progress while loading the model. + You can see each rank's output prefixed with the rank at the start of each log line + until the model successfully finishes loading: + ``` + 8: rank8 run mgmn worker node with mpi_world_size: 16 ... + 10: rank10 run mgmn worker node with mpi_world_size: 16 ... + 9: rank9 run mgmn worker node with mpi_world_size: 16 ... + 11: rank11 run mgmn worker node with mpi_world_size: 16 ... + ... + 15: Model init total -- 55.42s + 11: Model init total -- 55.91s + 12: Model init total -- 55.24s + ``` +4. 
After the model fully finishes loading on all ranks, the worker will register itself, + and the OpenAI frontend will detect it, signaled by this output: + ``` + 0: 2025-06-13T02:46:35.040Z INFO dynamo_llm::discovery::watcher: added model model_name="nvidia/DeepSeek-R1-FP4" + ``` +5. At this point, with the worker fully initialized and detected by the frontend, + it is now ready for inference. +6. For `srun_disaggregated.sh`, it follows a very similar flow, but instead launches + three srun jobs instead of two. One for frontend, one for prefill worker, + and one for decode worker. + +## Example Request + +To verify the deployed model is working, send a `curl` request: +```bash +# NOTE: $HOST assumes running on head node, but can be changed to $HEAD_NODE_IP instead. +HOST=localhost +PORT=8000 +# "model" here should match the model name returned by the /v1/models endpoint +curl -w "%{http_code}" ${HOST}:${PORT}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'${SERVED_MODEL_NAME}'", + "messages": [ + { + "role": "user", + "content": "Tell me a story as if we were playing dungeons and dragons." + } + ], + "stream": true, + "max_tokens": 30 +}' +``` + +## Cleanup + +To cleanup background `srun` processes launched by `srun_aggregated.sh` or +`srun_disaggregated.sh`, you can run: +```bash +pkill srun +``` + +## Known Issues + +- This example has only been tested on a 4xGB200 node setup with 16 GPUs using + FP4 weights. In theory, the example should work on alternative setups such as + H100 nodes with FP8 weights, but this hasn't been tested yet. +- WideEP configs in this directory are still being tested. A WideEP specific + example with documentation will be added once ready. +- There are known issues where WideEP workers may not cleanly shut down: + - This may lead to leftover shared memory files in `/dev/shm/moe_*`. For + now, you must manually clean these up before deploying again on the + same set of nodes. 
+ - Similarly, there may be GPU memory left in-use after killing the `srun` + jobs. After cleaning up any leftover shared memory files as described + above, the GPU memory may slowly come back. You can run `watch nvidia-smi` + to check on this behavior. If you don't free the GPU memory before the + next deployment, you may get a CUDA OOM error while loading the model. + - There is mention of this issue in the relevant TRT-LLM blog + [here](https://github.com/NVIDIA/TensorRT-LLM/blob/6021a439ab9c29f4c46f721eeb59f6b992c425ea/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#miscellaneous). diff --git a/fern/pages/backends/trtllm/prometheus.md b/fern/pages/backends/trtllm/prometheus.md new file mode 100644 index 00000000000..a55f920cc6a --- /dev/null +++ b/fern/pages/backends/trtllm/prometheus.md @@ -0,0 +1,192 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "TensorRT-LLM Prometheus Metrics" +--- + +## Overview + +When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm_`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint. + +Additional performance metrics are available via non-Prometheus APIs (see [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) below). + +As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm_` prefix is added by Dynamo. + +**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md). + +**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md). 
+ +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` | + +## Getting Started Quickly + +This is a single machine example. + +### Start Observability Stack + +For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](../../observability/README.md#getting-started-quickly) for instructions. + +### Launch Dynamo Components + +Launch a frontend and TensorRT-LLM backend to test metrics: + +```bash +# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +$ python -m dynamo.frontend + +# Enable system metrics server on port 8081 and enable metrics collection +$ DYN_SYSTEM_PORT=8081 python -m dynamo.trtllm --model --publish-events-and-metrics +``` + +**Note:** The `backend` must be set to `"pytorch"` for metrics collection (enforced in `components/src/dynamo/trtllm/main.py`). TensorRT-LLM's `MetricsCollector` integration has only been tested/validated with the PyTorch backend. + +Wait for the TensorRT-LLM worker to start, then send requests and check metrics: + +```bash +# Send a request +curl -H 'Content-Type: application/json' \ +-d '{ + "model": "", + "max_completion_tokens": 100, + "messages": [{"role": "user", "content": "Hello"}] +}' \ +http://localhost:8000/v1/chat/completions + +# Check metrics from the worker +curl -s localhost:8081/metrics | grep "^trtllm_" +``` + +## Exposed Metrics + +TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm_` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source. + +**Note:** TensorRT-LLM uses `model_name` instead of Dynamo's standard `model` label convention. 
+ +**Example Prometheus Exposition Format text:** + +``` +# HELP trtllm_request_success_total Count of successfully processed requests. +# TYPE trtllm_request_success_total counter +trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0 +trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0 + +# HELP trtllm_time_to_first_token_seconds Histogram of time to first token in seconds. +# TYPE trtllm_time_to_first_token_seconds histogram +trtllm_time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0 +trtllm_time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0 +trtllm_time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0 +trtllm_time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75 + +# HELP trtllm_e2e_request_latency_seconds Histogram of end to end request latency in seconds. +# TYPE trtllm_e2e_request_latency_seconds histogram +trtllm_e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0 +trtllm_e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0 +trtllm_e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2 + +# HELP trtllm_time_per_output_token_seconds Histogram of time per output token in seconds. +# TYPE trtllm_time_per_output_token_seconds histogram +trtllm_time_per_output_token_seconds_bucket{le="0.1",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 120.0 +trtllm_time_per_output_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0 +trtllm_time_per_output_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.5 + +# HELP trtllm_request_queue_time_seconds Histogram of time spent in WAITING phase for request. 
+# TYPE trtllm_request_queue_time_seconds histogram +trtllm_request_queue_time_seconds_bucket{le="1.0",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 140.0 +trtllm_request_queue_time_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0 +trtllm_request_queue_time_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 32.1 +``` + +**Note:** The specific metrics shown above are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list. + +### Metric Categories + +TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm_`): + +- **Request metrics** - Request success tracking and latency measurements +- **Performance metrics** - Time to first token (TTFT), time per output token (TPOT), and queue time + +**Note:** Metrics may change between TensorRT-LLM versions. Always inspect the `/metrics` endpoint for your version. + +## Available Metrics + +The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm_` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5: + +- `trtllm_request_success_total` (Counter) — Count of successfully processed requests by finish reason + - Labels: `model_name`, `engine_type`, `finished_reason` +- `trtllm_e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds) + - Labels: `model_name`, `engine_type` +- `trtllm_time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds) + - Labels: `model_name`, `engine_type` +- `trtllm_time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds) + - Labels: `model_name`, `engine_type` +- `trtllm_request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds) + - Labels: `model_name`, `engine_type` + +These metric names and availability are subject to change with TensorRT-LLM version updates. 
+ +TensorRT-LLM provides Prometheus metrics through the `MetricsCollector` class (see [tensorrt_llm/metrics/collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)). + +## Non-Prometheus Performance Metrics + +TensorRT-LLM provides extensive performance data beyond the basic Prometheus metrics. These are not currently exposed to Prometheus. + +### Available via Code References + +- **RequestPerfMetrics Structure**: [tensorrt_llm/executor/result.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/executor/result.py) - KV cache, timing, speculative decoding metrics +- **Engine Statistics**: `engine.llm.get_stats_async()` - System-wide aggregate statistics +- **KV Cache Events**: `engine.llm.get_kv_cache_events_async()` - Real-time cache operations + +### Example RequestPerfMetrics JSON Structure + +```json +{ + "timing_metrics": { + "arrival_time": 1234567890.123, + "first_scheduled_time": 1234567890.135, + "first_token_time": 1234567890.150, + "last_token_time": 1234567890.300, + "kv_cache_size": 2048576, + "kv_cache_transfer_start": 1234567890.140, + "kv_cache_transfer_end": 1234567890.145 + }, + "kv_cache_metrics": { + "num_total_allocated_blocks": 100, + "num_new_allocated_blocks": 10, + "num_reused_blocks": 90, + "num_missed_blocks": 5 + }, + "speculative_decoding": { + "acceptance_rate": 0.85, + "total_accepted_draft_tokens": 42, + "total_draft_tokens": 50 + } +} +``` + +**Note:** These structures are valid as of the date of this documentation but are subject to change with TensorRT-LLM version updates. 
+ +## Implementation Details + +- **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)) +- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm_"` +- **Engine Configuration**: `return_perf_metrics` set to `True` when `--publish-events-and-metrics` is enabled +- **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes +- **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type) + +## Related Documentation + +### TensorRT-LLM Metrics +- See the [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) section above for detailed performance data and source code references +- [TensorRT-LLM Metrics Collector](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py) - Source code reference + +### Dynamo Metrics +- [Dynamo Metrics Guide](../../observability/metrics.md) - Complete documentation on Dynamo runtime metrics +- [Prometheus and Grafana Setup](../../observability/prometheus-grafana.md) - Visualization setup instructions +- Dynamo runtime metrics (prefixed with `dynamo_*`) are available at the same `/metrics` endpoint alongside TensorRT-LLM metrics + - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics) + - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants) + - Integration code: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration diff --git a/fern/pages/backends/vllm/LMCache-Integration.md b/fern/pages/backends/vllm/LMCache-Integration.md new file mode 100644 index 00000000000..fa202e201a1 --- /dev/null +++ b/fern/pages/backends/vllm/LMCache-Integration.md @@ -0,0 +1,211 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "LMCache Integration in Dynamo" +--- + +## Introduction + +LMCache is a high-performance KV cache layer that supercharges LLM serving by enabling **prefill-once, reuse-everywhere** semantics. As described in the [official documentation](https://docs.lmcache.ai/index.html), LMCache lets LLMs prefill each text only once by storing the KV caches of all reusable texts, allowing reuse of KV caches for any reused text (not necessarily prefix) across any serving engine instance. + +This document describes how LMCache is integrated into Dynamo's vLLM backend to provide enhanced performance and memory efficiency. + +### Key Benefits +- **Reduced Time to First Token (TTFT)**: Eliminates redundant prefill computations +- **Memory Offloading**: Intelligent KV cache placement across CPU/GPU/storage tiers +- **Improved Throughput**: Reduced GPU memory pressure enables higher batch sizes + +## Platform Support + +**Important Note**: LMCache integration currently only supports x86 architecture. ARM64 is not supported at this time. + +## Aggregated Serving + + +### Configuration + +LMCache is enabled using the `--connector lmcache` flag: + +```bash +python -m dynamo.vllm --model --connector lmcache +``` + +### Customization + +LMCache configuration can be customized via environment variables listed [here](https://docs.lmcache.ai/api_reference/configurations.html). + +For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html): +- **CPU RAM**: Fast local memory offloading +- **Local Storage**: Disk-based persistence +- **Redis**: Distributed cache sharing +- **GDS Backend**: GPU Direct Storage for high throughput +- **InfiniStore/Mooncake**: Cloud-native storage solutions + +### Deployment + +Use the provided launch script for quick setup: + +```bash +./examples/backends/vllm/launch/agg_lmcache.sh +``` + +This will: +1. Start the dynamo frontend +2. 
Launch a single vLLM worker with LMCache enabled
+
+### Architecture for Aggregated Mode
+
+In aggregated mode, the system uses:
+- **KV Connector**: `LMCacheConnectorV1`
+- **KV Role**: `kv_both` (handles both reading and writing)
+
+## Disaggregated Serving
+
+Disaggregated serving separates prefill and decode operations into dedicated workers. This provides better resource utilization and scalability for production deployments.
+
+### Deployment
+
+Use the provided disaggregated launch script (the script requires at least 2 GPUs):
+
+```bash
+./examples/backends/vllm/launch/disagg_lmcache.sh
+```
+
+This will:
+1. Start the dynamo frontend
+2. Launch a decode worker on GPU 0
+3. Wait for initialization
+4. Launch a prefill worker on GPU 1 with LMCache enabled
+
+### Worker Roles
+
+#### Decode Worker
+- **Purpose**: Handles token generation (decode phase)
+- **GPU Assignment**: CUDA_VISIBLE_DEVICES=0
+- **LMCache Config**: Uses `NixlConnector` only for kv transfer between prefill and decode workers
+
+#### Prefill Worker
+- **Purpose**: Handles prompt processing (prefill phase)
+- **GPU Assignment**: CUDA_VISIBLE_DEVICES=1
+- **LMCache Config**: Uses `MultiConnector` with both LMCache and NIXL connectors. This enables the prefill worker to use LMCache for kv offloading and use NIXL for kv transfer between prefill and decode workers.
+- **Flag**: `--is-prefill-worker` + +## Architecture + +### KV Transfer Configuration + +The system automatically configures KV transfer based on the deployment mode and worker type: + +#### Prefill Worker (Disaggregated Mode) +```python +kv_transfer_config = KVTransferConfig( + kv_connector="PdConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "connectors": [ + {"kv_connector": "LMCacheConnectorV1", "kv_role": "kv_both"}, + {"kv_connector": "NixlConnector", "kv_role": "kv_both"} + ] + } +) +``` + +#### Decode Worker or Aggregated Mode +```python +kv_transfer_config = KVTransferConfig( + kv_connector="LMCacheConnectorV1", + kv_role="kv_both" +) +``` + +#### Fallback (No LMCache) +```python +kv_transfer_config = KVTransferConfig( + kv_connector="NixlConnector", + kv_role="kv_both" +) +``` + +### Integration Points + +1. **Argument Parsing** (`args.py`): + - Configures appropriate KV transfer settings + - Sets up connector configurations based on worker type + +2. **Engine Setup** (`main.py`): + - Initializes LMCache environment variables + - Creates vLLM engine with proper KV transfer config + - Handles both aggregated and disaggregated modes + + +### Best Practices + +1. **Chunk Size Tuning**: Adjust `LMCACHE_CHUNK_SIZE` based on your use case: + - Smaller chunks (128-256): Better reuse granularity for varied content + - Larger chunks (512-1024): More efficient for repetitive content patterns + +2. **Memory Allocation**: Set `LMCACHE_MAX_LOCAL_CPU_SIZE` conservatively: + - Leave sufficient RAM for other system processes + - Monitor memory usage during peak loads + +3. 
**Workload Optimization**: LMCache performs best with: + - Repeated prompt patterns (RAG, multi-turn conversations) + - Shared context across sessions + - Long-running services with warm caches + +## Metrics and Monitoring + +When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics. + +**Requirements to access LMCache metrics:** +- `--connector lmcache` - Enables LMCache +- `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint +- `PROMETHEUS_MULTIPROC_DIR` (optional) - If not set, Dynamo manages it internally. Only set explicitly if you need control over the metrics directory. + +For detailed information on LMCache metrics, including the complete list of available metrics and how to access them, see the **[LMCache Metrics section](prometheus.md#lmcache-metrics)** in the vLLM Prometheus Metrics Guide. + +### Troubleshooting + +#### LMCache log: `PrometheusLogger instance already created with different metadata` + +You may see an error like: + +```text +LMCache ERROR: PrometheusLogger instance already created with different metadata. This should not happen except in test +``` + +**Version note**: We reproduced this behavior with **vLLM v0.12.0**. We have not reproduced it with **vLLM v0.11.0**, so it may be specific to (or introduced in) v0.12.0. + +This is emitted by LMCache when the LMCache connector is initialized more than once in the same process (for example, once for a `WORKER` role and later for a `SCHEDULER` role). LMCache uses a process-global singleton for its Prometheus logger, so the second initialization can log this warning if its metadata differs. + +- **Impact**: This is a log-only error; in our testing it does not prevent vLLM/Dynamo from serving requests. If you care about LMCache metric labels, be aware the logger singleton uses the first-seen metadata. 
+- **Repro without Dynamo** (vLLM v0.12.0): + +```bash +vllm serve Qwen/Qwen3-0.6B \ + --host 127.0.0.1 --port 18000 \ + --gpu-memory-utilization 0.24 \ + --enforce-eager \ + --no-enable-prefix-caching \ + --max-num-seqs 2 \ + --kv-offloading-backend lmcache \ + --kv-offloading-size 1 \ + --disable-hybrid-kv-cache-manager +``` + +- **Mitigation (silence)**: set `LMCACHE_LOG_LEVEL=CRITICAL`. +- **Upstream issue**: [vLLM issue #30996](https://github.com/vllm-project/vllm/issues/30996). + +#### vLLM log: `Found PROMETHEUS_MULTIPROC_DIR was set by user` + +vLLM v1 uses `prometheus_client.multiprocess` and stores intermediate metric values in `PROMETHEUS_MULTIPROC_DIR`. + +- If you **set `PROMETHEUS_MULTIPROC_DIR` yourself**, vLLM warns that the directory must be wiped between runs to avoid stale/incorrect metrics. +- When running via Dynamo, the vLLM wrapper may set `PROMETHEUS_MULTIPROC_DIR` internally to a temporary directory to avoid vLLM cleanup issues. If you still see the warning, confirm you are not exporting `PROMETHEUS_MULTIPROC_DIR` in your shell or container environment. + +## References and Additional Resources + +- [LMCache Documentation](https://docs.lmcache.ai/index.html) - Comprehensive guide and API reference +- [Configuration Reference](https://docs.lmcache.ai/api_reference/configurations.html) - Detailed configuration options +- [LMCache Observability Guide](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Metrics and monitoring details + diff --git a/fern/pages/backends/vllm/README.md b/fern/pages/backends/vllm/README.md new file mode 100644 index 00000000000..d713b424583 --- /dev/null +++ b/fern/pages/backends/vllm/README.md @@ -0,0 +1,199 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "LLM Deployment using vLLM" +--- + +This directory contains reference implementations for deploying Large Language Models (LLMs) in various configurations using vLLM. For Dynamo integration, we leverage vLLM's native KV cache events, NIXL based transfer mechanisms, and metric reporting to enable KV-aware routing and P/D disaggregation. + +## Use the Latest Release + +We recommend using the latest stable release of Dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +--- + +## Table of Contents +- [Feature Support Matrix](#feature-support-matrix) +- [Quick Start](#vllm-quick-start) +- [Single Node Examples](#run-single-node-examples) +- [Advanced Examples](#advanced-examples) +- [Deploy on Kubernetes](#kubernetes-deployment) +- [Configuration](#configuration) + +## Feature Support Matrix + +### Core Dynamo Features + +| Feature | vLLM | Notes | +|---------|------|-------| +| [**Disaggregated Serving**](../../design-docs/disagg-serving.md) | ✅ | | +| [**Conditional Disaggregation**](../../design-docs/disagg-serving.md#conditional-disaggregation) | 🚧 | WIP | +| [**KV-Aware Routing**](../../router/kv-cache-routing.md) | ✅ | | +| [**SLA-Based Planner**](../../planner/sla-planner.md) | ✅ | | +| [**Load Based Planner**](../../planner/load-planner.md) | 🚧 | WIP | +| [**KVBM**](../../kvbm/kvbm-architecture.md) | ✅ | | +| [**LMCache**](LMCache-Integration.md) | ✅ | | +| [**Prompt Embeddings**](prompt-embeddings.md) | ✅ | Requires `--enable-prompt-embeds` flag | + +### Large Scale P/D and WideEP Features + +| Feature | vLLM | Notes | 
+|--------------------|------|-----------------------------------------------------------------------|
+| **WideEP** | ✅ | Support for PPLX / DeepEP not verified |
+| **DP Rank Routing**| ✅ | Supported via external control of DP ranks |
+| **GB200 Support** | 🚧 | Container functional on main |
+
+## vLLM Quick Start
+
+Below we provide a guide that lets you run all of the common deployment patterns on a single node.
+
+### Start NATS and ETCD in the background
+
+Start using [Docker Compose](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml)
+
+```bash
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+### Pull or build container
+
+We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). If you'd like to build your own container from source:
+
+```bash
+./container/build.sh --framework VLLM
+```
+
+### Run container
+
+```bash
+./container/run.sh -it --framework VLLM [--mount-workspace]
+```
+
+This includes the specific commit [vllm-project/vllm#19790](https://github.com/vllm-project/vllm/pull/19790) which enables support for external control of the DP ranks.
+
+## Run Single Node Examples
+
+
+Below we provide simple shell scripts that run the components for each configuration. Each shell script runs `python3 -m dynamo.frontend` to start the ingress and uses `python3 -m dynamo.vllm` to start the vLLM workers. You can also run each command in separate terminals for better log visibility.
+ + +### Aggregated Serving + +```bash +# requires one gpu +cd examples/backends/vllm +bash launch/agg.sh +``` + +### Aggregated Serving with KV Routing + +```bash +# requires two gpus +cd examples/backends/vllm +bash launch/agg_router.sh +``` + +### Disaggregated Serving + +```bash +# requires two gpus +cd examples/backends/vllm +bash launch/disagg.sh +``` + +### Disaggregated Serving with KV Routing + +```bash +# requires three gpus +cd examples/backends/vllm +bash launch/disagg_router.sh +``` + +### Single Node Data Parallel Attention / Expert Parallelism + +This example is not meant to be performant but showcases Dynamo routing to data parallel workers + +```bash +# requires four gpus +cd examples/backends/vllm +bash launch/dep.sh +``` + + +Run a disaggregated example and try adding another prefill worker once the setup is running! The system will automatically discover and utilize the new worker. + + +## Advanced Examples + +Below we provide a selected list of advanced deployments. Please open up an issue if you'd like to see a specific example! + +### Speculative Decoding with Aggregated Serving (Meta-Llama-3.1-8B-Instruct + Eagle3) + +Run **Meta-Llama-3.1-8B-Instruct** with **Eagle3** as a draft model using **aggregated speculative decoding** on a single node. +This setup demonstrates how to use Dynamo to create an instance using Eagle-based speculative decoding under the **VLLM aggregated serving framework** for faster inference while maintaining accuracy. + +**Guide:** [Speculative Decoding Quickstart](speculative-decoding.md) + +### Kubernetes Deployment + +For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [vLLM Kubernetes Deployment Guide](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/README.md) + +## Configuration + +vLLM workers are configured through command-line arguments. 
Key parameters include: + +- `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`) +- `--is-prefill-worker`: Enable prefill-only mode for disaggregated serving +- `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo +- `--connector`: Specify which kv_transfer_config you want vllm to use `[nixl, lmcache, kvbm, none]`. This is a helper flag which overwrites the engines KVTransferConfig. +- `--enable-prompt-embeds`: **Enable prompt embeddings feature** (opt-in, default: disabled) + - **Required for:** Accepting pre-computed prompt embeddings via API + - **Default behavior:** Prompt embeddings DISABLED - requests with `prompt_embeds` will fail + - **Error without flag:** `ValueError: You must set --enable-prompt-embeds to input prompt_embeds` + +See `args.py` for the full list of configuration options and their defaults. + +The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the vLLM CLI args points to running 'vllm serve --help' to see what CLI args can be added. We use the same argument parser as vLLM. + +### Hashing Consistency for KV Events + +When using KV-aware routing, ensure deterministic hashing across processes to avoid radix tree mismatches. Choose one of the following: + +- Set `PYTHONHASHSEED=0` for all vLLM processes when relying on Python's builtin hashing for prefix caching. +- If your vLLM version supports it, configure a deterministic prefix caching algorithm, for example: + +```bash +vllm serve ... --enable-prefix-caching --prefix-caching-algo sha256 +``` +See the high-level notes in [KV Cache Routing](../../router/kv-cache-routing.md) on deterministic event IDs. + +## Request Migration + +You can enable [request migration](../../fault-tolerance/request-migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker: + +```bash +python3 -m dynamo.vllm ... 
--migration-limit=3
+```
+
+This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../fault-tolerance/request-migration.md) documentation for details on how this works.
+
+## Request Cancellation
+
+When a user cancels a request (e.g., by disconnecting from the frontend), the request is automatically cancelled across all workers, freeing compute resources for other requests.
+
+### Cancellation Support Matrix
+
+| | Prefill | Decode |
+|-|---------|--------|
+| **Aggregated** | ✅ | ✅ |
+| **Disaggregated** | ✅ | ✅ |
+
+For more details, see the [Request Cancellation Architecture](../../fault-tolerance/request-cancellation.md) documentation.
diff --git a/fern/pages/backends/vllm/deepseek-r1.md b/fern/pages/backends/vllm/deepseek-r1.md
new file mode 100644
index 00000000000..d502b2b161b
--- /dev/null
+++ b/fern/pages/backends/vllm/deepseek-r1.md
@@ -0,0 +1,41 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Running Deepseek R1 with Wide EP"
+---
+
+Dynamo supports running Deepseek R1 with data parallel attention and wide expert parallelism. Each data parallel attention rank is a separate dynamo component that will emit its own KV Events and Metrics. vLLM controls the expert parallelism using the flag `--enable-expert-parallel`
+
+## Instructions
+
+The following script can be adapted to run Deepseek R1 with a variety of different configurations. The current configuration uses 2 nodes, 16 GPUs, and a dp of 16. Follow the [vLLM Backend](README.md) Getting Started section on each node, and then run these two commands.
+ +node 0 +```bash +./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr +``` + +node 1 +```bash +./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr +``` + +### Testing the Deployment + +On node 0 (where the frontend was started) send a test request to verify your deployment: + +```bash +curl localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek-ai/DeepSeek-R1", + "messages": [ + { + "role": "user", + "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." + } + ], + "stream": false, + "max_tokens": 30 + }' +``` diff --git a/fern/pages/backends/vllm/gpt-oss.md b/fern/pages/backends/vllm/gpt-oss.md new file mode 100644 index 00000000000..8cc89f98993 --- /dev/null +++ b/fern/pages/backends/vllm/gpt-oss.md @@ -0,0 +1,286 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Running gpt-oss-120b Disaggregated with vLLM" +--- + +Dynamo supports disaggregated serving of gpt-oss-120b with vLLM. 
This guide demonstrates how to deploy gpt-oss-120b using disaggregated prefill/decode serving on a single H100 node with 8 GPUs, running 1 prefill worker on 4 GPUs and 1 decode worker on 4 GPUs.
+
+## Overview
+
+This deployment uses disaggregated serving in vLLM where:
+- **Prefill Worker**: Processes input prompts efficiently using 4 GPUs with tensor parallelism
+- **Decode Worker**: Generates output tokens using 4 GPUs, optimized for token generation throughput
+- **Frontend**: Provides OpenAI-compatible API endpoint with round-robin routing
+
+## Prerequisites
+
+This guide assumes readers already know how to deploy Dynamo disaggregated serving with vLLM as illustrated in [README.md](README.md).
+
+## Instructions
+
+### 1. Launch the Deployment
+
+Note that GPT-OSS is a reasoning model with tool calling support. To
+ensure the response is being processed correctly, the worker should be
+launched with proper `--dyn-reasoning-parser` and `--dyn-tool-call-parser`.
+
+**Start frontend**
+```bash
+python3 -m dynamo.frontend --http-port 8000 &
+```
+
+**Run decode worker**
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m dynamo.vllm \
+ --model openai/gpt-oss-120b \
+ --tensor-parallel-size 4 \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony
+```
+
+**Run prefill workers**
+```bash
+CUDA_VISIBLE_DEVICES=4,5,6,7 python -m dynamo.vllm \
+ --model openai/gpt-oss-120b \
+ --tensor-parallel-size 4 \
+ --is-prefill-worker \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony
+```
+
+### 2.
Verify the Deployment is Ready
+
+Poll the `/health` endpoint to verify that both the prefill and decode worker endpoints have started:
+```
+curl http://localhost:8000/health
+```
+
+Make sure that both of the `generate` endpoints are available before sending an inference request:
+```
+{
+ "status": "healthy",
+ "endpoints": [
+ "dyn://dynamo.backend.generate"
+ ],
+ "instances": [
+ {
+ "component": "backend",
+ "endpoint": "generate",
+ "namespace": "dynamo",
+ "instance_id": 7587889712474989333,
+ "transport": {
+ "nats_tcp": "dynamo_backend.generate-694d997dbae9a315"
+ }
+ },
+ {
+ "component": "prefill",
+ "endpoint": "generate",
+ "namespace": "dynamo",
+ "instance_id": 7587889712474989350,
+ "transport": {
+ "nats_tcp": "dynamo_prefill.generate-694d997dbae9a326"
+ }
+ },
+ ...
+ ]
+}
+```
+
+If only one worker endpoint is listed, the other may still be starting up. Monitor the worker logs to track startup progress.
+
+### 3. Test the Deployment
+
+Send a test request to verify the deployment:
+
+```bash
+curl -X POST http://localhost:8000/v1/responses \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "openai/gpt-oss-120b",
+ "input": "Explain the concept of disaggregated serving in LLM inference in 3 sentences.",
+ "max_output_tokens": 200,
+ "stream": false
+ }'
+```
+
+The server exposes a standard OpenAI-compatible API endpoint that accepts JSON requests. You can adjust parameters like `max_tokens`, `temperature`, and others according to your needs.
+
+### 4. Reasoning and Tool Calling
+
+Dynamo supports reasoning and tool calling in the OpenAI Chat Completions endpoint. A typical workflow for an application built on top of Dynamo
+is that the application has a set of tools to help the assistant provide accurate answers, and it is usually
+multi-turn as it involves tool selection and generation based on the tool result.
Below is an example +of sending multi-round requests to complete a user query with reasoning and tool calling: + +**Application setup (pseudocode)** +```Python +# The tool defined by the application +def get_system_health(): + for component in system.components: + if not component.health(): + return False + return True + +# The JSON representation of the declaration in ChatCompletion tool style +tool_choice = '{ + "type": "function", + "function": { + "name": "get_system_health", + "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.", + "parameters": { + "type": "object", + "properties": {} + } + } +}' + +# On user query, perform below workflow. +def user_query(app_request): + # first round + # create chat completion with prompt and tool choice + request = ... + response = send(request) + + if response["finish_reason"] == "tool_calls": + # second round + function, params = parse_tool_call(response) + function_result = function(params) + # create request with prompt, assistant response, and function result + request = ... + response = send(request) + return app_response(response) +``` + + +**First request with tools** +```bash +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d ' +{ + "model": "openai/gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": "Hey, quick check: is everything up and running?" 
+ } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_system_health", + "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.", + "parameters": { + "type": "object", + "properties": {} + } + } + } + ], + "response_format": { + "type": "text" + }, + "stream": false, + "max_tokens": 300 +}' +``` +**First response with tool choice** +```JSON +{ + "id": "chatcmpl-d1c12219-6298-4c83-a6e3-4e7cef16e1a9", + "choices": [ + { + "index": 0, + "message": { + "tool_calls": [ + { + "id": "call-1", + "type": "function", + "function": { + "name": "get_system_health", + "arguments": "{}" + } + } + ], + "role": "assistant", + "reasoning_content": "We need to check system health. Use function." + }, + "finish_reason": "tool_calls" + } + ], + "created": 1758758741, + "model": "openai/gpt-oss-120b", + "object": "chat.completion", + "usage": null +} +``` +**Second request with tool calling result** +```bash +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d ' +{ + "model": "openai/gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": "Hey, quick check: is everything up and running?" 
+ }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call-1", + "type": "function", + "function": { + "name": "get_system_health", + "arguments": "{}" + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "call-1", + "content": "{\"status\":\"ok\",\"uptime_seconds\":372045}" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_system_health", + "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.", + "parameters": { + "type": "object", + "properties": {} + } + } + } + ], + "response_format": { + "type": "text" + }, + "stream": false, + "max_tokens": 300 +}' +``` +**Second response with final message** +```JSON +{ + "id": "chatcmpl-9ebfe64a-68b9-4c1d-9742-644cf770ad0e", + "choices": [ + { + "index": 0, + "message": { + "content": "All systems are green—everything’s up and running smoothly! 🚀 Let me know if you need anything else.", + "role": "assistant", + "reasoning_content": "The user asks: \"Hey, quick check: is everything up and running?\" We have just checked system health, it's ok. Provide friendly response confirming everything's up." + }, + "finish_reason": "stop" + } + ], + "created": 1758758853, + "model": "openai/gpt-oss-120b", + "object": "chat.completion", + "usage": null +} +``` \ No newline at end of file diff --git a/fern/pages/backends/vllm/multi-node.md b/fern/pages/backends/vllm/multi-node.md new file mode 100644 index 00000000000..22d1981ed75 --- /dev/null +++ b/fern/pages/backends/vllm/multi-node.md @@ -0,0 +1,95 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Multi-node Examples" +--- + +This guide covers deploying vLLM across multiple nodes using Dynamo's distributed capabilities. 
+
+## Prerequisites
+
+Multi-node deployments require:
+- Multiple nodes with GPU resources
+- Network connectivity between nodes (the faster the better)
+- Firewall rules allowing NATS/ETCD communication
+
+## Infrastructure Setup
+
+### Step 1: Start NATS/ETCD on Head Node
+
+Start the required services on your head node. These endpoints must be accessible from all worker nodes:
+
+```bash
+# On head node (node-1)
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+Default ports:
+- NATS: 4222
+- ETCD: 2379
+
+### Step 2: Configure Environment Variables
+
+Set the head node IP address and service endpoints. **Set this on all nodes** for easy copy-paste:
+
+```bash
+# Set this on ALL nodes - replace with your actual head node IP
+export HEAD_NODE_IP=""
+
+# Service endpoints (set on all nodes)
+export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
+export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
+```
+
+## Deployment Patterns
+
+### Multi-node Aggregated Serving
+
+Deploy vLLM workers across multiple nodes for horizontal scaling:
+
+**Node 1 (Head Node)**: Run ingress and first worker
+```bash
+# Start ingress
+python -m dynamo.frontend --router-mode kv
+
+# Start vLLM worker
+python -m dynamo.vllm \
+ --model meta-llama/Llama-3.3-70B-Instruct \
+ --tensor-parallel-size 8 \
+ --enforce-eager
+```
+
+**Node 2**: Run additional worker
+```bash
+# Start vLLM worker
+python -m dynamo.vllm \
+ --model meta-llama/Llama-3.3-70B-Instruct \
+ --tensor-parallel-size 8 \
+ --enforce-eager
+```
+
+### Multi-node Disaggregated Serving
+
+Deploy prefill and decode workers on separate nodes for optimized resource utilization:
+
+**Node 1**: Run ingress and decode worker
+```bash
+# Start ingress
+python -m dynamo.frontend --router-mode kv &
+
+# Start decode worker
+python -m dynamo.vllm \
+ --model meta-llama/Llama-3.3-70B-Instruct \
+ --tensor-parallel-size 8 \
+ --enforce-eager
+```
+
+**Node 2**: Run prefill worker
+```bash
+# Start prefill worker
+python -m dynamo.vllm \
+ 
--model meta-llama/Llama-3.3-70B-Instruct \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --is-prefill-worker +``` diff --git a/fern/pages/backends/vllm/prometheus.md b/fern/pages/backends/vllm/prometheus.md new file mode 100644 index 00000000000..f15c920ad69 --- /dev/null +++ b/fern/pages/backends/vllm/prometheus.md @@ -0,0 +1,164 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "vLLM Prometheus Metrics" +--- + +## Overview + +When running vLLM through Dynamo, vLLM engine metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both vLLM engine metrics (prefixed with `vllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint. + +**For the complete and authoritative list of all vLLM metrics**, always refer to the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html). + +**For LMCache metrics and integration**, see the [LMCache Integration Guide](LMCache-Integration.md). + +**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md). + +**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md). + +## Environment Variables and Flags + +| Variable/Flag | Description | Default | Example | +|---------------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` | +| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` | + +## Getting Started Quickly + +This is a single machine example. + +### Start Observability Stack + +For visualizing metrics with Prometheus and Grafana, start the observability stack. 
See [Observability Getting Started](../../observability/README.md#getting-started-quickly) for instructions. + +### Launch Dynamo Components + +Launch a frontend and vLLM backend to test metrics: + +```bash +# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +$ python -m dynamo.frontend + +# Enable system metrics server on port 8081 +$ DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model \ + --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +``` + +Wait for the vLLM worker to start, then send requests and check metrics: + +```bash +# Send a request +curl -H 'Content-Type: application/json' \ +-d '{ + "model": "", + "max_completion_tokens": 100, + "messages": [{"role": "user", "content": "Hello"}] +}' \ +http://localhost:8000/v1/chat/completions + +# Check metrics from the worker +curl -s localhost:8081/metrics | grep "^vllm:" +``` + +## Exposed Metrics + +vLLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All vLLM engine metrics use the `vllm:` prefix and include labels (e.g., `model_name`, `finished_reason`, `scheduling_event`) to identify the source. + +**Example Prometheus Exposition Format text:** + +``` +# HELP vllm:request_success_total Number of successfully finished requests. +# TYPE vllm:request_success_total counter +vllm:request_success_total{finished_reason="length",model_name="meta-llama/Llama-3.1-8B"} 15.0 +vllm:request_success_total{finished_reason="stop",model_name="meta-llama/Llama-3.1-8B"} 150.0 + +# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds. 
+# TYPE vllm:time_to_first_token_seconds histogram +vllm:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B"} 0.0 +vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B"} 5.0 +vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0 +vllm:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B"} 89.38 +``` + +**Note:** The specific metrics shown above are examples and may vary depending on your vLLM version. Always inspect your actual `/metrics` endpoint or refer to the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) for the current list. + +### Metric Categories + +vLLM provides metrics in the following categories (all prefixed with `vllm:`): + +- **Request metrics** - Request success, failure, and completion tracking +- **Performance metrics** - Latency, throughput, and timing measurements +- **Resource usage** - System resource consumption +- **Scheduler metrics** - Scheduling and queue management +- **Disaggregation metrics** - Metrics specific to disaggregated deployments (when enabled) + +**Note:** Specific metrics are subject to change between vLLM versions. Always refer to the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) or inspect the `/metrics` endpoint for your vLLM version. + +## Available Metrics + +The official vLLM documentation includes complete metric definitions with: +- Detailed explanations and design rationale +- Counter, Gauge, and Histogram metric types +- Metric labels (e.g., `model_name`, `finished_reason`, `scheduling_event`) +- Information about v1 metrics migration +- Future work and deprecated metrics + +For the complete and authoritative list of all vLLM metrics, see the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html). 
+ +## LMCache Metrics + +When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics. + +### Minimum Requirements + +To access LMCache metrics, both of these are required: +1. `--connector lmcache` - Enables LMCache in vLLM +2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint + +**Example:** +```bash +DYN_SYSTEM_PORT=8081 \ +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache +``` + +### Viewing LMCache Metrics + +```bash +# View all LMCache metrics +curl -s localhost:8081/metrics | grep "^lmcache:" +``` + +### Troubleshooting + +Troubleshooting LMCache-related metrics and logs (including `PrometheusLogger instance already created with different metadata` and `PROMETHEUS_MULTIPROC_DIR` warnings) is documented in: + +- [LMCache Integration Guide](LMCache-Integration.md#troubleshooting) + +**For complete LMCache configuration and metric details**, see: +- [LMCache Integration Guide](LMCache-Integration.md) - Setup and configuration +- [LMCache Observability Documentation](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Complete metrics reference + +## Implementation Details + +- vLLM v1 uses multiprocess metrics collection via `prometheus_client.multiprocess` +- `PROMETHEUS_MULTIPROC_DIR`: (optional). By default, Dynamo automatically manages this environment variable, setting it to a temporary directory where multiprocess metrics are stored as memory-mapped files. Each worker process writes its metrics to separate files in this directory, which are aggregated when `/metrics` is scraped. Users only need to set this explicitly where complete control over the metrics directory is required. 
+- Dynamo uses `MultiProcessCollector` to aggregate metrics from all worker processes +- Metrics are filtered by the `vllm:` and `lmcache:` prefixes before being exposed (when LMCache is enabled) +- The integration uses Dynamo's `register_engine_metrics_callback()` function with the global `REGISTRY` +- Metrics appear after vLLM engine initialization completes +- vLLM v1 metrics are different from v0 - see the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) for migration details + +## Related Documentation + +### vLLM Metrics +- [Official vLLM Metrics Design Documentation](https://docs.vllm.ai/en/latest/design/metrics.html) +- [vLLM Production Metrics User Guide](https://docs.vllm.ai/en/latest/usage/metrics.html) +- [vLLM GitHub - Metrics Implementation](https://github.com/vllm-project/vllm/tree/main/vllm/v1/metrics) + +### Dynamo Metrics +- [Dynamo Metrics Guide](../../observability/metrics.md) - Complete documentation on Dynamo runtime metrics +- [Prometheus and Grafana Setup](../../observability/prometheus-grafana.md) - Visualization setup instructions +- Dynamo runtime metrics (prefixed with `dynamo_*`) are available at the same `/metrics` endpoint alongside vLLM metrics + - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics) + - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants) + - Integration code: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration diff --git a/fern/pages/backends/vllm/prompt-embeddings.md b/fern/pages/backends/vllm/prompt-embeddings.md new file mode 100644 index 00000000000..09221f5498e --- /dev/null +++ b/fern/pages/backends/vllm/prompt-embeddings.md @@ -0,0 +1,253 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Prompt Embeddings" +--- + +Dynamo supports prompt embeddings (also known as prompt embeds) as a secure alternative input method to traditional text prompts. By allowing applications to use pre-computed embeddings for inference, this feature not only offers greater flexibility in prompt engineering but also significantly enhances privacy and data security. With prompt embeddings, sensitive user data can be transformed into embeddings before ever reaching the inference server, reducing the risk of exposing confidential information during the AI workflow. + + +## How It Works + +| Path | What Happens | +|------|--------------| +| **Text prompt** | Tokenize → Embedding Layer → Transformer | +| **Prompt embeds** | Validate → Bypass Embedding → Transformer | + + +## Architecture + +```mermaid +flowchart LR + subgraph FE["Frontend (Rust)"] + A[Request] --> B{prompt_embeds?} + B -->|No| C[🔴 Tokenize text] + B -->|Yes| D[🟢 Validate base64+size] + C --> E[token_ids, ISL=N] + D --> F[token_ids=empty, skip ISL] + end + + subgraph RT["Router (NATS)"] + G[Route PreprocessedRequest] + end + + subgraph WK["Worker (Python)"] + H[TokensPrompt#40;token_ids#41;] + I[Decode → EmbedsPrompt#40;tensor#41;] + end + + subgraph VLLM["vLLM Engine"] + J[🔴 Embedding Layer] + K[🟢 Bypass Embedding] + L[Transformer Layers] + M[LM Head → Response] + end + + E --> G + F --> G + G -->|Normal| H + G -->|Embeds| I + H --> J --> L + I --> K --> L + L --> M +``` + +| Layer | **Normal Flow** | **Prompt Embeds** | +|---|---|---| +| **Frontend (Rust)** | 🔴 Tokenize text → token_ids, compute ISL | 🟢 Validate base64+size, skip tokenization | +| **Router (NATS)** | Forward token_ids in PreprocessedRequest | Forward prompt_embeds string | +| **Worker (Python)** | `TokensPrompt(token_ids)` | Decode base64 → `EmbedsPrompt(tensor)` | +| **vLLM Engine** | 🔴 Embedding Layer → Transformer | 🟢 Bypass Embedding → Transformer | + + +## Quick Start + +Send pre-computed 
prompt embeddings directly to vLLM, bypassing tokenization. + +### 1. Enable Feature + +```bash +python -m dynamo.vllm --model --enable-prompt-embeds +``` + +> **Required:** The `--enable-prompt-embeds` flag must be set or requests will fail. + +### 2. Send Request + +```python +import torch +import base64 +import io +from openai import OpenAI + +# Prepare embeddings (sequence_length, hidden_dim) +embeddings = torch.randn(10, 4096, dtype=torch.float32) + +# Encode +buffer = io.BytesIO() +torch.save(embeddings, buffer) +buffer.seek(0) +embeddings_base64 = base64.b64encode(buffer.read()).decode() + +# Send +client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY") +response = client.completions.create( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + prompt="", # Can be empty or present; prompt_embeds takes precedence + max_tokens=100, + extra_body={"prompt_embeds": embeddings_base64} +) +``` + +## Configuration + +### Docker Compose + +```yaml +vllm-worker: + command: + - python + - -m + - dynamo.vllm + - --model + - meta-llama/Meta-Llama-3.1-8B-Instruct + - --enable-prompt-embeds # Add this +``` + +### Kubernetes + +```yaml +extraPodSpec: + mainContainer: + args: + - "--model" + - "meta-llama/Meta-Llama-3.1-8B-Instruct" + - "--enable-prompt-embeds" # Add this +``` + +### NATS Configuration + +NATS needs 15MB payload limit (already configured in default deployments): + +```yaml +# Docker Compose - deploy/docker-compose.yml +nats-server: + command: ["-js", "--trace", "-m", "8222", "--max_payload", "15728640"] + +# Kubernetes - deploy/cloud/helm/platform/values.yaml +nats: + config: + merge: + max_payload: 15728640 +``` + +## API Reference + +### Request + +```json +{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": "", + "prompt_embeds": "", + "max_tokens": 100 +} +``` + +**Requirements:** +- **Format:** PyTorch tensor serialized with `torch.save()` and base64-encoded +- **Size:** 100 bytes - 10MB (decoded) +- **Shape:** `(seq_len, 
hidden_dim)` or `(batch, seq_len, hidden_dim)` +- **Dtype:** `torch.float32` (recommended) + +**Field Precedence:** +- Both `prompt` and `prompt_embeds` can be provided in the same request +- When both are present, **`prompt_embeds` takes precedence** and `prompt` is ignored +- The `prompt` field can be empty (`""`) when using `prompt_embeds` + +### Response + +Standard OpenAI format with accurate usage: + +```json +{ + "usage": { + "prompt_tokens": 10, // Extracted from embedding shape + "completion_tokens": 15, + "total_tokens": 25 + } +} +``` + +## Errors + +| Error | Fix | +|-------|-----| +| `ValueError: You must set --enable-prompt-embeds` | Add `--enable-prompt-embeds` to worker | +| `prompt_embeds must be valid base64` | Use `.decode('utf-8')` after `base64.b64encode()` | +| `decoded data must be at least 100 bytes` | Increase sequence length | +| `exceeds maximum size of 10MB` | Reduce sequence length | +| `must be a torch.Tensor` | Use `torch.save()` not NumPy | +| `size of tensor must match` | Use correct hidden dimension for model | + +## Examples + +### Streaming + +```python +stream = client.completions.create( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + prompt="", + max_tokens=100, + stream=True, + extra_body={"prompt_embeds": embeddings_base64} +) + +for chunk in stream: + if chunk.choices: + print(chunk.choices[0].text, end="", flush=True) +``` + +### Load from File + +```python +embeddings = torch.load("embeddings.pt") + +buffer = io.BytesIO() +torch.save(embeddings, buffer) +buffer.seek(0) +embeddings_base64 = base64.b64encode(buffer.read()).decode() + +# Use in request... 
+``` + + +## Limitations + +- ❌ Requires `--enable-prompt-embeds` flag (disabled by default) +- ❌ PyTorch format only (NumPy not supported) +- ❌ 10MB decoded size limit +- ❌ Cannot mix with multimodal data (images/video) + +## Testing + +Comprehensive test coverage ensures reliability: + +- **Unit Tests:** 31 tests (11 Rust + 20 Python) + - Validation, decoding, format handling, error cases, usage statistics +- **Integration Tests:** 21 end-to-end tests + - Core functionality, performance, formats, concurrency, usage statistics + +Run integration tests: +```bash +# Start worker with flag +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enable-prompt-embeds + +# Run tests +pytest tests/integration/test_prompt_embeds_integration.py -v +``` + +## See Also + +- [vLLM Backend](README.md) +- [vLLM Configuration](README.md#configuration) diff --git a/fern/pages/backends/vllm/speculative-decoding.md b/fern/pages/backends/vllm/speculative-decoding.md new file mode 100644 index 00000000000..06ae5f5a90a --- /dev/null +++ b/fern/pages/backends/vllm/speculative-decoding.md @@ -0,0 +1,109 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Running **Meta-Llama-3.1-8B-Instruct** with Speculative Decoding (Eagle3)" +--- + +This guide walks through how to deploy **Meta-Llama-3.1-8B-Instruct** using **aggregated speculative decoding** with **Eagle3** on a single node. +Since the model is only **8B parameters**, you can run it on **any GPU with at least 16GB VRAM**. + + + +## Step 1: Set Up Your Docker Environment + +First, we’ll initialize a Docker container using the VLLM backend. +You can refer to the [VLLM Quickstart Guide](README.md#vllm-quick-start) — or follow the full steps below. + +### 1. Launch Docker Compose + +```bash +docker compose -f deploy/docker-compose.yml up -d +``` + +### 2. 
Build the Container + +```bash +./container/build.sh --framework VLLM +``` + +### 3. Run the Container + +```bash +./container/run.sh -it --framework VLLM --mount-workspace +``` + + + +## Step 2: Get Access to the Llama-3 Model + +The **Meta-Llama-3.1-8B-Instruct** model is gated, so you’ll need to request access on Hugging Face. +Go to the official [Meta-Llama-3.1-8B-Instruct repository](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and fill out the access form. +Approval usually takes around **5 minutes**. + +Once you have access, generate a **Hugging Face access token** with permission for gated repositories, then set it inside your container: + +```bash +export HUGGING_FACE_HUB_TOKEN="insert_your_token_here" +export HF_TOKEN=$HUGGING_FACE_HUB_TOKEN +``` + + + +## Step 3: Run Aggregated Speculative Decoding + +Now that your environment is ready, start the aggregated server with **speculative decoding**. + +```bash +# Requires only one GPU +cd examples/backends/vllm +bash launch/agg_spec_decoding.sh +``` + +Once the weights finish downloading and serving begins, you’ll be ready to send inference requests to your model. + + + + +## Step 4: Example Request + +To verify your setup, try sending a simple prompt to your model: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "messages": [ + {"role": "user", "content": "Write a poem about why Sakura trees are beautiful."} + ], + "max_tokens": 250 + }' +``` + +### Example Output + +```json +{ + "id": "cmpl-3e87ea5c-010e-4dd2-bcc4-3298ebd845a8", + "choices": [ + { + "text": "In cherry blossom’s gentle breeze ... 
A delicate balance of life and death, as petals fade, and new life breathes.", + "index": 0, + "finish_reason": "stop" + } + ], + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "usage": { + "prompt_tokens": 16, + "completion_tokens": 250, + "total_tokens": 266 + } +} +``` + + + +## Additional Resources + +* [VLLM Quickstart](README.md#vllm-quick-start) +* [Meta-Llama-3.1-8B-Instruct on Hugging Face](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) \ No newline at end of file diff --git a/fern/pages/benchmarks/benchmarking.md b/fern/pages/benchmarks/benchmarking.md new file mode 100644 index 00000000000..ef9d2478696 --- /dev/null +++ b/fern/pages/benchmarks/benchmarking.md @@ -0,0 +1,530 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Benchmarking Guide" +--- + +This benchmarking framework lets you compare performance across any combination of: +- **DynamoGraphDeployments** +- **External HTTP endpoints** (existing services deployed following standard documentation from vLLM, llm-d, AIBrix, etc.) + +## Choosing Your Benchmarking Approach + +Dynamo provides two benchmarking approaches to suit different use cases: **client-side** and **server-side**. Client-side refers to running benchmarks on your local machine and connecting to Kubernetes deployments via port-forwarding, while server-side refers to running benchmarks directly within the Kubernetes cluster using internal service URLs. Which method to use depends on your use case. + +**TLDR:** +Need high performance/load testing? Server-side. +Just quick testing/comparison? Client-side. 
+ +### Use Client-Side Benchmarking When: +- You want to quickly test deployments +- You want immediate access to results on your local machine +- You're comparing external services or deployments (not necessarily just Dynamo deployments) +- You need to run benchmarks from your laptop/workstation + +→ **[Go to Client-Side Benchmarking (Local)](#client-side-benchmarking-local)** + +### Use Server-Side Benchmarking When: +- You have a development environment with kubectl access +- You're doing performance validation with high load/speed requirements +- You're experiencing timeouts or performance issues with client-side benchmarking +- You want optimal network performance (no port-forwarding overhead) +- You're running automated CI/CD pipelines +- You need isolated execution environments +- You're doing resource-intensive benchmarking +- You want persistent result storage in the cluster + +→ **[Go to Server-Side Benchmarking (In-Cluster)](#server-side-benchmarking-in-cluster)** + +### Quick Comparison + +| Feature | Client-Side | Server-Side | +|---------|-------------|-------------| +| **Location** | Your local machine | Kubernetes cluster | +| **Network** | Port-forwarding required | Direct service DNS | +| **Setup** | Quick and simple | Requires cluster resources | +| **Performance** | Limited by local resources, may timeout under high load | Optimal cluster performance, handles high load | +| **Isolation** | Shared environment | Isolated job execution | +| **Results** | Local filesystem | Persistent volumes | +| **Best for** | Light load | High load | + +## What This Tool Does + +The framework is a Python-based wrapper around `aiperf` that: +- Benchmarks any HTTP endpoints +- Runs concurrency sweeps across configurable load levels +- Generates comparison plots with your custom labels +- Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.) 
+- Provides direct Python script execution for maximum flexibility + +**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`) + +**Important**: The `--model` parameter configures AIPerf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s). + +--- + +## Client-Side Benchmarking (Local) + +Client-side benchmarking runs on your local machine and connects to Kubernetes deployments via port-forwarding. + +## Prerequisites + +1. **Dynamo container environment** - You must be running inside a Dynamo container with the benchmarking tools pre-installed. + +2. **HTTP endpoints** - Ensure you have HTTP endpoints available for benchmarking. These can be: + - DynamoGraphDeployments exposed via HTTP endpoints + - External services (vLLM, llm-d, AIBrix, etc.) + - Any HTTP endpoint serving HuggingFace-compatible models + +3. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using: + ```bash + pip install -r deploy/utils/requirements.txt + ``` + +## User Workflow + +Follow these steps to benchmark Dynamo deployments using client-side benchmarking: + +### Step 1: Establish Kubernetes Cluster and Install Dynamo +Set up your Kubernetes cluster with NVIDIA GPUs and install the Dynamo Kubernetes Platform. First follow the [installation guide](../kubernetes/installation-guide.md) to install Dynamo Kubernetes Platform, then use [deploy/utils/README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/utils/README.md) to set up benchmarking resources. + +### Step 2: Deploy DynamoGraphDeployments +Deploy your DynamoGraphDeployments separately using the [deployment documentation](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/). Each deployment should have a frontend service exposed. 
+
+### Step 3: Port-Forward and Benchmark Deployment A
+```bash
+# Port-forward the frontend service for deployment A
+kubectl port-forward -n <namespace> svc/<frontend-service> 8000:8000 > /dev/null 2>&1 &
+# Note: remember to stop the port-forward process after benchmarking.
+
+# Benchmark deployment A using Python scripts
+python3 -m benchmarks.utils.benchmark \
+  --benchmark-name deployment-a \
+  --endpoint-url http://localhost:8000 \
+  --model "your-model-name" \
+  --output-dir ./benchmarks/results
+```
+
+### Step 4: [If Comparative] Teardown Deployment A and Establish Deployment B
+If comparing multiple deployments, teardown deployment A and deploy deployment B with a different configuration.
+
+### Step 5: [If Comparative] Port-Forward and Benchmark Deployment B
+```bash
+# Port-forward the frontend service for deployment B
+kubectl port-forward -n <namespace> svc/<frontend-service> 8001:8000 > /dev/null 2>&1 &
+
+# Benchmark deployment B using Python scripts
+python3 -m benchmarks.utils.benchmark \
+  --benchmark-name deployment-b \
+  --endpoint-url http://localhost:8001 \
+  --model "your-model-name" \
+  --output-dir ./benchmarks/results
+```
+
+### Step 6: Generate Summary and Visualization
+```bash
+# Generate plots and summary using Python plotting script
+python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results
+
+# Or plot only specific benchmark experiments
+python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results --benchmark-name experiment-a --benchmark-name experiment-b
+```
+
+## Use Cases
+
+The benchmarking framework supports various comparative analysis scenarios:
+
+- **Compare multiple DynamoGraphDeployments of a single backend** (e.g., aggregated vs disaggregated configurations)
+- **Compare different backends** (e.g., vLLM vs TensorRT-LLM vs SGLang)
+- **Compare Dynamo vs other platforms** (e.g., Dynamo vs llm-d vs AIBrix)
+- **Compare different models** (e.g., Llama-3-8B vs Llama-3-70B vs Qwen-3-0.6B)
+- **Compare different hardware configurations** (e.g., H100 vs A100 vs H200)
+- 
**Compare different parallelization strategies** (e.g., different GPU counts or memory configurations)
+
+## Configuration and Usage
+
+### Command Line Options
+
+```bash
+python3 -m benchmarks.utils.benchmark --benchmark-name <name> --endpoint-url <url> [OPTIONS]
+
+REQUIRED:
+  --benchmark-name NAME    Name/label for this benchmark (used in plots and results)
+  --endpoint-url URL       HTTP endpoint URL to benchmark (e.g., http://localhost:8000)
+
+OPTIONS:
+  -h, --help               Show help message and examples
+  -m, --model MODEL        Model name for AIPerf configuration and logging (default: Qwen/Qwen3-0.6B)
+                           NOTE: This must match the model deployed at the endpoint
+  -i, --isl LENGTH         Input sequence length (default: 2000)
+  -s, --std STDDEV         Input sequence standard deviation (default: 10)
+  -o, --osl LENGTH         Output sequence length (default: 256)
+  -d, --output-dir DIR     Output directory (default: ./benchmarks/results)
+  --verbose                Enable verbose output
+```
+
+### Important Notes
+
+- **Benchmark Name**: The benchmark name becomes the label in plots and results
+- **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved.
+- **Port-Forwarding**: You must have an exposed endpoint before benchmarking
+- **Model Parameter**: The `--model` parameter configures AIPerf for testing and logging, and must match the model deployed at the endpoint
+- **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately
+
+### What Happens During Benchmarking
+
+The Python benchmarking module:
+1. **Connects** to your port-forwarded endpoint
+2. **Benchmarks** using AIPerf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
+3. **Measures** key metrics: latency, throughput, time-to-first-token
+4. **Saves** results to an output directory organized by benchmark name
+
+The Python plotting module:
+1. **Generates** comparison plots using your benchmark name in `<output-dir>/plots/`
+2. 
**Creates** summary statistics and visualizations + +### Plotting Options + +The plotting script supports several options for customizing which experiments to visualize: + +```bash +# Plot all benchmark experiments in the data directory +python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results + +# Plot only specific benchmark experiments +python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results --benchmark-name experiment-a --benchmark-name experiment-b + +# Specify custom output directory for plots +python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results --output-dir ./custom-plots +``` + +**Available Options:** +- `--data-dir`: Directory containing benchmark results (required) +- `--benchmark-name`: Specific benchmark experiment name to plot (can be specified multiple times). Names must match subdirectory names under the data dir. +- `--output-dir`: Custom output directory for plots (defaults to data-dir/plots) + +**Note**: If `--benchmark-name` is not specified, the script will plot all subdirectories found in the data directory. + +### Using Your Own Models and Configuration + +The benchmarking framework supports any HuggingFace-compatible LLM model. Specify your model in the benchmark script's `--model` parameter. It must match the model name of the deployment. You can override the default sequence lengths (2000/256 tokens) with `--isl` and `--osl` flags if needed for your specific workload. + +The benchmarking framework is built around Python modules that provide direct control over the benchmark workflow. The Python benchmarking module connects to your existing endpoints, runs the benchmarks, and can generate plots. Deployment is user-managed and out of scope for this tool. + +### Comparison Limitations + +The plotting system supports up to 12 different benchmarks in a single comparison. 
+ +### Concurrency Configuration + +You can customize the concurrency levels using the CONCURRENCIES environment variable: + +```bash +# Custom concurrency levels +CONCURRENCIES="1,5,20,50" python3 -m benchmarks.utils.benchmark \ + --benchmark-name my-test \ + --endpoint-url http://localhost:8000 + +# Or set permanently +export CONCURRENCIES="1,2,5,10,25,50,100" +python3 -m benchmarks.utils.benchmark \ + --benchmark-name test \ + --endpoint-url http://localhost:8000 +``` + +## Understanding Your Results + +After benchmarking completes, check `./benchmarks/results/` (or your custom output directory): + +### Plot Labels and Organization + +The plotting script uses the `--benchmark-name` as the experiment name in all generated plots. For example: +- `--benchmark-name aggregated` → plots will show "aggregated" as the label +- `--benchmark-name vllm-disagg` → plots will show "vllm-disagg" as the label + +This allows you to easily identify and compare different configurations in the visualization plots. 
+ +### Summary and Plots + +```text +benchmarks/results/plots +├── SUMMARY.txt # Quick overview of all results +├── p50_inter_token_latency_vs_concurrency.png # Token generation speed +├── avg_time_to_first_token_vs_concurrency.png # Response time +├── request_throughput_vs_concurrency.png # Requests per second +├── efficiency_tok_s_gpu_vs_user.png # GPU efficiency +└── avg_inter_token_latency_vs_concurrency.png # Average latency +``` + +### Data Files + +Raw data is organized by deployment/benchmark type and concurrency level: + +**For Any Benchmarking (uses your custom benchmark name):** +```text +results/ # Client-side: ./benchmarks/results/ or custom dir +├── plots/ # Server-side: /data/results/ +│ ├── SUMMARY.txt # Performance visualization plots +│ ├── p50_inter_token_latency_vs_concurrency.png +│ ├── avg_inter_token_latency_vs_concurrency.png +│ ├── request_throughput_vs_concurrency.png +│ ├── efficiency_tok_s_gpu_vs_user.png +│ └── avg_time_to_first_token_vs_concurrency.png +├── / # Results for your benchmark (uses your custom name) +│ ├── c1/ # Concurrency level 1 +│ │ └── profile_export_aiperf.json +│ ├── c2/ # Concurrency level 2 +│ ├── c5/ # Concurrency level 5 +│ └── ... 
# Other concurrency levels (10, 50, 100, 250) +└── / # Results for additional benchmarking runs + └── c*/ # Same structure as above +``` + +**Example with actual benchmark names:** +```text +results/ +├── plots/ +├── experiment-a/ # --benchmark-name experiment-a +├── experiment-b/ # --benchmark-name experiment-b +└── experiment-c/ # --benchmark-name experiment-c +``` + +Each concurrency directory contains: +- **`profile_export_aiperf.json`** - Structured metrics from AIPerf +- **`profile_export_aiperf.csv`** - CSV format metrics from AIPerf +- **`profile_export.json`** - Raw AIPerf results +- **`inputs.json`** - Generated test inputs + +--- + +## Server-Side Benchmarking (In-Cluster) + +Server-side benchmarking runs directly within the Kubernetes cluster, eliminating the need for port forwarding and providing better resource utilization. + +## What Server-Side Benchmarking Does + +The server-side benchmarking solution: +- Runs benchmarks directly within the Kubernetes cluster using internal service URLs +- Uses Kubernetes service DNS for direct communication (no port forwarding required) +- Leverages the existing benchmarking infrastructure (`benchmarks.utils.benchmark`) +- Stores results persistently using `dynamo-pvc` +- Provides isolated execution environment with configurable resources +- Handles high load/speed requirements without timeout issues +- **Note**: Each benchmark job runs within a single Kubernetes namespace, but can benchmark services across multiple namespaces using the full DNS format `svc_name.namespace.svc.cluster.local` + +## Prerequisites + +1. **Kubernetes cluster** with NVIDIA GPUs and Dynamo namespace setup (see [Dynamo Kubernetes Platform docs](../kubernetes/README.md)) +2. **Storage** PersistentVolumeClaim configured with appropriate permissions (see [deploy/utils README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/utils/README.md)) +3. 
**Docker image** containing the Dynamo benchmarking tools + +## Quick Start + +### Step 1: Deploy Your DynamoGraphDeployment +Deploy your DynamoGraphDeployment using the [deployment documentation](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/). Ensure it has a frontend service exposed. + +### Step 2: Deploy and Run Benchmark Job + +**Note**: The server-side benchmarking job requires a Docker image containing the Dynamo benchmarking tools. Before the 0.5.1 release, you must build your own Docker image using the [container build instructions](https://github.com/ai-dynamo/dynamo/tree/main/container/README.md), push it to your container registry, then update the `image` field in `benchmarks/incluster/benchmark_job.yaml` to use your built image tag. + +```bash +export NAMESPACE=benchmarking + +# Deploy the benchmark job with default settings +kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE + +# Monitor the job, wait for it to complete +kubectl logs -f job/dynamo-benchmark -n $NAMESPACE +``` + +#### Customize the job configuration + +To customize the benchmark parameters, edit the `benchmarks/incluster/benchmark_job.yaml` file and modify: + +- **Model name**: Change `"Qwen/Qwen3-0.6B"` in the args section +- **Benchmark name**: Change `"qwen3-0p6b-vllm-agg"` to your desired benchmark name +- **Service URL**: Change `"vllm-agg-frontend:8000"` so the service URL matches your deployed service +- **Docker image**: Change the image field if needed + +Then deploy: +```bash +kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE +``` + +### Step 3: Retrieve Results +```bash +# Create access pod (skip this step if access pod is already running) +kubectl apply -f deploy/utils/manifests/pvc-access-pod.yaml -n $NAMESPACE +kubectl wait --for=condition=Ready pod/pvc-access-pod -n $NAMESPACE --timeout=60s + +# Download the results +kubectl cp $NAMESPACE/pvc-access-pod:/data/results/ ./benchmarks/results/ + +# Cleanup +kubectl 
delete pod pvc-access-pod -n $NAMESPACE +``` + +### Step 4: Generate Plots +```bash +# Generate performance plots from the downloaded results +python3 -m benchmarks.utils.plot \ + --data-dir ./benchmarks/results +``` + +This will create visualization plots. For more details on interpreting these plots, see the [Summary and Plots](#summary-and-plots) section above. + +## Cross-Namespace Service Access + +Server-side benchmarking can benchmark services across multiple namespaces from a single job using Kubernetes DNS. When referencing services in other namespaces, use the full DNS format: + +```bash +# Access service in same namespace +SERVICE_URL=vllm-agg-frontend:8000 + +# Access service in different namespace +SERVICE_URL=vllm-agg-frontend.production.svc.cluster.local:8000 +``` + +**DNS Format**: `..svc.cluster.local:port` + +This allows you to: +- Benchmark multiple services across different namespaces in a single job +- Compare services running in different environments (dev, staging, production) +- Test cross-namespace integrations without port-forwarding +- Run comprehensive cross-namespace performance comparisons + +## Configuration + +The benchmark job is configured directly in the YAML file. + +### Default Configuration + +- **Model**: `Qwen/Qwen3-0.6B` +- **Benchmark Name**: `qwen3-0p6b-vllm-agg` +- **Service**: `vllm-agg-frontend:8000` +- **Docker Image**: `nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag` + +### Customizing the Job + +To customize the benchmark, edit `benchmarks/incluster/benchmark_job.yaml`: + +1. **Change the model**: Update the `--model` argument +2. **Change the benchmark name**: Update the `--benchmark-name` argument +3. **Change the service URL**: Update the `--endpoint-url` argument (use `..svc.cluster.local:port` for cross-namespace access) +4. 
**Change Docker image**: Update the image field if needed
+
+### Example: Multi-Namespace Benchmarking
+
+To benchmark services across multiple namespaces, you would need to run separate benchmark jobs for each service since the format supports one benchmark per job. However, the results are stored in the same PVC and may be accessed together.
+
+```yaml
+# Job 1: Production service
+args:
+  - --model
+  - "Qwen/Qwen3-0.6B"
+  - --benchmark-name
+  - "prod-vllm"
+  - --endpoint-url
+  - "vllm-agg-frontend.production.svc.cluster.local:8000"
+  - --output-dir
+  - /data/results
+
+# Job 2: Staging service
+args:
+  - --model
+  - "Qwen/Qwen3-0.6B"
+  - --benchmark-name
+  - "staging-vllm"
+  - --endpoint-url
+  - "vllm-agg-frontend.staging.svc.cluster.local:8000"
+  - --output-dir
+  - /data/results
+```
+
+## Understanding Your Results
+
+Results are stored in `/data/results` and follow the same structure as client-side benchmarking:
+
+```text
+/data/results/
+└── <benchmark-name>/          # Results for your benchmark name
+    ├── c1/                    # Concurrency level 1
+    │   └── profile_export_aiperf.json
+    ├── c2/                    # Concurrency level 2
+    └── ...                    # Other concurrency levels
+```
+
+## Monitoring and Debugging
+
+### Check Job Status
+```bash
+kubectl describe job dynamo-benchmark -n $NAMESPACE
+```
+
+### View Logs
+```bash
+# Follow logs in real-time
+kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
+```
+
+### Debug Failed Jobs
+```bash
+# Check pod status
+kubectl get pods -n $NAMESPACE -l job-name=dynamo-benchmark
+
+# Describe failed pod
+kubectl describe pod <pod-name> -n $NAMESPACE
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Service not found**: Ensure your DynamoGraphDeployment frontend service is running
+2. **PVC access**: Check that `dynamo-pvc` is properly configured and accessible
+3. **Image pull issues**: Ensure the Docker image is accessible from the cluster
+4. 
**Resource constraints**: Adjust resource limits if the job is being evicted + +### Debug Commands + +```bash +# Check PVC status +kubectl get pvc dynamo-pvc -n $NAMESPACE + +# Check service endpoints +kubectl get svc -n $NAMESPACE + +# Verify your service exists and has endpoints +SVC_NAME="${SERVICE_URL%%:*}" +kubectl get svc "$SVC_NAME" -n "$NAMESPACE" +kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE" +``` + +--- + +## Customize Benchmarking Behavior + +The built-in Python workflow connects to endpoints, benchmarks with aiperf, and generates plots. If you want to modify the behavior: + +1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection + +2. **Generate different plots**: Modify `benchmarks/utils/plot.py` to generate a different set of plots for whatever you wish to visualize. + +3. **Direct module usage**: Use individual Python modules (`benchmarks.utils.benchmark`, `benchmarks.utils.plot`) for granular control over each step of the benchmarking process. + +The Python benchmarking module provides a complete end-to-end benchmarking experience with full control over the workflow. + +--- + +## Testing with Mocker Backend + +For development and testing purposes, Dynamo provides a [mocker backend](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/mocker/) that simulates LLM inference without requiring actual GPU resources. This is useful for: + +- **Testing deployments** without expensive GPU infrastructure +- **Developing and debugging** router, planner, or frontend logic +- **CI/CD pipelines** that need to validate infrastructure without model execution +- **Benchmarking framework validation** to ensure your setup works before using real backends + +The mocker backend mimics the API and behavior of real backends (vLLM, SGLang, TensorRT-LLM) but generates mock responses instead of running actual inference. 
+ +See the [mocker directory](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/mocker/) for usage examples and configuration options. diff --git a/fern/pages/benchmarks/kv-router-ab-testing.md b/fern/pages/benchmarks/kv-router-ab-testing.md new file mode 100644 index 00000000000..28cd4db92b7 --- /dev/null +++ b/fern/pages/benchmarks/kv-router-ab-testing.md @@ -0,0 +1,801 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo KV Smart Router A/B Benchmarking Guide" +--- + +This guide walks you through setting up and running A/B benchmarks to compare Dynamo's KV Smart Router against standard round-robin routing on a Kubernetes cluster. + +## Overview +Dynamo's KV Smart Router intelligently routes requests based on KV cache affinity, improving performance for workloads with shared prompt prefixes. This guide helps you: + +1. Deploy two identical Dynamo configurations: + a. A vllm server for Qwen3-32B with 8 workers (aggregated) **WITHOUT** KV Smart Router enabled + b. A vllm server for Qwen3-32B with 8 workers (aggregated) **WITH** KV Smart Router enabled +2. Run controlled benchmarks using AIPerf +3. 
Compare performance metrics to evaluate KV router effectiveness + +**Prerequisites:** Kubernetes cluster with GPUs, kubectl, helm + +--- + +## Prerequisites + +### Required Tools + +- `kubectl` (configured with cluster access) +- `helm` (v3+) +- HuggingFace account and token (if model downloads are gated) +- Kubernetes cluster with: + - GPU nodes (H100, H200, or similar) + - Sufficient GPU capacity (16+ GPUs recommended for this example) + - Dynamo platform installed globally OR ability to install per-namespace + +### Knowledge Requirements + +- Basic Kubernetes concepts (namespaces, pods, services) +- Familiarity with LLM inference concepts +- Command-line proficiency + +--- + +## Architecture + +This guide sets up two parallel deployments, as well as a benchmarking pod that can test each deployment: + +```text +┌─────────────────────────────────────┐ +│ Deployment A: Router OFF │ +│ Namespace: router-off-test │ +│ ├─ Frontend (Standard Routing) │ +│ └─ 8x Decode Workers (1 GPU each) │ +└─────────────────────────────────────┘ + +┌─────────────────────────────────────┐ +│ Deployment B: Router ON │ +│ Namespace: router-on-test │ +│ ├─ Frontend (KV Smart Router) │ +│ └─ 8x Decode Workers (1 GPU each) │ +└─────────────────────────────────────┘ + +┌─────────────────────────────────────┐ +│ Benchmark Pod │ +│ Namespace: benchmark │ +│ └─ AIPerf + Dataset │ +└─────────────────────────────────────┘ +``` + +**Key Difference:** Deployment B sets `DYN_ROUTER_MODE=kv` on the frontend to enable KV cache-aware routing. 
+ +--- + +## Phase 1: Namespace and Infrastructure Setup + +### Step 1.1: Create Namespaces + +```bash +# Create namespaces for both deployments +kubectl create namespace router-off-test +kubectl create namespace router-on-test +kubectl create namespace benchmark +``` + +### Step 1.2: Create HuggingFace Token Secret (optional) + +If the model you're seeking to deploy requires HF token to download (Llama family models require this), replace `YOUR_HF_TOKEN` with your actual HuggingFace token: + +```bash +# Router-OFF namespace +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="YOUR_HF_TOKEN" \ + -n router-off-test + +# Router-ON namespace +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="YOUR_HF_TOKEN" \ + -n router-on-test +``` + +### Step 1.3: Install Dynamo Platform (Per-Namespace) + +If your cluster uses namespace-restricted Dynamo operators, you'll need to install the Dynamo platform in each namespace. Follow the [Dynamo Kubernetes Installation Guide](https://github.com/ai-dynamo/dynamo/blob/main/docs/kubernetes/installation_guide.md) to install the platform in both namespaces: + +- `router-off-test` +- `router-on-test` + +**Key Configuration Notes:** +- If your cluster uses namespace restrictions, ensure `dynamo-operator.namespaceRestriction.enabled=true` is set during installation +- Adjust version tags to match your cluster's available Dynamo versions +- If you encounter operator compatibility issues (e.g., unsupported MPI arguments), consult your cluster administrator or the Dynamo troubleshooting documentation + +### Step 1.4: Verify Infrastructure + +Wait for operators and infrastructure to be ready: + +```bash +# Check router-off-test +kubectl get pods -n router-off-test + +# Check router-on-test +kubectl get pods -n router-on-test +``` + +You should see: +- `dynamo-platform-dynamo-operator-controller-manager` (2/2 Running) +- `dynamo-platform-etcd-0` (1/1 Running) +- `dynamo-platform-nats-0` (2/2 
Running) + +--- + +## Phase 2: Deploy Model Serving + +### Step 2.1: Create Deployment YAMLs + +Create `router-off-deployment.yaml`: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: vllm-agg-no-router +spec: + services: + Frontend: + dynamoNamespace: vllm-agg-no-router + componentType: frontend + replicas: 1 + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0 + VllmDecodeWorker: + envFromSecret: hf-token-secret + dynamoNamespace: vllm-agg-no-router + componentType: worker + replicas: 8 + resources: + limits: + gpu: "1" + extraPodSpec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - gpu-h200-sxm # Adjust to your GPU node type + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0 + workingDir: /workspace/examples/backends/vllm + command: + - /bin/sh + - -c + args: + - python3 -m dynamo.vllm --model Qwen/Qwen3-32B --quantization fp8 + startupProbe: + httpGet: + path: /health + port: 9090 + initialDelaySeconds: 120 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 60 # 32 minutes total (120s + 60*30s) + livenessProbe: + httpGet: + path: /live + port: 9090 + initialDelaySeconds: 300 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /live + port: 9090 + initialDelaySeconds: 300 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 10 +``` + +Create `router-on-deployment.yaml`: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: vllm-agg-router +spec: + services: + Frontend: + dynamoNamespace: vllm-agg-router + componentType: frontend + replicas: 1 + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0 + envs: + - name: DYN_ROUTER_MODE + value: kv # KEY DIFFERENCE: Enable KV Smart Router + 
VllmDecodeWorker: + envFromSecret: hf-token-secret + dynamoNamespace: vllm-agg-router + componentType: worker + replicas: 8 + resources: + limits: + gpu: "1" + extraPodSpec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - gpu-h200-sxm # Adjust to your GPU node type + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0 + workingDir: /workspace/examples/backends/vllm + command: + - /bin/sh + - -c + args: + - python3 -m dynamo.vllm --model Qwen/Qwen3-32B --quantization fp8 + startupProbe: + httpGet: + path: /health + port: 9090 + initialDelaySeconds: 120 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 60 # 32 minutes total (120s + 60*30s) + livenessProbe: + httpGet: + path: /live + port: 9090 + initialDelaySeconds: 300 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /live + port: 9090 + initialDelaySeconds: 300 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 10 +``` + +### Step 2.2: Deploy Both Configurations + +```bash +# Deploy router-OFF +kubectl apply -f router-off-deployment.yaml -n router-off-test + +# Deploy router-ON +kubectl apply -f router-on-deployment.yaml -n router-on-test +``` + +**💡 Optimization Tip:** Each worker will download the model independently (~20 minutes per pod). For faster initialization, add a shared PVC with `ReadWriteMany` access mode to cache the model. 
+ +First, create the PVC separately: + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-cache +spec: + accessModes: + - ReadWriteMany + storageClassName: "your-shared-storage-class" # e.g., nfs, efs, nebius-shared-fs + resources: + requests: + storage: 100Gi +``` + +Then reference it in your DynamoGraphDeployment: + +```yaml +spec: + pvcs: + - create: false + name: model-cache + size: "0" + services: + VllmDecodeWorker: + volumeMounts: + - mountPoint: /root/.cache/huggingface + name: model-cache + useAsCompilationCache: false +``` + +With this configuration, only the first worker downloads the model; others use the cached version, reducing startup time from 20+ minutes to ~2 minutes per pod. + +### Step 2.3: Monitor Deployment Progress + +```bash +# Watch router-OFF pods +kubectl get pods -n router-off-test -w + +# Watch router-ON pods +kubectl get pods -n router-on-test -w +``` + +Wait for all pods to reach `Running` status and pass readiness probes. + +**Expected Timeline:** +- **With shared PVC** (ReadWriteMany): ~5-10 minutes total (first worker downloads, others reuse cache) +- **Without shared PVC**: 20-30 minutes per worker (workers download independently) + - For 8 workers: Budget **1-2 hours** for full deployment (workers start in parallel but are limited by node scheduling) + +The startup probe allows 32 minutes per pod (failureThreshold: 60), which accommodates model download and initialization. + +### Step 2.4: Verify All Workers Are Healthy + +> ⚠️ **CRITICAL CHECKPOINT**: Before running benchmarks, you **MUST** verify equal worker health in both deployments. Unequal worker counts will invalidate your comparison results. 
+
+```bash
+# Quick health check - both should show "8/8"
+echo "Router OFF: $(kubectl get pods -n router-off-test -l nvidia.com/dynamo-component-type=worker --field-selector=status.phase=Running -o json | jq '[.items[] | select(.status.conditions[] | select(.type=="Ready" and .status=="True"))] | length')/8 ready"
+echo "Router ON: $(kubectl get pods -n router-on-test -l nvidia.com/dynamo-component-type=worker --field-selector=status.phase=Running -o json | jq '[.items[] | select(.status.conditions[] | select(.type=="Ready" and .status=="True"))] | length')/8 ready"
+
+# Detailed view
+kubectl get pods -n router-off-test -l nvidia.com/dynamo-component-type=worker
+kubectl get pods -n router-on-test -l nvidia.com/dynamo-component-type=worker
+```
+
+**Both must show 8/8 workers in Ready state (1/1 Running).** If workers are not ready:
+- Check logs: `kubectl logs <pod-name> -n <namespace>`
+- Common issues: model download in progress, startup probe timeout, insufficient GPU resources
+
+**Do not proceed with benchmarks until all 16 workers (8 per deployment) are healthy.**
+
+---
+
+## Phase 3: Prepare Benchmark Dataset
+
+### Understanding the Mooncake Trace Dataset
+
+For this A/B comparison, we use the **Mooncake Trace Dataset**, published by [Mooncake AI](https://github.com/kvcache-ai/Mooncake). This is a privacy-preserving dataset of real-world LLM inference traffic from production arxiv workloads. 
+ +**What's in the dataset?** Each trace entry contains: +- **Timestamp:** When the request arrived (for realistic request timing) +- **Input/output lengths:** Number of tokens in prompts and responses +- **Block hash IDs:** Cryptographic hashes representing KV cache blocks (explained below) + +**Sample trace entry:** +```json +{ + "timestamp": 27482, + "input_length": 6955, + "output_length": 52, + "hash_ids": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 2353, 2354] +} +``` + +### Why Mooncake Traces Matter for KV Cache Benchmarking + +**The Challenge:** Traditional LLM benchmarks use synthetic or random data, which are often insufficient to capture real-world optimizations like KV Smart Router. To properly evaluate this feature, we need realistic traffic patterns with **prefix repetition** - but this creates a privacy problem: how do we measure realistic KV cache hit patterns without exposing actual user conversations? + +**Mooncake's Solution: Privacy-Preserving Block Hashes** + +Instead of storing actual prompt text, the Mooncake dataset uses cryptographic hashes to represent KV cache blocks. Each hash ID represents a **512-token block**, and the hash includes both the current block and all preceding blocks. This preserves the **pattern of prefix reuse** while completely protecting user privacy. + +### How it works - Multi-turn conversation example + +```text +Turn 1 (initial request - long document analysis): + Input: ~8,000 tokens (e.g., research paper + question) + Hash IDs: [46][47][48][49][50][51][52][53][54][55][56][57][58][59][60][61] + └─ 16 blocks × 512 tokens/block = ~8,192 tokens + +Turn 2 (follow-up question on same document): + Input: Same document + new question (~8,500 tokens) + Hash IDs: [46][47][48][49][50][51][52][53][54][55][56][57][58][59][60][61][62] + └──────────── Reuses first 16 blocks (~8,192 tokens) ───────────────┘ + + ✅ Cache hit: First 8,192 tokens don't need recomputation! 
+ +Turn 3 (another follow-up): + Input: Same document + different question (~9,000 tokens) + Hash IDs: [46][47][48][49][50][51][52][53][54][55][56][57][58][59][60][61][62][63] + └──────────── Reuses first 16 blocks (~8,192 tokens) ───────────────┘ +``` + +When requests share the same hash IDs (e.g., blocks 46-61), it means they share those 512-token blocks - indicating **significant prefix overlap** (in this case, 8,192 tokens). The **KV Smart Router** routes requests with matching hash IDs to the same worker, maximizing cache hits and avoiding redundant computation for those shared prefix tokens. + +**Key Dataset Properties:** +- ✅ **Realistic timing:** Request arrival patterns from production workloads +- ✅ **Real prefix patterns:** Up to 50% cache hit ratio ([Mooncake technical report](https://github.com/kvcache-ai/Mooncake)) +- ✅ **Privacy-preserving:** No actual text - only hash-based cache block identifiers +- ✅ **Reproducible:** Public dataset enables fair comparisons across different systems + +**Why this matters:** With random synthetic data, the KV Smart Router would show no benefit because there's no prefix reuse to exploit. Mooncake traces provide realistic workload patterns that demonstrate the router's real-world performance gains while respecting user privacy. 
+ +--- + +### Download and Prepare the Dataset + +```bash +# Download the Mooncake arxiv trace dataset +curl -sL https://raw.githubusercontent.com/kvcache-ai/Mooncake/refs/heads/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl -o mooncake_trace.jsonl + +# Trim to 1000 requests for faster benchmarking +head -n 1000 mooncake_trace.jsonl > mooncake_trace_small.jsonl + +# Speed up timestamps 4x (reduces benchmark time from ~12 min to ~3 min) +python3 - <<'PY' +import json + +with open("mooncake_trace_small.jsonl") as src, open("mooncake_trace_4x.jsonl", "w") as dst: + for line in src: + rec = json.loads(line) + rec["timestamp"] = int(rec["timestamp"] / 4) + dst.write(json.dumps(rec) + "\n") +PY + +echo "Dataset ready: mooncake_trace_4x.jsonl (1000 requests, 4x speed)" +``` + +--- + +## Phase 4: Set Up Benchmark Environment + +### Step 4.1: Deploy Benchmark Pod + +Create `benchmark-job.yaml`: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: aiperf-benchmark + namespace: benchmark +spec: + backoffLimit: 1 + template: + spec: + restartPolicy: Never + containers: + - name: benchmark + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0 + command: ["/bin/sh", "-c", "sleep infinity"] + imagePullPolicy: IfNotPresent + resources: + limits: + nvidia.com/gpu: 0 +``` + +Deploy: + +```bash +kubectl apply -f benchmark-job.yaml +``` + +Wait for pod to be ready: + +```bash +kubectl get pods -n benchmark +``` + +### Step 4.2: Copy Dataset to Benchmark Pod + +```bash +POD_NAME=$(kubectl get pods -n benchmark -l job-name=aiperf-benchmark -o jsonpath='{.items[0].metadata.name}') + +kubectl -n benchmark cp mooncake_trace_4x.jsonl ${POD_NAME}:/tmp/mooncake_trace_4x.jsonl +``` + +### Step 4.3: Install AIPerf + +```bash +kubectl -n benchmark exec ${POD_NAME} -- bash -lc '. 
/opt/dynamo/venv/bin/activate && pip install -q aiperf' +``` + +--- + +## Phase 5: Run Benchmarks + +### Step 5.1: Benchmark Router-OFF (Baseline) + +```bash +kubectl -n benchmark exec ${POD_NAME} -- bash -lc ' + . /opt/dynamo/venv/bin/activate + aiperf profile \ + --model "Qwen/Qwen3-32B" \ + --url "http://vllm-agg-no-router-frontend.router-off-test.svc.cluster.local:8000" \ + --endpoint-type chat \ + --input-file /tmp/mooncake_trace_4x.jsonl \ + --custom-dataset-type mooncake_trace \ + --tokenizer "Qwen/Qwen3-32B" \ + --streaming \ + --request-count 1000 \ + --fixed-schedule \ + --output-artifact-dir /tmp/router_off_results +' +``` + +**Note:** This will take 3-5 minutes. The terminal output includes a summary table. + +### Step 5.2: Benchmark Router-ON (KV Smart Router) + +```bash +kubectl -n benchmark exec ${POD_NAME} -- bash -lc ' + . /opt/dynamo/venv/bin/activate + aiperf profile \ + --model "Qwen/Qwen3-32B" \ + --url "http://vllm-agg-router-frontend.router-on-test.svc.cluster.local:8000" \ + --endpoint-type chat \ + --input-file /tmp/mooncake_trace_4x.jsonl \ + --custom-dataset-type mooncake_trace \ + --tokenizer "Qwen/Qwen3-32B" \ + --streaming \ + --request-count 1000 \ + --fixed-schedule \ + --output-artifact-dir /tmp/router_on_results +' +``` + +### Step 5.3: Collect Results + +```bash +# Copy results to local machine +kubectl -n benchmark cp ${POD_NAME}:/tmp/router_off_results/profile_export_aiperf.csv ./router_off_results.csv +kubectl -n benchmark cp ${POD_NAME}:/tmp/router_on_results/profile_export_aiperf.csv ./router_on_results.csv +``` + +--- + +## Phase 6: Analyze Results + +### Key Metrics to Compare + +| Metric | Description | What to Look For | +|--------|-------------|------------------| +| **Time to First Token (TTFT)** | Latency until first token arrives | Lower is better; KV router may reduce with prefix reuse | +| **Inter Token Latency (ITL)** | Average time between tokens | Lower is better; indicates generation speed | +| **Request 
Latency** | Total end-to-end latency | Lower is better; overall user experience | +| **Output Token Throughput** | Tokens generated per second (system-wide) | Higher is better; system efficiency | +| **Request Throughput** | Requests completed per second | Higher is better; capacity | + +### Interpreting Results + +**Your Results May Vary**: The improvement from KV Smart Router depends heavily on your workload characteristics: + +**Factors that increase KV router benefit:** +- **High prefix overlap** (shared system prompts, templates, document contexts) +- **Long prompts** (>2000 tokens) where caching saves significant compute +- **Multi-turn conversations** with context carryover +- **Batch workloads** with similar queries + +**Factors that reduce KV router benefit:** +- **Unique prompts** with no prefix reuse +- **Short prompts** (\<1000 tokens) where routing overhead exceeds benefit +- **Evenly distributed load** where round-robin is already optimal +- **Low request rate** where cache eviction negates benefits + +**Expected Performance:** +- **High prefix overlap workloads**: 20-50% TTFT improvement +- **Moderate prefix overlap**: 10-20% improvement +- **Low prefix overlap**: \<5% improvement (may not be worth enabling) + +**KV Smart Router is beneficial when:** +- TTFT improvements > 20% +- No significant degradation in other metrics +- Workload demonstrates measurable prefix reuse patterns + +**Standard routing is better when:** +- KV router shows \<10% improvement +- Increased latency variance is observed +- Load distribution across workers is more important than cache affinity + +### Example Comparison + +From the terminal output, compare the summary tables: + +``` +Router-OFF (Baseline): + TTFT avg: 12,764 ms p99: 45,898 ms + Request Latency avg: 32,978 ms + Output Token Throughput: 1,614 tokens/sec + Request Throughput: 8.61 req/sec + +Router-ON (KV Router): + TTFT avg: 8,012 ms p99: 28,644 ms (37% faster ✅) + Request Latency avg: 28,972 ms (12% faster ✅) 
+   Output Token Throughput:  1,746 tokens/sec        (8% higher ✅)
+   Request Throughput:       9.33 req/sec            (8% higher ✅)
+```
+
+In this example with all 8 workers healthy, the **KV router significantly outperformed** the baseline:
+- **37% faster TTFT** - Users see first token much sooner
+- **8% higher throughput** - System processes more requests per second
+- **12% lower latency** - Faster end-to-end completion
+
+The Mooncake arxiv dataset has sufficient prefix overlap (long input sequences with similar patterns) to benefit from KV cache-aware routing. Workloads with explicit shared prefixes (system prompts, templates) may see even greater improvements.
+
+---
+
+## Phase 7: Cleanup
+
+```bash
+# Delete deployments
+kubectl delete dynamographdeployment vllm-agg-no-router -n router-off-test
+kubectl delete dynamographdeployment vllm-agg-router -n router-on-test
+
+# Delete namespaces (removes all resources)
+kubectl delete namespace router-off-test
+kubectl delete namespace router-on-test
+kubectl delete namespace benchmark
+```
+
+---
+
+## Troubleshooting
+
+### Issue: Pods Stuck in Pending
+
+**Cause:** Insufficient GPU resources
+
+**Solution:**
+```bash
+# Check GPU availability
+kubectl describe nodes | grep -A 10 "Allocated resources"
+
+# Reduce worker replicas if needed
+kubectl edit dynamographdeployment <deployment-name> -n <namespace>
+```
+
+### Issue: ImagePullBackOff Errors
+
+**Cause:** Version mismatch or missing credentials
+
+**Solution:**
+```bash
+# Check available versions
+kubectl get pods -n dynamo-system -o yaml | grep image:
+
+# Update deployment YAML to match cluster version
+```
+
+### Issue: Operator Not Processing Deployment
+
+**Cause:** Namespace restrictions
+
+**Solution:**
+- Ensure Dynamo platform is Helm-installed in the namespace
+- Verify operator has `--restrictedNamespace=<namespace>` argument
+- Check operator logs: `kubectl logs -n <namespace> deployment/dynamo-platform-dynamo-operator-controller-manager`
+
+### Issue: Workers Not Becoming Ready
+
+**Cause:** Model download failures or probe configuration
+
+**Solution:**
+```bash
+# Check worker logs
+kubectl logs <pod-name> -n <namespace>
+
+# Common issues:
+# - Invalid HuggingFace token
+# - Network connectivity
+# - Insufficient disk space for model
+```
+
+### Issue: Workers Restarting in CrashLoopBackOff
+
+**Cause:** Startup probe timeout - workers killed before finishing initialization
+
+**Symptoms:**
+- Pods show "Container main failed startup probe, will be restarted"
+- Logs show model still downloading or loading when pod is killed
+- Large models (>30GB) take longer than default 22-minute timeout
+
+**Solution:**
+Increase the startup probe `failureThreshold`:
+
+```bash
+# Patch the deployment to allow 32 minutes instead of 22
+kubectl patch dynamographdeployment <deployment-name> -n <namespace> --type='json' \
+  -p='[{"op": "replace", "path": "/spec/services/VllmDecodeWorker/extraPodSpec/mainContainer/startupProbe/failureThreshold", "value": 60}]'
+```
+
+Or update your YAML before deploying:
+```yaml
+startupProbe:
+  httpGet:
+    path: /health
+    port: 9090
+  initialDelaySeconds: 120
+  periodSeconds: 30
+  timeoutSeconds: 10
+  failureThreshold: 60  # 32 minutes total (120s + 60*30s)
+```
+
+**Model Loading Times (approximate):**
+- Qwen3-32B: ~20-25 minutes (first download)
+- Llama-70B: ~25-30 minutes (first download)
+- With cached model on node: ~2-5 minutes
+
+### Issue: Unequal Worker Health
+
+**Cause:** Resource constraints, image pull issues, or configuration errors
+
+**Solution:**
+```bash
+# Check all worker status
+kubectl get pods -n <namespace> -l nvidia.com/dynamo-component-type=worker
+
+# Describe problematic pods
+kubectl describe pod <pod-name> -n <namespace>
+
+# Fix issues before benchmarking or results will be skewed
+```
+
+---
+
+## Advanced Configuration
+
+### Testing Different Models
+
+Replace `Qwen/Qwen3-32B` with your model in:
+- Deployment YAML `args` section
+- AIPerf `--model` and `--tokenizer` parameters
+
+### Adjusting Worker Count
+
+Change `replicas: 8` in the deployment YAMLs.
Ensure both deployments use the same count for fair comparison. + +### Using Custom Datasets + +Replace mooncake dataset with your own JSONL file: +- Format: One request per line with `timestamp` field +- AIPerf supports various formats via `--custom-dataset-type` + +### Disaggregated Prefill/Decode + +For advanced testing, add separate prefill workers: + +```yaml +VllmPrefillWorker: + componentType: worker + replicas: 2 + # ... configuration +``` + +--- + +## Best Practices + +1. **Equal Conditions:** Ensure both deployments have identical worker counts and health before benchmarking +2. **Warm-Up:** Run a small test (100 requests) before the full benchmark to warm up caches +3. **Multiple Runs:** Run benchmarks 3+ times and average results for statistical significance +4. **Monitor Workers:** Watch for any pod restarts or issues during benchmark runs +5. **Document Conditions:** Record cluster state, worker health, and any anomalies +6. **Test Relevant Workloads:** Use datasets that match your actual use case for meaningful results + +--- + +## Conclusion + +This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit. + +For questions or issues, consult the [Dynamo documentation](https://github.com/ai-dynamo/dynamo) or open an issue on GitHub. 
+ +--- + +## Appendix: Files Reference + +- `router-off-deployment.yaml`: Standard routing deployment +- `router-on-deployment.yaml`: KV router enabled deployment +- `benchmark-job.yaml`: AIPerf benchmark pod +- `prepare-dataset.sh`: Dataset preparation script +- Results CSVs: Detailed metrics from AIPerf + +**Repository:** [https://github.com/ai-dynamo/dynamo](https://github.com/ai-dynamo/dynamo) + diff --git a/fern/pages/benchmarks/sla-driven-profiling.md b/fern/pages/benchmarks/sla-driven-profiling.md new file mode 100644 index 00000000000..9e23fa84940 --- /dev/null +++ b/fern/pages/benchmarks/sla-driven-profiling.md @@ -0,0 +1,628 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "SLA-Driven Profiling with DynamoGraphDeploymentRequest" +--- + + +**New to DGDR and SLA-Driven Profiling?** Start with the [SLA-Driven Profiling and Planner Deployment Quick Start Guide](../planner/sla-planner-quickstart.md) for step-by-step instructions. This document provides deeper technical details about the profiling process. + + +## Overview + +Dynamo provides automated SLA-driven profiling through **DynamoGraphDeploymentRequests (DGDR)**. Instead of manually running profiling scripts, you declare your performance requirements and let the Dynamo Operator handle profiling and deployment automatically. 
+ +**Key Benefits:** +- **Declarative**: Specify SLAs, not implementation details +- **Automated**: No manual job setup or result processing +- **Integrated**: Seamlessly works with Dynamo Operator +- **Production-Ready**: Generates optimized configurations with SLA planner + +This document covers: +- Technical details of online vs offline profiling +- Profiling process internals (GPU usage, measurements, interpolation) +- Direct script usage for advanced scenarios +- Comprehensive troubleshooting + +## Support Matrix + +| Backend | Dense Models | MoE Models | +|---------|-------------|------------| +| vLLM | ✅ | 🚧 | +| SGLang | ✅ | ✅ | +| TensorRT-LLM | ✅ | 🚧 | + +Specifically, the profiler sweeps over the following parallelization mapping for prefill and decode: +| Model Architecture | Prefill Parallelization Mapping | Decode Parallelization Mapping | +|---------|-------------|------------| +| MLA+MoE (DeepseekV3ForCausalLM, DeepseekV32ForCausalLM) | TEP, DEP | TEP, DEP | +| GQA+MoE (Qwen3MoeForCausalLM) | TP, TEP, DEP | TP, TEP, DEP | +| Other Models | TP | TP | + + +- Exact model x parallelization mapping support is dependent on the backend. The profiler does not guarantee that the recommended P/D engine configuration is supported and bug-free by the backend. + + +## Using DGDR for Profiling (Recommended) + +The recommended way to profile models is through DGDRs. Sample configurations are provided in `deploy/`: + +**Available Samples:** +- **`profile_sla_dgdr.yaml`**: Standard profiling with AIPerf on real engines +- **`profile_sla_aic_dgdr.yaml`**: Fast profiling with AI Configurator simulation +- **`profile_sla_moe_dgdr.yaml`**: MoE model profiling + +The Dynamo Operator automatically: +1. Discovers GPU resources (cluster-scoped operators only) +2. Runs profiling (AIPerf on real engines or AI Configurator simulation) +3. Generates optimal DGD configuration with SLA planner +4. 
Deploys the DGD to your cluster + +See the [Quick Start Guide](../planner/sla-planner-quickstart.md) for prerequisites and detailed instructions. + +## Hardware Configuration + +Hardware parameters have sensible defaults and are **optional** - you can override them if needed: + +```yaml +profilingConfig: + config: + # Override hardware defaults if needed + hardware: + min_num_gpus_per_engine: 1 + max_num_gpus_per_engine: 8 + num_gpus_per_node: 8 + + # Only needed when using AI Configurator (sweep.use_ai_configurator: true) + sweep: + aic_system: h200_sxm # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.) +``` + +### Automatic GPU Discovery (Optional Feature) + +Cluster-scoped operators can optionally enable automatic GPU discovery to detect hardware from cluster nodes. When enabled, hardware config is auto-detected and overrides any manually specified values. + +```yaml +spec: + enableGpuDiscovery: true +``` + +This feature is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions. It is not available for namespace-restricted operators. + +## Profiling Method + +1. **Hardware Setup**: Uses defaults or user-specified hardware configuration. Optionally, cluster-scoped operators can enable automatic GPU discovery to detect specifications from cluster nodes. +2. **Identify Sweep Ranges**: Automatically determine minimum and maximum number of GPUs per engine. Minimum is determined by the model size and GPU VRAM. Maximum is set to one node for dense model and 4 nodes for MoE models. +3. **Parallelization Mapping Sweep**: Use the input ISL and OSL, test the performance of the engines with different parallelization mappings. + - For dense models, we test different TP sizes for both prefill and decode. + - For MoE models (SGLang), we evaluate both TEP and DEP as candidates for prefill and decode. 
+    - **Prefill**:
+      - TP/TEP: We measure TTFT with batch size = 1 (assuming ISL is long enough to saturate compute) without KV reuse.
+      - DEP: Attention uses data parallelism. We send a single burst with total concurrency `attention_dp_size × attn_dp_num_req_ratio` (defaults to 4) and compute the reported TTFT as `time_to_first_token.max / attn_dp_num_req_ratio` from the AIPerf summary of that burst. This stabilizes measurements when the first batch may launch before all requests arrive.
+    ![Prefill Performance](../../assets/img/h100-prefill-performance.png)
+    - **Decode**: Since the ITL (or iteration time) depends on how many requests are in-flight, we measure the ITL under different numbers of in-flight requests. The range of the number of in-flight requests is from 1 to the maximum number of requests that the kv cache of the engine can hold. To measure the ITL without being affected by piggy-backed prefill requests, the script will enable kv-reuse and warm up the engine by issuing the same prompts before measuring the ITL. Since the kv cache is sufficient for all the requests, it can hold the kv cache of the pre-computed prompts and skip the prefill phase when measuring the ITL. However, for MoE models, this is not guaranteed because the kv cache in different attention DP ranks is different. We are working on a framework-side change to fix this issue. For example, the below plot shows the decode parallelization mapping sweep results for H100 for deepseek-ai/DeepSeek-R1-Distill-Llama-8B.
+    ![Decode Performance](../../assets/img/h100-decode-performance.png)
+4. **Recommendation**: Selects the optimal parallelization mapping for prefill and decode that achieves the highest per-GPU throughput while adhering to the SLA on TTFT and ITL. Specifically, the profiler will choose the point (or, for decode, a point on the curve) that lies to the left of the vertical red dashed line representing the SLAs while having the highest y coordinate (throughput per GPU).
+5. **In-Depth Profiling on the Recommended P/D Engine**: After finding the best TP size for prefill and decode, the script will then interpolate the TTFT with ISL, and the ITL with active KV cache and decode context length. This provides a more accurate estimation of the performance when ISL and OSL change and will be used in the sla-planner.
+![ITL Interpolation](../../assets/img/pd-interpolation.png)
+  - **Prefill**: Measures TTFT and throughput per GPU across different input lengths with batch size=1.
+  - **Decode**: Measures ITL and throughput per GPU under various KV cache loads and decode context lengths. The active kv usage determines the complexity of the memory-bounded attention kernel, while the active kv usage divided by the average context length determines the complexity of the computation-bound MLP kernel. For example, the below figure shows the ITL of the DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length, and the slope increases as the context length decreases.
+
+
+To run the parallelization mapping sweep and the in-depth profiling on the recommended P/D engine, the profiler needs to know the engine's forward pass time under different loads. There are two ways to achieve this: run AIPerf on real engines or use AI Configurator to run simulations.
+
+### AIPerf on Real Engines
+
+Profiles your model by creating real test deployments in Kubernetes and measuring their performance.
+
+**Characteristics:**
+- **Duration**: 2-4 hours
+- **Accuracy**: Highest (real measurements)
+- **GPU Requirements**: Full access to test different parallelization mappings
+- **Backends**: vLLM, SGLang, TensorRT-LLM
+
+**DGDR Configuration:**
+```yaml
+profilingConfig:
+  config:
+    sweep:
+      use_ai_configurator: false  # Default
+```
+
+### AI Configurator Simulation
+
+Uses performance simulation to rapidly estimate optimal configurations without running real deployments.
+ +**Characteristics:** +- **Duration**: 20-30 seconds +- **Accuracy**: Estimated (may have errors for unusual configurations) +- **GPU Requirements**: None +- **Backends**: TensorRT-LLM only (vLLM/SGLang coming soon) + +**DGDR Configuration:** +```yaml +profilingConfig: + config: + sweep: + use_ai_configurator: true + aic: + system: h200_sxm # GPU system type + model_name: QWEN3_32B # AIC model identifier + backend_version: "0.20.0" +``` + +**Supported Configurations:** + +For the current list of supported models, systems, and backend versions, see the [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features). + +To check from the command line: `aiconfigurator cli --help` + +**Currently supports:** +- **Backends**: TensorRT-LLM (versions 0.20.0, 1.0.0rc3, 1.0.0rc6) +- **Systems**: H100 SXM, H200 SXM, B200 SXM, GB200 SXM, A100 SXM +- **Models**: Wide range including GPT, Llama, Mixtral, DeepSeek, Qwen, and more + +### Output Format + +After profiling, the DGDR status contains: + +1. **Recommended Configuration**: Optimal TP for prefill and decode +2. **Performance Data**: Interpolation models for SLA planner +3. **Generated DGD**: Complete deployment manifest + +**Example Recommendations:** +``` +Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU) +Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU) +``` + +#### Interactive Configuration Selection WebUI + +When running the profiler with `--pick-with-webui`, an interactive web interface is launched that allows you to visually explore profiling results and manually select configurations. 
+ +**Features:** +- **Interactive Charts**: Visualize prefill TTFT, decode ITL, and GPU hours analysis with hover-to-highlight synchronization between charts and tables +- **Pareto-Optimal Analysis**: The GPU Hours table shows pareto-optimal configurations balancing latency and throughput +- **DGD Config Preview**: Click "Show Config" on any row to view the corresponding DynamoGraphDeployment YAML +- **GPU Cost Estimation**: Toggle GPU cost display to convert GPU hours to cost ($/1000 requests) +- **SLA Visualization**: Red dashed lines indicate your TTFT and ITL targets + +**Selection Methods:** +1. **GPU Hours Table** (recommended): Click any row to select both prefill and decode configurations at once based on the pareto-optimal combination +2. **Individual Selection**: Click one row in the Prefill table AND one row in the Decode table to manually choose each + +**Example DGD Config Output:** + +When you click "Show Config", you'll see a DynamoGraphDeployment configuration like: + +```yaml +# DynamoGraphDeployment Configuration +# Prefill: 1 GPU(s), TP=1 +# Decode: 4 GPU(s), TP=4 +# Model: Qwen/Qwen3-32B-FP8 +# Backend: trtllm +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +spec: + services: + PrefillWorker: + subComponentType: prefill + replicas: 1 + extraPodSpec: + mainContainer: + args: + - --tensor-parallel-size=1 + DecodeWorker: + subComponentType: decode + replicas: 1 + extraPodSpec: + mainContainer: + args: + - --tensor-parallel-size=4 +``` + +**Usage:** +```bash +python -m benchmarks.profiler.profile_sla \ + --backend trtllm \ + --config path/to/disagg.yaml \ + --pick-with-webui \ + --use-ai-configurator \ + --model Qwen/Qwen3-32B-FP8 \ + --aic-system h200_sxm \ + --ttft 200 --itl 15 +``` + +Once you have selected a configuration, the full DynamoGraphDeployment CRD will be saved in your output folder as `config_with_planner.yaml`. + +The WebUI launches on port 8000 by default (configurable with `--webui-port`). 
+ +#### Output Performance Plots + +The profiler will generate the following plots to better visualize the performance data: + +**Parallelization Mapping Sweep Plots:** +- `prefill_performance.png`: TTFT vs Parallelization Mapping size +- `decode_performance.png`: ITL vs Parallelization Mapping size and in-flight requests + +Note these two plots are based on the input ISL and OSL. + +**In-Depth Profiling for the Recommended P/D Engine Plots:** +- `selected_prefill_interpolation/prefill_ttft_interpolation.png`: TTFT vs ISL for the recommended prefill engine +- `selected_prefill_interpolation/prefill_throughput_interpolation.png`: Throughput vs ISL for the recommended prefill engine +- `selected_decode_interpolation/decode_itl_interplation.png`: ITL vs KV usage and context length for the recommended decode engine +- `selected_decode_interpolation/decode_throughput_interpolation.png`: Throughput vs KV usage and context length for the recommended decode engine + + +### Output Interpolation Data + +The profiler generates `.npz` files to store the performance data for the recommended P/D engine: + +**Prefill Interpolation** (`selected_prefill_interpolation/raw_data.npz`): +- `prefill_isl`: 1D array of input sequence lengths tested +- `prefill_ttft`: 1D array of TTFTs (ms) at each ISL +- `prefill_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each ISL + +**Decode Interpolation** (`selected_decode_interpolation/raw_data.npz`): +- `max_kv_tokens`: Total KV tokens capacity in decode engine +- `x_kv_usage`: 1D array of active KV usage percentages [0, 1] +- `y_context_length`: 1D array of average context lengths tested +- `z_itl`: 1D array of ITLs (ms) at each (KV usage, context length) point +- `z_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each point + +## DGDR Configuration Reference + +This section provides detailed explanations of all DGDR `profilingConfig` options. 
The DGDR controller passes this configuration to the profiler script, which is defined in `benchmarks/profiler/utils/profiler_argparse.py`. + +### Configuration Structure + +All profiler configuration goes under `spec.profilingConfig.config`: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeploymentRequest +metadata: + name: my-deployment +spec: + model: "Qwen/Qwen3-0.6B" # High-level: model to deploy + backend: vllm # High-level: inference backend + + profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Required + configMapRef: # Optional: base DGD config + name: my-config + key: disagg.yaml + + config: # Profiler configuration + sla: { ... } + hardware: { ... } + sweep: { ... } + aic: { ... } + planner: { ... } + + deploymentOverrides: # Optional + workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" +``` + +### SLA Configuration (Required) + +Define your performance requirements and workload characteristics: + +```yaml +profilingConfig: + config: + sla: + isl: 3000 # Average input sequence length (tokens) + osl: 150 # Average output sequence length (tokens) + ttft: 200.0 # Target Time To First Token (milliseconds) + itl: 20.0 # Target Inter-Token Latency (milliseconds) +``` + +**What these control:** +- **ISL/OSL**: Based on your expected traffic patterns +- **TTFT**: First token latency target (lower = more GPUs needed, affects prefill engine) +- **ITL**: Token generation latency target (lower = more GPUs needed, affects decode engine) +- **Trade-offs**: Tighter SLAs require more GPU resources + +### Hardware Configuration (Optional) + +Control GPU search space and constraints: + +```yaml +profilingConfig: + config: + hardware: + min_num_gpus_per_engine: 2 # if not provided, will automatically determine based on model and VRAM size + max_num_gpus_per_engine: 8 # Maximum GPUs to test + num_gpus_per_node: 8 # GPUs per node (for multi-node MoE) + gpu_type: h200_sxm # GPU type hint +``` + +**When to use:** +- 
**min_num_gpus_per_engine**: Skip small TP sizes if your model is large +- **max_num_gpus_per_engine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error)) +- **num_gpus_per_node**: Determine the upper bound of number of GPUs per node for dense models and configure Grove for multi-node MoE engines. +- **gpu_type**: Informational, auto-detected by controller + + +If you don't specify hardware constraints, the controller auto-detects based on your model size and available cluster resources. + + +### Sweep Configuration (Optional) + +Control profiling behavior: + +```yaml +profilingConfig: + config: + sweep: + use_ai_configurator: false # Use offline profiling (default: false) + prefill_interpolation_granularity: 16 # Samples for prefill TTFT curve + decode_interpolation_granularity: 6 # Samples for decode ITL curve +``` + +**Use cases:** +- **use_ai_configurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only) +- **prefill_interpolation_granularity**: How many samples to benchmark for prefill TTFT curve (lower = faster but may be less accurate) +- **decode_interpolation_granularity**: How many samples to benchmark for decode ITL curve (lower = faster but may be less accurate). Since ITL interpolation is a 3d plot and takes longer to run, we default to a smaller number of samples. Increasing this value might quadratically increase the profiling time. 
+ +### AI Configurator Configuration (Required if `use_ai_configurator: true`) + +Configure AI Configurator profiling mode: + +```yaml +profilingConfig: + config: + sweep: + use_ai_configurator: true + aic_system: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm + aic_hf_id: Qwen/Qwen3-32B # Huggingface model id + aic_backend_version: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3 +``` + +**Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features) + +### Planner Configuration (Optional) + +Pass arguments to the SLA planner: + +```yaml +profilingConfig: + config: + planner: + planner_min_endpoint: 2 # Minimum endpoints to maintain + planner_adjustment_interval: 60 # Adjustment interval (seconds) + planner_load_predictor: linear # Load prediction method +``` + + +Planner arguments use `planner_` prefix. See planner documentation for full list. + + +### Engine Configuration (Auto-configured) + +The controller automatically sets these from high-level fields: + +```yaml +# You specify: +spec: + model: "Qwen/Qwen3-0.6B" + backend: vllm + +# Controller auto-injects into config: +profilingConfig: + config: + deployment: + model: "Qwen/Qwen3-0.6B" # From spec.model + engine: + backend: vllm # From spec.backend + config: /path/to/configmap # From spec.profilingConfig.configMapRef (if provided) +``` + +**You should not manually set** `deployment.model` or `engine.backend` in `profilingConfig.config` - they are automatically injected from the high-level fields. 
+ +### Complete Example: AIPerf on Real Engines + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeploymentRequest +metadata: + name: vllm-dense-online +spec: + model: "Qwen/Qwen3-0.6B" + backend: vllm + + profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" + config: + sla: + isl: 3000 + osl: 150 + ttft: 200.0 + itl: 20.0 + + hardware: + min_num_gpus_per_engine: 1 + max_num_gpus_per_engine: 8 + + sweep: + use_ai_configurator: false + + deploymentOverrides: + workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" + + autoApply: true +``` + +### Complete Example: AI Configurator Simulation + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeploymentRequest +metadata: + name: trtllm-aic-offline +spec: + model: "Qwen/Qwen3-32B" + backend: trtllm + + profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1" + config: + sla: + isl: 4000 + osl: 500 + ttft: 300.0 + itl: 10.0 + + sweep: + use_ai_configurator: true + + aic: + system: h200_sxm + model_name: QWEN3_32B + backend_version: "0.20.0" + + deploymentOverrides: + workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1" + + autoApply: true +``` + +### Complete Example: MoE Model + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeploymentRequest +metadata: + name: sglang-moe +spec: + model: "deepseek-ai/DeepSeek-R1" + backend: sglang + + profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1" + config: + sla: + isl: 2048 + osl: 512 + ttft: 300.0 + itl: 25.0 + + hardware: + num_gpus_per_node: 8 + max_num_gpus_per_engine: 32 + + engine: + is_moe_model: true # Enable MoE profiling mode + + deploymentOverrides: + workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1" + + autoApply: true +``` + +## Troubleshooting + +### Profiling Takes Too Long + +**Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only): +```yaml +sweep: + use_ai_configurator: true +``` + 
+**Solution 2**: Reduce search space: +```yaml +config: + sweep: + min_num_gpus: 4 # Skip TP1, TP2 + max_num_gpus: 8 # Don't test beyond TP8 +``` + +### SLA Cannot Be Met + +**Symptoms**: Profiler reports no configuration meets targets + +**Solutions:** +1. Relax SLA targets (increase TTFT/ITL) +2. Add more GPU resources +3. Try a different backend +4. Use a smaller model + +### AI Configurator: Attention Head Constraint Error + +**Symptoms**: Profiling fails with error: +``` +AssertionError: num_heads should be divisible by tp_size and the division result should be >= 4 +``` + +**Cause**: AI Configurator requires **≥4 attention heads per GPU**. Small models with few heads cannot use high TP sizes. + +**Affected Models:** +- **Qwen3-0.6B** (16 heads): Max TP = 4 ❌ Fails at TP=8 +- **GPT-2** (12 heads): Max TP = 3 +- Most models **\<1B parameters**: May hit this constraint + +**Solution**: Limit `max_num_gpus_per_engine` in your DGDR: + +```yaml +profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1" + config: + hardware: + max_num_gpus_per_engine: 4 # For Qwen3-0.6B (16 heads / 4 = max TP of 4) + sweep: + use_ai_configurator: true + aic: + system: h200_sxm + model_name: QWEN3_0_6B +``` + +**Calculate Max TP**: `max_tp = num_attention_heads / 4` + +> **Note**: This is an AI Configurator limitation. Online profiling doesn't have this constraint. + +### Image Pull Errors + +**Symptoms**: `ErrImagePull` or `ImagePullBackOff` + +**Solution**: Ensure image pull secrets are configured: +```bash +kubectl create secret docker-registry nvcr-imagepullsecret \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password= \ + --namespace +``` + +### Out of Memory During Profiling + +**Symptoms**: OOM errors in profiling jobs + +**Solutions:** +1. Reduce `gpu_memory_utilization` in engine config +2. Reduce `--max-context-length` +3. Skip larger TP configurations +4. 
Use fewer GPUs per test + +### Unsupported Parallelization Mapping in Backend + +**Symptoms**: Start-time or runtime error in the backend. For example, a prime number of attention heads restricts TP size to 1 (i.e., falcon-7b with 71 attention heads). Or some backends do not support different TP sizes for prefill and decode. + +**Solutions:** +1. Contact the backend maintainers to add support for the use cases and bump the backend version in Dynamo. +2. Restrict the max and min number of GPUs per engine to the supported range. + +## Next Steps + +- **Deploy with DGDR**: See [Quick Start Guide](../planner/sla-planner-quickstart.md) +- **Understand SLA Planner**: Read [SLA Planner Deep Dive](../planner/sla-planner.md) +- **Monitor Deployments**: Set up [Observability](../kubernetes/observability/metrics.md) +- **Optimize Performance**: See [Performance Tuning](../performance/tuning.md) + +## Related Documentation + +- [DGDR API Reference](../kubernetes/api-reference.md) +- [SLA Planner Quick Start](../planner/sla-planner-quickstart.md) +- [SLA Planner Architecture](../planner/sla-planner.md) +- [Profiler Arguments Reference](https://github.com/ai-dynamo/dynamo/tree/main/benchmarks/profiler/utils/profiler_argparse.py) diff --git a/fern/pages/design-docs/architecture.md b/fern/pages/design-docs/architecture.md new file mode 100644 index 00000000000..e43e41ead89 --- /dev/null +++ b/fern/pages/design-docs/architecture.md @@ -0,0 +1,99 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "High Level Architecture" +--- + +Dynamo is NVIDIA's high-throughput, low-latency inference framework that's designed to serve generative AI and reasoning models in multi-node distributed environments.
It's inference engine agnostic, supporting TRT-LLM, vLLM, SGLang and others, while capturing essential LLM capabilities: + +- **Disaggregated prefill & decode inference**: Maximizes GPU throughput and helps you balance throughput and latency +- **Dynamic GPU scheduling**: Optimizes performance based on real-time demand +- **LLM-aware request routing**: Eliminates unnecessary KV cache recomputation +- **Accelerated data transfer**: Reduces inference response time using NIXL +- **KV cache offloading**: Uses multiple memory hierarchies for higher system throughput and lower latency + +Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, Open Source Software (OSS)-first development approach + +## Motivation behind Dynamo + +Scaling inference for generative AI and reasoning models presents complex challenges in three key areas: performance, correctness, and efficiency. Here's what we're solving: + +There are multi-faceted challenges: + +- *Difficult UX*: User experience is critical for distributed inference runtimes because managing large-scale inference systems is already complex, and poor usability further complicates matters. Developers need a clear, intuitive way to define, optimize, and update inference execution without wrestling with low-level infrastructure details. Without simple UX, inference runtimes remain inaccessible, prone to errors, and inefficient, hindering model deployment and innovation. A modern distributed inference stack must consider usability at its core—empowering developers to scale AI effortlessly for agentic workflows while ensuring correctness and performance. + +- *GPU underutilization*: Traditional monolithic inference pipelines often leave GPUs idle due to the imbalance between prefill and decode stages. Prefill (which generates large prompt embeddings) is highly compute-intensive, while decode (which generates tokens) is latency-sensitive. 
A disaggregated approach that separates prefill and decode ensures optimal GPU utilization and increases overall throughput ([DistServe](https://arxiv.org/abs/2401.09670)). + +- *Expensive KV cache re-computation*: When requests aren't efficiently routed, KV caches (intermediate states of transformer model) often get flushed and recomputed, leading to wasted computation cycles and increased latency. KV-aware request routing eliminates redundant KV cache regeneration, significantly boosting efficiency ([DeepSeek](https://arxiv.org/abs/2501.12948)) + +- *Memory bottlenecks*: Large-scale inference workloads demand extensive KV cache storage, which can quickly overwhelm GPU memory capacity. KV cache offloading across memory hierarchies (HBM, DDR, NVMe or remote storage) enables models to scale beyond GPU memory limits and reduces latency. ([Mooncake](https://kvcache-ai.github.io/Mooncake/design/mooncake-store.html), [AIBrix](https://blog.vllm.ai/2025/02/21/aibrix-release.html), [LMCache](https://lmcache.ai/)) + +- *Fluctuating demand and inefficient GPU allocation*: Inference workloads are use-case specific and dynamic—demand surges inherently cause unpredictability, yet traditional serving stacks allocate GPUs statically. Dynamic GPU scheduling ensures that resources are allocated based on real-time demand, preventing over-provisioning and improving utilization ([AzureTrace](https://github.com/Azure/AzurePublicDataset)) + +- *Inefficient data transfer*: Distributed inference workloads introduce unique and highly dynamic communication patterns that differ fundamentally from training. Unlike training, where worker roles remain largely static, inference requires real-time worker scaling, dynamic load balancing, and adaptive memory management—necessitating a communication layer that can efficiently handle these evolving requirements. Contemporary libraries are built for static, synchronous operations and lack the dynamicity needed for inference serving.
While UCX provides high-performance networking, it requires deep networking expertise to configure correctly, making it impractical for broad inference use cases. Developers need a library optimized for inference workloads that can abstract heterogeneous memory (remote memory or storage) and dynamically select the best transport mechanism via a unified API. + +To address the growing demands of distributed inference serving, NVIDIA introduces Dynamo. This innovative product tackles key challenges in scheduling, memory management, and data transfer. Dynamo employs KV-aware routing for optimized decoding, leveraging existing KV caches. For efficient global memory management at scale, it strategically stores and evicts KV caches across multiple memory tiers—GPU, CPU, SSD, and object storage—enhancing both time-to-first-token and overall throughput. Dynamo features NIXL (NVIDIA Inference tranXfer Library), a new data transfer engine designed for dynamic scaling and low-latency storage access. + +## Key benefits + +The following diagram outlines Dynamo's high-level architecture. To enable large-scale distributed and disaggregated inference serving, Dynamo includes five key features: + +- [Dynamo Disaggregated Serving](disagg-serving.md) +- [Dynamo Smart Router](../router/kv-cache-routing.md) +- [Dynamo KV Cache Block Manager](../kvbm/kvbm-intro.md) +- [Planner](../planner/planner-intro.md) +- [NVIDIA Inference Transfer Library (NIXL)](https://github.com/ai-dynamo/nixl/blob/main/docs/nixl.md) + +Every component in the Dynamo architecture is independently scalable and portable. The API server can adapt to task-specific deployment. A smart router processes user requests to route them to the optimal worker for performance. Specifically, for Large Language Models (LLMs), Dynamo employs KV cache-aware routing, which directs requests to the worker with the highest cache hit rate while maintaining load balance, expediting decoding. 
This routing strategy leverages a KV cache manager that maintains a global radix tree registry for hit rate calculation. The KV cache manager also oversees a multi-tiered memory system, enabling rapid KV cache storage and eviction. This design results in substantial TTFT reductions, increased throughput, and the ability to process extensive context lengths. + +![Diagram of the NVIDIA Dynamo architecture for distributed AI inference, including User Requests, Planner, API Server, Smart Router, and Disaggregated Serving](../../assets/img/architecture.png "Dynamo Architecture") + +Dynamo enables dynamic worker scaling, responding to real-time deployment signals. These signals, captured and communicated through an event plane, empower the Planner to make intelligent, zero-downtime adjustments. For instance, if Dynamo detects an increase in requests with long input sequences, the Planner automatically scales up prefill workers to meet the heightened demand. + +Beyond efficient event communication, data transfer across multi-node deployments is crucial at scale. To address this, Dynamo utilizes NIXL, a technology designed to expedite transfers through reduced synchronization and intelligent batching. This acceleration is particularly vital for disaggregated serving, ensuring minimal latency when prefill workers pass KV cache data to decode workers. + +Dynamo prioritizes seamless integration. Its modular design enables it to work harmoniously with your existing infrastructure and preferred open-source components. To achieve optimal performance and extensibility, Dynamo leverages the strengths of both Rust and Python. We built critical performance-sensitive modules with Rust for speed, memory safety, and robust concurrency. Meanwhile, we used Python for its flexibility, enabling rapid prototyping and effortless customization. 
+ +## Performance benefits of key features + +### Disaggregated serving + +Disaggregating prefill and decode boosts performance, gaining efficiency when more GPUs are involved in inference. For example, for Llama 70B, single-node tests show a 30% throughput/GPU improvement, while two-node setups achieve over 2X gains due to better parallelization. + +![Two scatter plots comparing the performance of disagg and baseline configurations on one node versus two nodes](../../assets/img/disagg-perf-benefit.png) + +* Tested on H100s with R1 Distilled Llama 70B model FP8 using vLLM. 3K ISL/ 150 OSL + + +The disaggregation of prefill and decode phases offers valuable flexibility. Since these phases directly correlate with time-to-first-token (TTFT) and inter-token latency (ITL) respectively, adjusting worker allocation can provide tailored performance. This enables optimization for specific service level agreements (SLAs), whether prioritizing faster TTFT, lower ITL, or higher throughput. + +### KV aware routing + +![Two bar charts comparing Random routing and Dynamo with KV aware routing for Time To First Token (3x faster with Dynamo) and Avg request latency (2x faster with Dynamo).](../../assets/img/kv-routing.png) + +* Tested with 100K requests to R1 using R1 Distilled Llama 70B FP8 on 2 nodes of H100s. Avg 4K ISL / 800 OSL + + +Existing routing methods, including load-based routing, overlook the specific properties of LLMs that could improve performance. Addressing this, routing user queries to workers with the highest KV cache hit rate (rather than simply the least busy node) allows for immediate processing, even under heavy load. The preceding figures illustrate the effectiveness of KV aware routing on 100,000 real R1 user queries, achieving a 3x improvement in TTFT and a 2x reduction in average request latency. Depending on traffic, this approach can also enhance throughput.
+ +### KV cache manager + +The Dynamo KV Block Manager (KVBM) enables KV cache offloading to system CPU memory, local SSDs, and network-attached storage, allowing more KV blocks to be reused instead of recomputed. In many cases, KV transfer is faster than recomputation, so KVBM helps reduce time-to-first-token (TTFT). The following plot highlights the performance gains achieved through CPU memory offloading. In a scenario involving 20 multi-turn conversations with 15 users, KVBM with CPU memory offloading achieved a 2.2×–12× improvement in TTFT (depending on QPS), demonstrating benefits that extend beyond basic prefix caching. +![Line graph comparing Pure GPU prefix caching with vLLM and KVBM host offloading for TTFT (Time To First Token)](../../assets/img/kvbm-agg-performance.png) + +* Tested with different QPS using Qwen3-8B on H100. Avg 20K ISL / 100 OSL. + +### NVIDIA Inference Transfer Library (NIXL) + +NIXL streamlines data transfer through simplified synchronization and batching and simplified source and destination abstractions. NIXL can abstract data movement across different types of memory and fast storage, whereas other data transfer libraries typically support a single tier of memory. These enhancements yield significant performance gains, accelerating both time-to-first-token (TTFT) and throughput. + +## Acknowledgements + +We'd like to acknowledge several open source software stacks that motivated our creation Dynamo. + +- vLLM and vLLM-project +- SGLang +- DistServe +- Mooncake +- AIBrix +- BentoML diff --git a/fern/pages/design-docs/disagg-serving.md b/fern/pages/design-docs/disagg-serving.md new file mode 100644 index 00000000000..5e16c1cec78 --- /dev/null +++ b/fern/pages/design-docs/disagg-serving.md @@ -0,0 +1,105 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Disaggregation: Separating Prefill and Decode for Enhanced Performance" +--- + +The prefill and decode phases of LLM requests have different computation characteristics and memory footprints. Disaggregating these phases into specialized llm engines allows for better hardware allocation, improved scalability, and overall enhanced performance. For example, using a larger TP for the memory-bound decoding phase while a smaller TP for the computation-bound prefill phase allows both phases to be computed efficiently. In addition, for requests with long context, separating their prefill phase into dedicated prefill engines allows the ongoing decoding requests to be efficiently processed without being blocked by these long prefills. + +Disaggregated execution of a request has three main steps: +1. Prefill engine computes prefill phase and generates KV cache +2. Prefill engine transfers the KV cache to decode engine, and +3. Decode engine computes decode phase. + +However, not all requests’ prefill phases need to be computed in the remote prefill engine. If the prefill is short or the decode engine has a high prefix cache hit, often it is more efficient to prefill locally in the decode engine. The disaggregation design in Dynamo accounts for all these scenarios and features a flexible framework that delivers strong performance across various conditions. 
+ + +## Design + +```mermaid +sequenceDiagram + participant D as Worker + participant Q as PrefillQueue + participant P as PrefillWorker + + Note over D: Request is routed to decode + D->>D: Decide if prefill should be done locally or remotely + + D->>D: Allocate KV blocks + D->>Q: Put RemotePrefillRequest on the queue + + P->>Q: Pull request from the queue + P-->>D: Read cached KVs from Decode + + D->>D: Decode other requests + P->>P: Run prefill + P-->>D: Write prefilled KVs into allocated blocks + P->>D: Send completion notification + Note over D: Notification received when prefill is done + D->>D: Schedule decoding +``` + +There are four main components in Dynamo disaggregation: +- Worker: execute prefill and decode requests +- Prefill worker: execute prefill requests only +- Disaggregated router: decide whether to prefill locally or remotely +- Prefill queue: cache and load balance the remote prefill requests + +When worker receives a request, it first decides if the prefill should be done locally or remotely using the disaggregated router and allocates the KV blocks. If prefilling remotely, it then pushes a remote prefill request to the prefill queue. After that, the prefill worker pulls from prefill queue, reads KV blocks with prefix cache hit from the worker, computes the prefill, and writes the computed KV blocks back to the worker. Finally, the worker completes the remaining decoding. + +## Conditional Disaggregation + +Not all requests’ prefill phases need to be computed in the remote prefill engine. Disaggregated router decides whether the prefill phase of a request should be computed locally and globally at runtime based on the prefill length and prefill queue status. Specifically, a request is sent to remote prefill engine if the following two conditions are met: +1. The absolute prefill length without prefix cache hit is greater than a preset threshold. 
On the one hand, if the prefill length of a request is short, it can be efficiently computed in the decode engine by piggybacking chunked prefill requests with ongoing decode requests. On the other hand, if the prefix cache hit is long, the prefill becomes memory bound and hence can be more efficiently computed in the decode engine. +2. The number of remote prefill requests in the prefill queue is less than a preset threshold. When the prefill queue has a large number of prefill requests, it indicates that the prefill workers are lagging behind, and it is better to prefill locally until more prefill workers join. + +Conditional disaggregation allows Dynamo to achieve high performance for dynamic workloads + +## Prefill Queue + +Prefill requests are computation bound (except for very short prefills) and should be executed in their dedicated iterations without any other requests to ensure fast TTFT. To balance the load across multiple prefill engines, Dynamo adopts a global prefill queue where workers push remote prefill requests and prefill workers pull and complete the requests one by one. The global prefill queue is implemented based on NATS stream to ensure high performance and availability. + +## Efficient KV Transfer + +```mermaid +sequenceDiagram + participant D as Worker + participant SD as WorkerScheduler + participant SP as PrefillWorkerScheduler + participant P as PrefillWorker + + Note over SD: KV blocks allocated + SD->>SP: Issue remote prefill request
with KV block descriptors via prefill queue + SP->>P: Add to in-flight batch + + P-->>D: Remote NIXL read for prefix hit KV blocks (non-block) + P->>P: Execute prefill + P-->>D: Remote NIXL write for computed KV blocks (non-block) + + P->>SP: Notify finish + SP->>SD: Notify finish + SD->>D: Add to in-flight batch + + D->>D: Execute decode +``` + +The key to high-performance disaggregation is efficient KV transfer. Dynamo leverages NIXL to transfer KV cache directly from the VRAM of the prefill engine to the VRAM of the decode engine. In addition, the KV transfer is non-blocking, allowing the GPU forward pass to serve other requests in addition to the KV transfer. + +After the KV blocks are allocated, the worker scheduler sends the remote prefill requests, which contain the memory descriptors for the allocated KV blocks, to the prefill worker scheduler via prefill queue. This allows the prefill worker to read and write from the remote KV blocks without explicit handling in the remote worker engine, thanks to the RDMA read and write NIXL operations. Once the remote prefill is done, worker scheduler simply adds the decode request to the worker in-flight. This allows workers to execute forward passes of ongoing decode/prefill requests while waiting for the remote prefill to finish. + +To reduce the size of memory descriptors, Dynamo applies two optimizations: +1. After each worker finishes its initialization and allocates all the KV cache pool, it stores the memory descriptor of all blocks (which is also referred to as the NIXL metadata) in ETCD, a distributed key-value store. Prefill workers load and cache the memory descriptors in one worker at the first time that it serves a remote prefill request issued by this worker. Thus, only the KV block ID instead of the full memory descriptor is needed when issuing the remote prefill request. + +2.
Dynamo promotes the memory allocator in the prefill engine to allocate continuous blocks and merge continuous blocks into larger blocks to reduce the total number of KV blocks. + +For decode and prefill with different KV layouts (i.e., due to different TP), Dynamo applies a high-performance kernel that transposes the KV blocks into their matching layout in the KV receiver after the NIXL reads and before the NIXL writes. + +## Runtime-Reconfigurable xPyD + +The prefill queue and NIXL-based KV transfer design in Dynamo naturally allows runtime-reconfigurable xPyD. Workers and prefill workers can be added and removed at runtime without any system-level synchronization or overheads. New and existing prefill workers both just simply pull remote prefill requests from NATS prefill queue. The NIXL metadata of the new or existing workers (for new prefill workers) are lazily loaded and cached when necessary. Specifically, adding and removing workers and prefill workers is as easy as: + +- Add worker: add NIXL metadata in ETCD. +- Remove worker: flush engine and delete NIXL metadata in ETCD. +- Add prefill worker: no explicit action needed. +- Delete prefill worker: flush engine. + diff --git a/fern/pages/design-docs/distributed-runtime.md b/fern/pages/design-docs/distributed-runtime.md new file mode 100644 index 00000000000..e3c45bb70c6 --- /dev/null +++ b/fern/pages/design-docs/distributed-runtime.md @@ -0,0 +1,67 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Distributed Runtime" +--- + +## Overview + +Dynamo's `DistributedRuntime` is the core infrastructure in the framework that enables distributed communication and coordination between different Dynamo components. It is implemented in rust (`/lib/runtime`) and exposed to other programming languages via bindings (i.e., python bindings can be found in `/lib/bindings/python`). 
`DistributedRuntime` follows a hierarchical structure: + +- `DistributedRuntime`: This is the highest level object that exposes the distributed runtime interface. It maintains connection to external services (e.g., etcd for service discovery and NATS for messaging) and manages lifecycle with cancellation tokens. +- `Namespace`: A `Namespace` is a logical grouping of components that isolate between different model deployments. +- `Component`: A `Component` is a discoverable object within a `Namespace` that represents a logical unit of workers. +- `Endpoint`: An `Endpoint` is a network-accessible service that provides a specific service or function. + +While theoretically each `DistributedRuntime` can have multiple `Namespace`s as long as their names are unique (similar logic also applies to `Component/Namespace` and `Endpoint/Component`), in practice, each dynamo components typically are deployed with its own process and thus has its own `DistributedRuntime` object. However, they share the same namespace to discover each other. + +For example, a typical deployment configuration (like `examples/backends/vllm/deploy/agg.yaml` or `examples/backends/sglang/deploy/agg.yaml`) has multiple workers: + +- `Frontend`: Starts an HTTP server and handles incoming requests. The HTTP server routes all requests to the `Processor`. +- `Processor`: When a new request arrives, `Processor` applies the chat template and performs the tokenization. +Then, it routes the request to the `Worker`. +- `Worker` components (e.g., `VllmDecodeWorker`, `SGLangDecodeWorker`, `TrtllmWorker`): Perform the actual computation using their respective engines (vLLM, SGLang, TensorRT-LLM). + +Since the workers are deployed in different processes, each of them has its own `DistributedRuntime`. Within their own `DistributedRuntime`, they all share the same `Namespace` (e.g., `vllm-agg`, `sglang-agg`). 
Then, under their namespace, they have their own `Component`s: `Frontend` uses the `make_engine` function which handles HTTP serving and routing automatically, while worker components create components with names like `worker`, `decode`, or `prefill` and register endpoints like `generate`, `flush_cache`, or `clear_kv_blocks`. The `Frontend` component doesn't explicitly create endpoints - instead, the `make_engine` function handles the HTTP server and worker discovery. Worker components create their endpoints programmatically using the `component.endpoint()` method. Their `DistributedRuntime`s are initialized in their respective main functions, their `Namespace`s are configured in the deployment YAML, their `Component`s are created programmatically (e.g., `runtime.namespace("dynamo").component("worker")`), and their `Endpoint`s are created using the `component.endpoint()` method. + +## Initialization + +In this section, we explain what happens under the hood when `DistributedRuntime/Namespace/Component/Endpoint` objects are created. There are two modes for `DistributedRuntime` initialization: dynamic and static. In static mode, components and endpoints are defined using known addresses and do not change during runtime. In dynamic modes, components and endpoints are discovered through the network and can change during runtime. We focus on the dynamic mode in the rest of this document. Static mode is basically dynamic mode without registration and discovery and hence does not rely on etcd. + +:::caution +The hierarchy and naming in etcd and NATS may change over time, and this document might not reflect the latest changes. Regardless of such changes, the main concepts would remain the same. +::: + +- `DistributedRuntime`: When a `DistributedRuntime` object is created, it establishes connections to the following two services: + - etcd (dynamic mode only): for service discovery. In static mode, `DistributedRuntime` can operate without etcd. 
+ - NATS (both static and dynamic mode): for messaging. + + where etcd and NATS are two global services (there could be multiple etcd and NATS services for high availability). + + For etcd, it also creates a primary lease and spins up a background task to keep the lease alive. All objects registered under this `DistributedRuntime` use this lease_id to maintain their life cycle. There is also a cancellation token that is tied to the primary lease. When the cancellation token is triggered or the background task fails, the primary lease is revoked or expired and the kv pairs stored with this lease_id are removed. +- `Namespace`: `Namespace`s are primarily a logical grouping mechanism and are not registered in etcd. It provides the root path for all components under this `Namespace`. +- `Component`: When a `Component` object is created, similar to `Namespace`, it isn't registered in etcd. When `create_service` is called, it creates a NATS service group using `{namespace_name}.{service_name}` as the service identifier and registers a service in the registry of the `Component`, where the registry is an internal data structure that tracks all services and endpoints within the `DistributedRuntime`. +- `Endpoint`: When an Endpoint object is created and started, it performs two key registrations: + - NATS Registration: The endpoint is registered with the NATS service group created during service creation. The endpoint is assigned a unique subject following the naming: `{namespace_name}.{service_name}.{endpoint_name}-{lease_id_hex}`. + - etcd Registration: The endpoint information is stored in etcd at a path following the naming: `/services/{namespace}/{component}/{endpoint}-{lease_id}`. Note that the endpoints of different workers of the same type (i.e., two `VllmPrefillWorker`s in one deployment) share the same `Namespace`, `Component`, and `Endpoint` name. They are distinguished by their different primary `lease_id` of their `DistributedRuntime`.
+ +## Calling Endpoints + +Dynamo uses a `Client` object to call an endpoint. When a `Client` object is created, it is given the name of the `Namespace`, `Component`, and `Endpoint`. It then sets up an etcd watcher to monitor the prefix `/services/{namespace}/{component}/{endpoint}`. The etcd watcher continuously updates the `Client` with the information, including `lease_id` and NATS subject of the available `Endpoint`s. + +The user can decide which load balancing strategy to use when calling the `Endpoint` from the `Client`, which is done in [push_router.rs](https://github.com/ai-dynamo/dynamo/tree/main/lib/runtime/src/pipeline/network/egress/push_router.rs). Dynamo supports three load balancing strategies: + +- `random`: randomly select an endpoint to hit +- `round_robin`: select endpoints in round-robin order +- `direct`: direct the request to a specific endpoint by specifying the `lease_id` of the endpoint + +After selecting which endpoint to hit, the `Client` sends the serialized request to the NATS subject of the selected `Endpoint`. The `Endpoint` receives the request and creates a TCP response stream using the connection information from the request, which establishes a direct TCP connection to the `Client`. Then, as the worker generates the response, it serializes each response chunk and sends the serialized data over the TCP connection. + +## Examples + +We provide native Rust and Python (through binding) examples for basic usage of `DistributedRuntime`: + +- Rust: `/lib/runtime/examples/` +- Python: We also provide complete examples of using `DistributedRuntime`. Please refer to the engines in `components/src/dynamo` for full implementation details.
+ + diff --git a/fern/pages/design-docs/dynamo-flow.md b/fern/pages/design-docs/dynamo-flow.md new file mode 100644 index 00000000000..df79a2cfa5f --- /dev/null +++ b/fern/pages/design-docs/dynamo-flow.md @@ -0,0 +1,252 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Architecture Flow" +--- + +This diagram shows the NVIDIA Dynamo disaggregated inference system as implemented in [examples/backends/vllm](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm). Color-coded flows indicate different types of operations: + +## 🔵 Main Request Flow (Blue) +The primary user journey through the system: + +1. **Discovery (S1)**: Client discovers the service endpoint +2. **Request (S2)**: HTTP client sends API request to Frontend (OpenAI-compatible server on port 8000) +3. **Validate (S3)**: Frontend forwards request to Processor for validation and routing +4. **Route (S3)**: Processor routes the validated request to appropriate Decode Worker + +## 🟠 Decision and Allocation Flow (Orange) +The system's intelligent routing and resource allocation: + +4. **Query (S4)**: Decode Worker queries for prefix cache hits to optimize processing +5. **Disagg Decision (S5)**: Based on prefill length and queue size, the system decides whether it needs remote prefill +5a. **Allocate (S5a)**: Decode Worker pre-allocates KV cache blocks in its local GPU memory +6. **Queue (S6)**: If remote prefill is required, the system puts the RemotePrefillRequest with block IDs into the PrefillQueue + +## 🟢 Prefill Worker Flow (Green) +The dedicated prefill processing pipeline: + +7. **NATS Pull (S7)**: PrefillQueue uses a NATS consumer group to distribute work to available PrefillWorkers +8. **Load Metadata (S8)**: PrefillWorker loads NIXL metadata from ETCD to establish GPU communication +9. **Prefill (S9)**: Worker executes the prefill computation on the input tokens +10. 
**NIXL Transfer (S10)**: Direct GPU-to-GPU transfer writes the prefilled KV cache to the Decode Worker's pre-allocated blocks + +## 🟣 Completion Flow (Purple) +The response generation and delivery: + +11. **Notify (S11)**: PrefillWorker sends completion notification to Decode Worker +12. **Decode (S12)**: Decode Worker decodes from its local KV cache containing prefilled data +13. **Response (S13)**: The system sends the generated response to the Processor for post-processing, then through the Frontend to the Client + +## 🔗 Infrastructure Connections (Dotted lines) +Coordination and messaging support: + +### ETCD Connections (Gray, dotted) +- **Frontend, Processor, Planner**: Service discovery and registration +- **Decode Worker, PrefillWorker**: NIXL metadata storage for GPU communication setup + +### NATS Connections (Teal, dotted) +- **PrefillQueue**: JetStream consumer group for reliable work distribution +- **Processor**: Load balancing across workers + +### Planning Connections (Gold, dotted) +- **Frontend → Planner**: Metrics collection for auto-scaling decisions +- **Planner → Workers**: Resource scaling commands for both Decode Worker and PrefillWorker + +## Technical Implementation Details + +### NIXL (NVIDIA Interchange Library): +- Enables high-speed GPU-to-GPU data transfers using NVLink/PCIe +- Decode Worker publishes GPU metadata to ETCD for coordination +- PrefillWorker loads metadata to establish direct communication channels +- Block-based transfers (64–128 tokens per block) for efficient batching + +### Disaggregated KV Cache: +- Each Decode Worker maintains local KV cache in its GPU memory +- No shared storage bottlenecks—all transfers are direct worker-to-worker +- Pre-allocated blocks ensure deterministic memory layout and performance + +```mermaid +%%{init: {'theme':'dark', 'themeVariables': {'primaryColor': '#f4f4f4', 'primaryTextColor': '#333333', 'primaryBorderColor': '#888888', 'lineColor': '#4A90E2', 'sectionBkgColor': '#f9f9f9', 
'altSectionBkgColor': '#eeeeee', 'tertiaryColor': '#f0f0f0', 'background': '#ffffff', 'mainBkg': '#f8f8f8', 'secondaryColor': '#f4f4f4', 'nodeTextColor': '#333333'}, 'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'fontFamily': 'Inter, system-ui, -apple-system, "Segoe UI", Roboto, sans-serif', 'fontSize': '18px'}%% +graph TD + %% Top Layer - Client & Frontend + Client["HTTP Client"] + S1[["1 DISCOVERY"]] + Frontend["Frontend
OpenAI Compatible Server
Port 8000
"] + S2[["2 REQUEST"]] + + %% Processing Layer + Processor["Processor
Request Handler & Router"] + S3[["3 VALIDATE"]] + + %% Infrastructure - Positioned strategically to minimize crossings + subgraph INF["Infrastructure Layer"] + ETCD[("ETCD
Service Discovery &
NIXL Metadata
")] + NATS[("NATS
Message Broker")] + Planner["Planner
Resource Management
Auto-scaling
"] + end + + %% Worker Layer - Main processing + subgraph WL["Worker Layer"] + %% VllmWorker section + VllmWorker["Decode Worker
Handles Decoding & Disagg Decisions"] + S4[["4 QUERY"]] + S5[["5 DISAGG DECISION"]] + S5a[["5a ALLOCATE"]] + S12[["12 DECODE"]] + S6[["6 QUEUE"]] + S13[["13 RESPONSE"]] + + %% Storage positioned near workers + LocalKVCache[("Local KV Cache
Pre-allocated Blocks")] + + %% Prefill System - Right side to minimize crossings + subgraph PS["Prefill System"] + PrefillQueue["Prefill Queue
NATS JetStream
Consumer Group
"] + PrefillWorker["Prefill Worker
Dedicated Prefill Processing
(Multiple Instances)
"] + S7[["7 NATS PULL"]] + S8[["8 LOAD METADATA"]] + S9[["9 PREFILL"]] + S10[["10 NIXL TRANSFER"]] + S11[["11 NOTIFY"]] + end + end + + %% Main Request Flow (Blue) - Clean vertical flow + Client -.-> S1 + S1 -->|HTTP API Call| Frontend + Frontend -.-> S2 + S2 -->|Process & Validate| Processor + Processor -.-> S3 + S3 -->|Route to Worker| VllmWorker + + %% VllmWorker Internal Flow (Orange) + VllmWorker -.-> S4 + S4 -->|Query Prefix Cache Hit| S5 + S5 -->|Prefill Length & Queue Check| S5a + S5a -->|Continue to Decode| S12 + + %% Allocation & Queuing (Orange) - Minimize crossings + S5a -->|Allocate KV Cache Blocks| LocalKVCache + VllmWorker --> S6 + S6 -->|Put RemotePrefillRequest| PrefillQueue + + %% Prefill Worker Flow (Green) - Self-contained within PS + PrefillQueue -.-> S7 + S7 -->|Consumer Group Pull| PrefillWorker + PrefillWorker -.-> S8 + PrefillWorker -.-> S9 + S9 -->|Execute Prefill| S10 + S10 -->|Direct GPU Transfer| LocalKVCache + PrefillWorker --> S11 + + %% Return Flow (Purple) - Clean return path + S11 -->|Completion Notification| S12 + S12 -->|Decode from KV Cache| S13 + S13 -->|Post-process Response| Processor + Processor -->|HTTP Response| Frontend + Frontend -->|Final Response| Client + + %% Infrastructure Connections - Organized to avoid crossings + %% ETCD Connections - Grouped by proximity + Frontend -.->|Service Discovery| ETCD + Processor -.->|Service Discovery| ETCD + VllmWorker -.->|NIXL Metadata| ETCD + PrefillWorker -.->|NIXL Metadata| ETCD + S8 -.->|Load NIXL Metadata| ETCD + Planner -.->|Service Discovery| ETCD + + %% NATS Connections - Direct to queue system + PrefillQueue -.->|JetStream| NATS + Processor -.->|Load Balancing| NATS + + %% Planning Connections - Strategic positioning + Frontend -.->|Metrics| Planner + Planner -.->|Auto-scaling| VllmWorker + Planner -.->|Auto-scaling| PrefillWorker + + %% Styling - Each component with unique colors + classDef client fill:#e8f5e8,stroke:#2E7D32,stroke-width:3px + classDef frontend 
fill:#fff3e0,stroke:#F57C00,stroke-width:3px + classDef processor fill:#f3e5f5,stroke:#7B1FA2,stroke-width:3px + classDef worker fill:#e3f2fd,stroke:#1565C0,stroke-width:3px + classDef prefillQueue fill:#fff8e1,stroke:#E65100,stroke-width:3px + classDef prefillWorker fill:#fce4ec,stroke:#C2185B,stroke-width:3px + classDef prefillBox fill:#eceff1,stroke:#455A64,stroke-width:3px + classDef planner fill:#f1f8e9,stroke:#558B2F,stroke-width:3px + classDef storage fill:#e0f2f1,stroke:#00695C,stroke-width:3px + classDef etcd fill:#fff9c4,stroke:#F9A825,stroke-width:3px + classDef nats fill:#ede7f6,stroke:#5E35B1,stroke-width:3px + classDef infraLayer fill:#fff9c4,stroke:#FFC107,stroke-width:3px + classDef workerLayer fill:#e3f2fd,stroke:#2196F3,stroke-width:3px + + + class Client client + class Frontend frontend + class Processor processor + class VllmWorker worker + class PrefillQueue prefillQueue + class PrefillWorker prefillWorker + class Planner planner + class LocalKVCache storage + class ETCD etcd + class NATS nats + class PS prefillBox + class INF infraLayer + class WL workerLayer + + + + %% Flow Colors - Different line styles to reduce visual clutter + %% Main Request Flow - Blue (solid) + linkStyle 0 stroke:#1565C0,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 1 stroke:#1565C0,stroke-width:4px + linkStyle 2 stroke:#1565C0,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 3 stroke:#1565C0,stroke-width:4px + linkStyle 4 stroke:#1565C0,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 5 stroke:#1565C0,stroke-width:4px + + %% Decision & Allocation Flow - Orange (mixed) + linkStyle 6 stroke:#E65100,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 7 stroke:#E65100,stroke-width:4px + linkStyle 8 stroke:#E65100,stroke-width:4px + linkStyle 9 stroke:#E65100,stroke-width:3px,stroke-dasharray: 3 3 + + %% KV Cache & Queue - Orange (solid) + linkStyle 10 stroke:#E65100,stroke-width:4px + linkStyle 11 stroke:#E65100,stroke-width:4px + linkStyle 12 
stroke:#E65100,stroke-width:4px + + %% Prefill Worker Flow - Green (mixed) + linkStyle 13 stroke:#2E7D32,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 14 stroke:#2E7D32,stroke-width:4px + linkStyle 15 stroke:#2E7D32,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 16 stroke:#2E7D32,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 17 stroke:#2E7D32,stroke-width:4px + linkStyle 18 stroke:#2E7D32,stroke-width:4px + linkStyle 19 stroke:#2E7D32,stroke-width:4px + + %% Completion Flow - Purple (mixed) + linkStyle 20 stroke:#6A1B9A,stroke-width:4px + linkStyle 21 stroke:#6A1B9A,stroke-width:3px,stroke-dasharray: 3 3 + linkStyle 22 stroke:#6A1B9A,stroke-width:4px + linkStyle 23 stroke:#6A1B9A,stroke-width:4px + linkStyle 24 stroke:#6A1B9A,stroke-width:4px + + %% Infrastructure Flows - Lighter and dotted to reduce visual noise + %% ETCD Connections - Gray (dotted, thinner) + linkStyle 25 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 26 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 27 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 28 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 29 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 30 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8 + + %% NATS Connections - Teal (dotted, thinner) + linkStyle 31 stroke:#26A69A,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 32 stroke:#26A69A,stroke-width:2px,stroke-dasharray: 8 8 + + %% Planning Connections - Gold (dotted, thinner) + linkStyle 33 stroke:#FFA726,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 34 stroke:#FFA726,stroke-width:2px,stroke-dasharray: 8 8 + linkStyle 35 stroke:#FFA726,stroke-width:2px,stroke-dasharray: 8 8 +``` diff --git a/fern/pages/design-docs/event-plane.md b/fern/pages/design-docs/event-plane.md new file mode 100644 index 00000000000..2e1011e1ec5 --- /dev/null +++ b/fern/pages/design-docs/event-plane.md @@ -0,0 +1,466 @@ +--- +# 
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Event Plane Architecture" +--- + +This document describes Dynamo's event plane architecture, which handles service discovery, coordination, and event distribution using etcd and NATS. + +## Overview + +Dynamo's coordination layer adapts to the deployment environment: + +| Deployment | Service Discovery | KV Events | Request Plane | +|------------|-------------------|-----------|---------------| +| **Kubernetes** (with operator) | Native K8s (CRDs, EndpointSlices) | NATS (optional) | TCP | +| **Bare metal / Local** (default) | etcd | NATS (optional) | TCP | + + +The runtime always defaults to `kv_store` (etcd) for service discovery. Kubernetes deployments must explicitly set `DYN_DISCOVERY_BACKEND=kubernetes` - the Dynamo operator handles this automatically. + + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Coordination Layer │ +│ │ +│ ┌─────────────────────────┐ ┌─────────────────────────────────┐ │ +│ │ Service Discovery │ │ NATS │ │ +│ │ │ │ (Optional) │ │ +│ │ • K8s: CRDs + API │ │ • KV Cache Events │ │ +│ │ • Bare metal: etcd │ │ • Router Replica Sync │ │ +│ │ │ │ • JetStream Persistence │ │ +│ └─────────────────────────┘ └─────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────┘ + │ │ + ┌──────────┴──────────┐ ┌─────────┴──────────┐ + ▼ ▼ ▼ ▼ + ┌─────────┐ ┌─────────┐ ┌─────────┐ + │Frontend │ │ Planner │ │ Worker │ + └─────────┘ └─────────┘ └─────────┘ +``` + +## Kubernetes-Native Service Discovery + +When running on Kubernetes with the Dynamo operator, service discovery uses native Kubernetes resources instead of etcd. + +### Configuration + +The operator explicitly sets: +```bash +DYN_DISCOVERY_BACKEND=kubernetes +``` + + +This must be explicitly configured. The runtime defaults to `kv_store` in all environments. 
+ + +### How It Works + +1. **DynamoWorkerMetadata CRD**: Workers register their endpoints by creating/updating DynamoWorkerMetadata custom resources +2. **EndpointSlices**: Used to signal readiness status to the system +3. **K8s API Watches**: Components watch for CRD changes to discover available endpoints + +### Benefits + +- No external etcd cluster required +- Native integration with Kubernetes lifecycle +- Automatic cleanup when pods terminate +- Works with standard K8s RBAC + +### Environment Variables (Injected by Operator) + +| Variable | Description | +|----------|-------------| +| `DYN_DISCOVERY_BACKEND` | Set to `kubernetes` | +| `POD_NAME` | Current pod name | +| `POD_NAMESPACE` | Current namespace | +| `POD_UID` | Pod unique identifier | + +--- + +## etcd Architecture (Default for All Deployments) + +When `DYN_DISCOVERY_BACKEND=kv_store` (the global default), etcd is used for service discovery. + +### Connection Configuration + +etcd connection is configured via environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `ETCD_ENDPOINTS` | Comma-separated etcd URLs | `http://localhost:2379` | +| `ETCD_AUTH_USERNAME` | Basic auth username | None | +| `ETCD_AUTH_PASSWORD` | Basic auth password | None | +| `ETCD_AUTH_CA` | CA certificate path (TLS) | None | +| `ETCD_AUTH_CLIENT_CERT` | Client certificate path | None | +| `ETCD_AUTH_CLIENT_KEY` | Client key path | None | + +Example: +```bash +export ETCD_ENDPOINTS=http://etcd-0:2379,http://etcd-1:2379,http://etcd-2:2379 +``` + +### Lease Management + +Each `DistributedRuntime` maintains a primary lease with etcd: + +``` +┌────────────────────┐ ┌──────────────┐ +│ DistributedRuntime │◄────────│ Primary Lease │ +│ │ │ TTL: 10s │ +│ • Namespace │ └───────┬───────┘ +│ • Components │ │ +│ • Endpoints │ │ Keep-Alive +│ │ │ Heartbeat +└────────────────────┘ ▼ + ┌──────────────┐ + │ etcd │ + └──────────────┘ +``` + +**Lease Lifecycle:** + +1. 
**Creation**: Lease created during `DistributedRuntime` initialization +2. **Keep-Alive**: Background task sends heartbeats at 50% of remaining TTL +3. **Expiration**: If heartbeats stop, lease expires after TTL (10 seconds default) +4. **Cleanup**: All keys associated with the lease are automatically deleted + +**Automatic Recovery:** + +- Reconnection with exponential backoff (50ms to 5s) +- Deadline-based retry logic +- Cancellation token propagation + +### Service Discovery + +Endpoints are registered in etcd for dynamic discovery: + +**Key Format:** +``` +/services/{namespace}/{component}/{endpoint}/{instance_id} +``` + +**Example:** +``` +/services/vllm-agg/backend/generate/694d98147d54be25 +``` + +**Registration Data:** +```json +{ + "namespace": "vllm-agg", + "component": "backend", + "endpoint": "generate", + "instance_id": 7587888160958628000, + "transport": { + "tcp": "192.168.1.10:9999" + } +} +``` + +### Discovery Queries + +The discovery system supports multiple query patterns: + +| Query Type | Pattern | Use Case | +|------------|---------|----------| +| `AllEndpoints` | `/services/` | List all services | +| `NamespacedEndpoints` | `/services/{namespace}/` | Filter by namespace | +| `ComponentEndpoints` | `/services/{namespace}/{component}/` | Filter by component | +| `Endpoint` | `/services/{namespace}/{component}/{endpoint}/` | Specific endpoint | + +### Watch Functionality + +Clients watch etcd prefixes for real-time updates: + +```python +# Client watches for endpoint changes +watcher = etcd.watch_prefix("/services/vllm-agg/backend/generate/") + +for event in watcher: + if event.type == "PUT": + # New endpoint registered + add_endpoint(event.value) + elif event.type == "DELETE": + # Endpoint removed (worker died) + remove_endpoint(event.key) +``` + +**Watch Features:** + +- Initial state retrieval with `get_and_watch_prefix()` +- Automatic reconnection on stream failure +- Revision tracking for no-event-loss guarantees +- Event types: `PUT` 
(create/update) and `DELETE` + +### Distributed Locks + +etcd provides distributed locking for coordination: + +**Lock Types:** + +| Type | Key Pattern | Behavior | +|------|-------------|----------| +| Write Lock | `v1/{prefix}/writer` | Exclusive (no readers/writers) | +| Read Lock | `v1/{prefix}/readers/{id}` | Shared (multiple readers) | + +**Operations:** + +```rust +// Non-blocking write lock +let lock = client.try_write_lock("my_resource").await?; + +// Blocking read lock with polling (100ms intervals) +let lock = client.read_lock_with_wait("my_resource").await?; +``` + +## NATS Architecture + +### When NATS is Used + +NATS is used for: + +1. **KV Cache Events**: Real-time KV cache state updates for routing +2. **Router Replica Sync**: Synchronizing router state across replicas +3. **Legacy Request Plane**: NATS-based request transport (optional) + +### Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `NATS_SERVER` | NATS server URL | `nats://localhost:4222` | + +### Disabling NATS + +For deployments without KV-aware routing: + +```bash +# Disable NATS and KV events +python -m dynamo.frontend --no-kv-events +``` + +This enables "approximate mode" for KV routing without event persistence. 
+ +### Event Publishing + +Components publish events to NATS subjects: + +```rust +pub trait EventPublisher { + async fn publish(&self, event: &str, data: &[u8]) -> Result<()>; + async fn publish_serialized(&self, event: &str, data: &T) -> Result<()>; +} +``` + +**Subject Naming:** +``` +{base_subject}.{event_name} +``` + +Example: +``` +vllm-agg.backend.kv_cache_update +``` + +### Event Subscription + +Components subscribe to events: + +```rust +pub trait EventSubscriber { + async fn subscribe(&self, topic: &str) -> Result; + async fn subscribe_typed(&self, topic: &str) -> Result>; +} +``` + +### JetStream Persistence + +For durable event delivery, NATS JetStream provides: + +- Message persistence +- Replay from offset +- Consumer groups for load balancing +- Acknowledgment tracking + +## Key-Value Store Abstraction + +Dynamo provides a unified KV store interface supporting multiple backends: + +### Supported Backends + +| Backend | Use Case | Configuration | +|---------|----------|---------------| +| `EtcdStore` | Production deployments | `ETCD_ENDPOINTS` | +| `MemoryStore` | Testing, development | Default | +| `NatsStore` | NATS-only deployments | `NATS_SERVER` | +| `FileStore` | Local persistence | File path | + +### Store Interface + +```rust +pub trait KvStore { + async fn get(&self, bucket: &str, key: &str) -> Result>>; + async fn put(&self, bucket: &str, key: &str, value: &[u8]) -> Result<()>; + async fn delete(&self, bucket: &str, key: &str) -> Result<()>; + async fn watch(&self, bucket: &str) -> Result; +} +``` + +### Buckets + +Data is organized into logical buckets: + +| Bucket | Purpose | +|--------|---------| +| `v1/instances` | Endpoint instance registry | +| `v1/mdc` | Model deployment cards | + +## Typed Prefix Watcher + +For type-safe watching of etcd prefixes: + +```rust +// Watch and maintain HashMap of deserialized values +let watcher = watch_prefix_with_extraction::( + &etcd_client, + "/services/vllm-agg/", + lease_id_extractor, + 
value_extractor, +).await?; + +// Receive updates via watch channel +let instances = watcher.borrow(); +``` + +**Key Extractors:** + +| Extractor | Description | +|-----------|-------------| +| `lease_id()` | Use lease ID as key | +| `key_string()` | Extract key with prefix stripping | +| `full_key_string()` | Use full etcd key | + +## Reliability Features + +### Connection Resilience + +**etcd Reconnection:** +- Exponential backoff: 50ms to 5s +- Deadline-based retry logic +- Mutex ensures single concurrent reconnect + +**NATS Reconnection:** +- Built-in reconnection in NATS client +- Configurable max reconnect attempts +- Buffering during disconnection + +### Lease-Based Cleanup + +When a worker crashes or loses connectivity: + +1. Keep-alive heartbeats stop +2. Lease expires after TTL (10 seconds) +3. All registered endpoints automatically deleted +4. Clients receive DELETE watch events +5. Traffic reroutes to healthy workers + +### Transaction Safety + +etcd transactions ensure atomic operations: + +```rust +// Atomic create-if-not-exists +let txn = Txn::new() + .when([Compare::create_revision(key, CompareOp::Equal, 0)]) + .and_then([Op::put(key, value, options)]); + +etcd_client.txn(txn).await?; +``` + +This prevents race conditions in concurrent service registration. + +## Operational Modes + +### Kubernetes Mode (Requires Explicit Configuration) + +Native Kubernetes service discovery: + +```bash +# Operator explicitly sets this (not auto-detected): +export DYN_DISCOVERY_BACKEND=kubernetes + +# Workers register via K8s CRDs +python -m dynamo.vllm --model Qwen/Qwen3-0.6B + +# Frontend discovers workers via K8s API +python -m dynamo.frontend +``` + +No etcd or NATS required for basic operation when using K8s discovery. 
+ +### KV Store Mode (Global Default) + +Full service discovery with etcd: + +```bash +# This is the default - no configuration needed +# export DYN_DISCOVERY_BACKEND=kv_store # (implicit) + +# Workers register with etcd +python -m dynamo.vllm --model Qwen/Qwen3-0.6B + +# Frontend discovers workers via etcd +python -m dynamo.frontend +``` + +### KV-Aware Routing (Optional) + +Enable NATS for KV cache event tracking: + +```bash +# Default: KV events enabled (requires NATS) +python -m dynamo.frontend --router-mode kv + +# Disable KV events for prediction-based routing (no NATS) +python -m dynamo.frontend --router-mode kv --no-kv-events +``` + +With `--no-kv-events`: +- Router predicts cache state based on routing decisions +- TTL-based expiration and LRU pruning +- No NATS infrastructure required + +## Best Practices + +### 1. Use Kubernetes Discovery on K8s + +The Dynamo operator automatically sets `DYN_DISCOVERY_BACKEND=kubernetes` for pods. No additional setup required when using the operator. + +### 2. For Bare Metal: Deploy etcd Cluster + +For bare-metal production deployments, deploy a 3-node etcd cluster for high availability. + +### 3. Configure Appropriate TTLs (etcd mode) + +Balance between detection speed and overhead: + +- **Short TTL (5s)**: Faster failure detection, more keep-alive traffic +- **Long TTL (30s)**: Less overhead, slower detection + +### 4. KV Routing Without NATS + +For simpler deployments without NATS: + +```bash +# Use prediction-based KV routing +python -m dynamo.frontend --router-mode kv --no-kv-events +``` + +This provides KV-aware routing with reduced accuracy but no NATS dependency. 
+ +## Related Documentation + +- [Distributed Runtime](distributed-runtime.md) - Runtime architecture +- [Request Plane](../guides/request-plane.md) - Request transport configuration +- [Fault Tolerance](../fault-tolerance/request-cancellation.md) - Failure handling diff --git a/fern/pages/development/backend-guide.md b/fern/pages/development/backend-guide.md new file mode 100644 index 00000000000..c2c883def4a --- /dev/null +++ b/fern/pages/development/backend-guide.md @@ -0,0 +1,162 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Writing Python Workers in Dynamo" +--- + +This guide explains how to create your own Python worker in Dynamo. + +The [dynamo](https://pypi.org/project/ai-dynamo/) Python library allows you to build your own engine and attach it to Dynamo. + +The Python file must do three things: +1. Decorate a function to get the runtime +2. Register on the network +3. Attach a request handler + +``` +from dynamo.llm import ModelInput, ModelType, register_llm +from dynamo.runtime import DistributedRuntime, dynamo_worker + + # 1. Decorate a function to get the runtime + # + @dynamo_worker() + async def worker(runtime: DistributedRuntime): + + # 2. Register ourselves on the network + # + component = runtime.namespace("namespace").component("component") + model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B" + model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing + model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints + endpoint = component.endpoint("endpoint") + # Optional last param to register_llm is model_name. If not present derives it from model_path + await register_llm(model_input, model_type, endpoint, model_path) + + # Initialize your engine here + # engine = ... + + # 3. 
Attach request handler + # + await endpoint.serve_endpoint(RequestHandler(engine).generate) + +class RequestHandler: + + def __init__(self, engine): + ... + + async def generate(self, request): + # Call the engine + # yield result dict + ... + +if __name__ == "__main__": + uvloop.install() + asyncio.run(worker()) +``` + + +The `model_path` can be: +- A HuggingFace repo ID, optionally prefixed with `hf://`. It is downloaded and cached locally. +- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`. + +The `model_input` can be: +- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing. +- ModelInput.Text. Your engine expects raw text input and handles its own tokenization and pre-processing. + +The `model_type` can be: +- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). +- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions). + +`register_llm` can also take the following kwargs: +- `model_name`: The name to call the model. Your incoming HTTP request's model name must match this. Defaults to the Hugging Face repo name or the folder name. +- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM. +- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16. +- `migration_limit`: Maximum number of times a request may be [migrated to another instance](../fault-tolerance/request-migration.md). Defaults to 0. +- `user_data`: Optional dictionary containing custom metadata for worker behavior (e.g., LoRA configuration).
Defaults to None. + +See `examples/backends` for full code examples. + +## Component names + +A worker needs three names to register itself: namespace.component.endpoint + +* *Namespace*: A pipeline. Usually a model. e.g "llama_8b". Just a name. +* *Component*: A load balanced service needed to run that pipeline. "backend", "prefill", "decode", "preprocessor", "draft", etc. This typically has some configuration (which model to use, for example). +* *Endpoint*: Like a URL. "generate", "load_metrics". +* *Instance*: A process. Unique. Dynamo assigns each one a unique instance_id. The thing that is running is always an instance. Namespace/component/endpoint can refer to multiple instances. + +If you run two models, that is two pipelines. An exception would be if doing speculative decoding. The draft model is part of the pipeline of a bigger model. + +If you run two instances of the same model ("data parallel") they are the same namespace+component+endpoint but different instances. The router will spread traffic over all the instances of a namespace+component+endpoint. If you have four prefill workers in a pipeline, they all have the same namespace+component+endpoint and are automatically assigned unique instance_ids. + +Example 1: Data parallel load balanced, one model one pipeline two instances. +``` +Node 1: namespace: qwen3-32b, component: backend, endpoint: generate, model: /data/Qwen3-32B --tensor-parallel-size 2 --base-gpu-id 0 +Node 2: namespace: qwen3-32b, component: backend, endpoint: generate, model: /data/Qwen3-32B --tensor-parallel-size 2 --base-gpu-id 2 +``` + +Example 2: Two models, two pipelines. +``` +Node 1: namespace: qwen3-32b, component: backend, endpoint: generate, model: /data/Qwen3-32B +Node 2: namespace: llama3-1-8b, component: backend, endpoint: generate, model: /data/Llama-3.1-8B-Instruct/ +``` + +Example 3: Different endpoints. + +The KV metrics publisher in VLLM adds a `load_metrics` endpoint to the current component.
If the `llama3-1-8b.backend` component above is using patched vllm it will also expose `llama3-1-8b.backend.load_metrics`. + +Example 4: Multiple component in a pipeline. + +In the P/D disaggregated setup you would have `deepseek-distill-llama8b.prefill.generate` (possibly multiple instances of this) and `deepseek-distill-llama8b.decode.generate`. + +## Migrate Ongoing Requests + +A Python worker may need to be shut down promptly, for example when the node running the worker is to be reclaimed and there isn't enough time to complete all ongoing requests before the shutdown deadline. + +In such cases, you can signal incomplete responses by raising a `GeneratorExit` exception in your generate loop. This will immediately close the response stream, signaling to the frontend that the stream is incomplete. With request migration enabled (see the [`migration_limit`](../fault-tolerance/request-migration.md) parameter), the frontend will automatically migrate the partially completed request to another worker instance, if available, to be completed. + + +We will update the `GeneratorExit` exception to a new Dynamo exception. Please expect minor code breaking change in the near future. + + +Here's an example of how to implement this in your `RequestHandler`: + +```python +class RequestHandler: + + async def generate(self, request): + """Generate response, with support for request migration""" + for result in self.engine.generate_streaming(request): + # Check if we need to migrate before yielding each token + if is_shutting_down(): + # Raising GeneratorExit closes the stream and triggers migration + raise GeneratorExit("Worker shutting down, migrating request") + + yield result +``` + +When `GeneratorExit` is raised, the frontend receives the incomplete response and can seamlessly continue generation on another available worker instance, preserving the user experience even during worker shutdowns. 
+ +For more information about how request migration works, see the [Request Migration Architecture](../fault-tolerance/request-migration.md) documentation. + +## Request Cancellation + +Your Python worker's request handler can optionally support request cancellation by accepting a `context` argument after the `request` argument. This context object allows you to check for cancellation signals and respond appropriately: + +```python +class RequestHandler: + + async def generate(self, request, context): + """Generate response with cancellation support""" + for result in self.engine.generate_streaming(request): + # Check if the request has been cancelled + if context.is_stopped(): + # Stop processing and clean up + break + + yield result +``` + +The context parameter is optional - if your generate method doesn't include it in its signature, Dynamo will call your method without the context argument. + +For detailed information about request cancellation, including async cancellation monitoring and context propagation patterns, see the [Request Cancellation Architecture](../fault-tolerance/request-cancellation.md) documentation. diff --git a/fern/pages/development/runtime-guide.md b/fern/pages/development/runtime-guide.md new file mode 100644 index 00000000000..c7d60a830a8 --- /dev/null +++ b/fern/pages/development/runtime-guide.md @@ -0,0 +1,103 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Runtime" +--- + +

A Datacenter Scale Distributed Inference Serving Framework

+ +[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +Rust implementation of the Dynamo runtime system, enabling distributed computing capabilities for machine learning workloads. + +## Prerequisites + +### Install Rust and Cargo using [rustup](https://rustup.rs/): + +```bash +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +### Build + +``` +cargo build +cargo test +``` + +### Start Dependencies + +#### Docker Compose + +The simplest way to deploy the pre-requisite services is using +[docker-compose](https://docs.docker.com/compose/install/linux/), +defined in [deploy/docker-compose.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml). + +``` +# At the root of the repository: +docker compose -f deploy/docker-compose.yml up -d +``` + +This will deploy a [NATS.io](https://nats.io/) server and an [etcd](https://etcd.io/) +server used to communicate between and discover components at runtime. + + +#### Local (alternate) + +To deploy the pre-requisite services locally instead of using `docker-compose` +above, you can manually launch each: + +- [NATS.io](https://docs.nats.io/running-a-nats-service/introduction/installation) server with [Jetstream](https://docs.nats.io/nats-concepts/jetstream) + - example: `nats-server -js --trace` +- [etcd](https://etcd.io) server + - follow instructions in [etcd installation](https://etcd.io/docs/v3.5/install/) to start an `etcd-server` locally + + +### Run Examples + +When developing or running examples, any process or user that shares your core services (`etcd` and `nats.io`) will +be operating within your distributed runtime. + +The current examples use a hard-coded `namespace`. We will address the `namespace` collisions later. + +All examples require the `etcd` and `nats.io` pre-requisites to be running and available.
+ +#### Rust `hello_world` + +With two terminals open, in one window: + +``` +cd examples/hello_world +cargo run --bin server +``` + +In the second terminal, execute: + +``` +cd examples/hello_world +cargo run --bin client +``` + +which should yield some output similar to: +``` + Finished `dev` profile [unoptimized + debuginfo] target(s) in 6.25s + Running `target/debug/client` +Annotated { data: Some("h"), id: None, event: None, comment: None } +Annotated { data: Some("e"), id: None, event: None, comment: None } +Annotated { data: Some("l"), id: None, event: None, comment: None } +Annotated { data: Some("l"), id: None, event: None, comment: None } +Annotated { data: Some("o"), id: None, event: None, comment: None } +Annotated { data: Some(" "), id: None, event: None, comment: None } +Annotated { data: Some("w"), id: None, event: None, comment: None } +Annotated { data: Some("o"), id: None, event: None, comment: None } +Annotated { data: Some("r"), id: None, event: None, comment: None } +Annotated { data: Some("l"), id: None, event: None, comment: None } +Annotated { data: Some("d"), id: None, event: None, comment: None } +``` + +#### Python + +See the [README.md](https://github.com/ai-dynamo/dynamo/tree/main/lib/runtime/lib/bindings/python/README.md) for details + +The Python and Rust `hello_world` client and server examples are interchangeable, +so you can start the Python `server.py` and talk to it from the Rust `client`. diff --git a/fern/pages/fault-tolerance/README.md b/fern/pages/fault-tolerance/README.md new file mode 100644 index 00000000000..813e2c8010e --- /dev/null +++ b/fern/pages/fault-tolerance/README.md @@ -0,0 +1,129 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Fault Tolerance" +--- + +Dynamo provides comprehensive fault tolerance mechanisms to ensure reliable LLM inference in production deployments. 
This section covers the various strategies and features that enable Dynamo to handle failures gracefully and maintain service availability. + +## Overview + +Fault tolerance in Dynamo operates at multiple levels: + +| Layer | Mechanism | Purpose | +|-------|-----------|---------| +| **Request** | Migration, Cancellation | Handle in-flight request failures | +| **Worker** | Health Checks, Graceful Shutdown | Detect and recover from worker failures | +| **System** | Load Shedding, Request Rejection | Prevent system overload | +| **Infrastructure** | etcd HA, NATS resilience | Handle infrastructure component failures | + +## Key Features + +### Request Migration + +When a worker fails during request processing, Dynamo can migrate in-progress requests to healthy workers. The migration system: + +- Preserves partial generation state (accumulated tokens) +- Transparently continues generation on a new worker +- Maintains seamless token flow to clients + +See [Request Migration](request-migration.md) for details. + +### Request Cancellation + +Dynamo supports canceling in-flight requests to free computational resources: + +- Graceful stop signals for clean termination +- Kill signals for immediate termination +- Hierarchical cancellation propagation through request chains + +See [Request Cancellation](request-cancellation.md) for details. + +### Graceful Shutdown + +Workers handle shutdown signals (SIGTERM/SIGINT) gracefully: + +- Immediately stop accepting new requests +- Optionally drain in-flight requests before terminating +- Clean up resources (engines, connections, temp files) + +See [Graceful Shutdown](graceful-shutdown.md) for details. + +### Request Rejection (Load Shedding) + +When workers are overloaded, Dynamo rejects new requests to prevent cascading failures: + +- Configurable busy thresholds based on KV cache utilization +- Real-time worker load monitoring +- HTTP 503 responses with retry guidance + +See [Request Rejection](request-rejection.md) for details. 
+ +### Health Checks + +Dynamo provides multiple health check mechanisms: + +- **HTTP Endpoints**: `/health` and `/live` endpoints for orchestration +- **Canary Health Checks**: Active monitoring via periodic test requests +- **Engine Monitoring**: Automatic shutdown on engine failure detection + +See [Health Checks](../observability/health-checks.md) for details. + +## Configuration Quick Reference + +| Feature | Environment Variable | Default | +|---------|---------------------|---------| +| Worker health port | `DYN_SYSTEM_PORT` | `9090` | +| Canary health checks | `DYN_HEALTH_CHECK_ENABLED` | `false` (K8s: `true`) | +| Canary wait time | `DYN_CANARY_WAIT_TIME` | `10` seconds | +| Health check timeout | `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | `3` seconds | +| Decode blocks threshold | `--active-decode-blocks-threshold` | None (disabled) | +| Prefill tokens threshold | `--active-prefill-tokens-threshold` | None (disabled) | + +## Failure Scenarios and Recovery + +### Worker Pod Restart + +1. Worker receives SIGTERM from Kubernetes +2. Endpoints are immediately invalidated (no new requests) +3. In-flight requests complete or migrate (based on configuration) +4. Resources are cleaned up +5. Pod restarts with fresh state + +### Worker Crash (Unexpected) + +1. etcd lease expires (TTL-based detection) +2. Client discovers endpoint removal via etcd watch +3. New requests route to remaining healthy workers +4. In-flight requests on crashed worker are migrated (if enabled) + +### Network Partition + +1. Worker loses connectivity to etcd/NATS +2. Lease keep-alive fails, lease eventually expires +3. Worker is removed from service discovery +4. Traffic reroutes to reachable workers + +### GPU Failure + +1. Engine health check detects GPU error (XID, OOM, etc.) +2. Worker initiates graceful shutdown +3. Runtime is shut down, engine cleaned up +4. 
Process exits with code 1 for pod restart + +## Testing Fault Tolerance + +Dynamo includes a comprehensive testing framework for validating fault tolerance: + +- Request cancellation tests +- Migration tests with worker failures +- etcd HA failover tests +- Hardware fault injection (GPU XID, network partitions) + +See [Fault Tolerance Testing](testing.md) for details. + +## Related Documentation + +- [Observability](../observability/README.md) - Metrics and monitoring +- [Distributed Runtime](../design-docs/distributed-runtime.md) - Service discovery architecture +- [Event Plane](../design-docs/event-plane.md) - etcd and NATS coordination diff --git a/fern/pages/fault-tolerance/graceful-shutdown.md b/fern/pages/fault-tolerance/graceful-shutdown.md new file mode 100644 index 00000000000..8c52dbce9f1 --- /dev/null +++ b/fern/pages/fault-tolerance/graceful-shutdown.md @@ -0,0 +1,261 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Graceful Shutdown" +--- + +This document describes how Dynamo components handle shutdown signals to ensure in-flight requests complete successfully and resources are properly cleaned up. + +## Overview + +Graceful shutdown in Dynamo ensures that: + +1. **No new requests are accepted** - Endpoints are immediately invalidated +2. **In-flight requests complete** - Existing requests finish processing (configurable) +3. **Resources are cleaned up** - Engines, connections, and temporary files are released +4. 
**Pods restart cleanly** - Exit codes signal Kubernetes for proper restart behavior + +## Signal Handling + +All Dynamo components handle Unix signals for graceful shutdown: + +| Signal | Trigger | Behavior | +|--------|---------|----------| +| `SIGTERM` | Kubernetes pod termination | Graceful shutdown initiated | +| `SIGINT` | Ctrl+C / manual interrupt | Graceful shutdown initiated | + +### Implementation + +Each component registers signal handlers at startup: + +```python +def signal_handler(): + asyncio.create_task(graceful_shutdown(runtime)) + +for sig in (signal.SIGTERM, signal.SIGINT): + loop.add_signal_handler(sig, signal_handler) +``` + +The `graceful_shutdown()` function: +1. Logs the shutdown signal +2. Calls `runtime.shutdown()` to invalidate endpoints +3. Waits for in-flight requests (based on configuration) +4. Returns to allow cleanup to proceed + +## Endpoint Draining + +When `runtime.shutdown()` is called, endpoints are immediately invalidated so no new requests are accepted. The behavior for in-flight requests depends on the `graceful_shutdown` parameter when serving the endpoint. 
+ +### Configuration + +When registering an endpoint, the `graceful_shutdown` parameter controls draining behavior: + +```python +generate_endpoint.serve_endpoint( + handler.generate, + graceful_shutdown=True, # Wait for all requests to finish + metrics_labels=[("model", model_name)], + health_check_payload=health_check_payload, +) +``` + +| `graceful_shutdown` | Behavior | +|---------------------|----------| +| `True` | Wait for all in-flight requests to complete before returning | +| `False` | Return immediately without waiting for requests | + +### Component-Specific Behavior + +| Component | Default Behavior | Rationale | +|-----------|------------------|-----------| +| **Frontend** | N/A (HTTP server) | HTTP server handles its own shutdown | +| **Prefill Workers** | `graceful_shutdown=True` | Prefill operations must complete to avoid wasted computation | +| **Decode Workers** | Conditional | If migration is enabled (`migration_limit > 0`), shutdown immediately to allow migration; otherwise wait | +| **Router** | `graceful_shutdown=True` | Ensure routing decisions complete | + +### Decode Worker Migration Integration + +Decode workers use conditional draining based on whether request migration is supported: + +```python +generate_endpoint.serve_endpoint( + handler.generate, + graceful_shutdown=config.migration_limit <= 0, # If no migration, wait for requests + ... 
+) +``` + +When `migration_limit > 0`: +- Worker shuts down immediately (`graceful_shutdown=False`) +- In-flight requests are migrated to healthy workers +- No request loss occurs + +When `migration_limit <= 0`: +- Worker waits for in-flight requests (`graceful_shutdown=True`) +- Migration is not available +- Requests complete on the shutting-down worker + +## Resource Cleanup + +After endpoint draining, components clean up their resources in `finally` blocks: + +### vLLM Worker Cleanup + +```python +finally: + logger.debug("Cleaning up worker") + handler.cleanup() +``` + +The handler's `cleanup()` method: +- Removes temporary directories (LoRA adapters, etc.) +- Releases engine resources + +### SGLang Worker Cleanup + +```python +def cleanup(self) -> None: + # Cancel pending consume tasks + for task in self._consume_tasks: + if not task.done(): + task.cancel() + self._consume_tasks.clear() + + # Shutdown engine + self.engine.shutdown() +``` + +### TensorRT-LLM Worker Cleanup + +```python +async def cleanup(self): + if self._llm: + try: + self._llm.shutdown() + except Exception as e: + logging.error(f"Error during cleanup: {e}") + finally: + self._llm = None +``` + +## Error-Initiated Shutdown + +Workers can initiate graceful shutdown when fatal errors occur: + +### Engine Health Monitoring (vLLM) + +The `VllmEngineMonitor` continuously checks engine health: + +```python +async def _check_engine_health(self): + while True: + try: + await self.engine_client.check_health() + await asyncio.sleep(HEALTH_CHECK_INTERVAL) # 2 seconds + except EngineDeadError as e: + logger.error(f"Health check failed: {e}") + self._shutdown_engine() + self.runtime.shutdown() + os._exit(1) +``` + +Configuration: +- `HEALTH_CHECK_INTERVAL`: 2 seconds between checks +- `ENGINE_SHUTDOWN_TIMEOUT`: 30 seconds max for engine shutdown + +### Fatal Error Handling (TensorRT-LLM) + +```python +async def _initiate_shutdown(self, error: Exception): + logging.warning(f"Initiating graceful shutdown due 
to: {error}") + + try: + if self.runtime: + self.runtime.shutdown() + if self.engine: + await self.engine.cleanup() + except Exception as cleanup_error: + logging.error(f"Error during graceful shutdown: {cleanup_error}") + finally: + logging.critical("Forcing process exit for restart") + os._exit(1) +``` + +## Kubernetes Integration + +### Pod Termination Flow + +1. Kubernetes sends `SIGTERM` to the pod +2. Dynamo initiates graceful shutdown +3. Pod has `terminationGracePeriodSeconds` to complete (default: 30s) +4. If not terminated, Kubernetes sends `SIGKILL` + +### Recommended Configuration + +For production deployments, configure adequate termination grace period: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +spec: + services: + VllmWorker: + extraPodSpec: + terminationGracePeriodSeconds: 60 # Allow time for request draining +``` + +### Health Check Integration + +Kubernetes uses health endpoints to determine pod readiness: + +- **During shutdown**: Endpoints become unavailable +- **Readiness probe fails**: Traffic stops routing to the pod +- **Graceful draining**: Existing requests complete + +## Best Practices + +### 1. Set Appropriate Grace Periods + +Match `terminationGracePeriodSeconds` to your expected request completion time: +- Short requests (< 10s): 30s grace period +- Long generation (> 30s): 120s+ grace period + +### 2. Enable Request Migration for Decode Workers + +If using disaggregated serving, enable migration for decode workers: + +```python +--migration-limit 3 # Allow up to 3 migration attempts +``` + +This allows immediate shutdown while preserving request state. + +### 3. Monitor Shutdown Metrics + +Track shutdown behavior via logs: + +``` +INFO Received shutdown signal, shutting down DistributedRuntime +INFO DistributedRuntime shutdown complete +DEBUG Cleaning up worker +``` + +### 4. 
Handle Cleanup Errors + +Ensure cleanup methods handle errors gracefully: + +```python +def cleanup(self): + for resource in self.resources: + try: + resource.cleanup() + except Exception as e: + logger.warning(f"Cleanup failed: {e}") + # Continue with other resources +``` + +## Related Documentation + +- [Request Migration](request-migration.md) - How requests migrate during shutdown +- [Request Cancellation](request-cancellation.md) - Canceling in-flight requests +- [Health Checks](../observability/health-checks.md) - Liveness and readiness probes diff --git a/fern/pages/fault-tolerance/request-cancellation.md b/fern/pages/fault-tolerance/request-cancellation.md new file mode 100644 index 00000000000..30124d1d9b8 --- /dev/null +++ b/fern/pages/fault-tolerance/request-cancellation.md @@ -0,0 +1,92 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Request Cancellation Architecture" +--- + +This document describes how Dynamo implements request cancellation to cancel in-flight requests between Dynamo workers. Request cancellation allows in-flight requests to terminate early, saving computational resources that would otherwise be spent on responses that are no longer needed. + +## AsyncEngineContext Trait + +At the core of Dynamo's request cancellation system is the `AsyncEngineContext` trait. This trait is associated with every request stream and provides lifecycle management for async operations, including stream identification, graceful shutdown capabilities, and immediate termination capabilities. + +### Key Methods + +#### Identification +- **`id()`**: Returns the unique identifier for the stream. This ID is set by the user for request identification, and the same ID can be used for sub-requests to associate them with the original user request. 
+ +#### Status Checking +- **`is_stopped()`**: Returns `true` if graceful cancellation has been requested via `stop_generating()`. This represents a signal to the worker that the request has been cancelled and it should return early. +- **`is_killed()`**: Returns `true` if a hard stop has been issued via `kill()`. This typically indicates that the network connection between client and server has been cut or an immediate termination is required. + +#### Async Status Monitoring +- **`stopped()`**: An async method that completes when the context becomes stopped. If already stopped, returns immediately. +- **`killed()`**: An async method that completes when the context becomes killed. If already killed, returns immediately. + +#### Cancellation Control +- **`stop_generating()`**: The recommended method for cancelling a request. This informs the engine to stop producing results for the stream gracefully. This method is idempotent and does not invalidate results currently in the stream. +- **`stop()`**: Alias for `stop_generating()`. +- **`kill()`**: Extends `stop_generating()` but also indicates a preference to terminate without draining remaining items in the stream. This is implementation-specific and may not be supported by all engines. + +#### Child Request Management +- **`link_child(child: Arc<dyn AsyncEngineContext>)`**: Links a child `AsyncEngineContext` to this context. When `stop_generating()`, `stop()`, or `kill()` is called on the parent context, the same method is automatically called on all linked child contexts in the order they were linked. This is especially useful in disaggregated serving scenarios where a frontend receives cancellation notification and needs to cancel requests to workers, and the worker can then cancel its sub-requests (e.g., remote prefill operations). + +### Thread Safety + +The `AsyncEngineContext` trait ensures thread-safety with `Send + Sync` bounds, allowing safe concurrent access across multiple threads and async tasks. 
+ +## Python Bindings + +The `AsyncEngineContext` functionality is exposed to Python through the `Context` class, which provides a largely one-to-one mapping from Rust methods to Python methods. + +### Python Context Class + +The Python `Context` class wraps the Rust `AsyncEngineContext` and exposes the following methods: + +- **`id()`**: Returns the unique identifier for the context +- **`is_stopped()`**: Synchronous method equivalent to the Rust `is_stopped()` +- **`is_killed()`**: Synchronous method equivalent to the Rust `is_killed()` +- **`stop_generating()`**: Issues a stop generating signal, equivalent to the Rust method +- **`async_killed_or_stopped()`**: An async method that completes when the context becomes either killed or stopped, whichever happens first. This combines the functionality of the Rust `killed()` and `stopped()` async methods using `tokio::select!`. + +For a working example of request cancellation, see the [cancellation demo](https://github.com/ai-dynamo/dynamo/tree/main/examples/custom_backend/cancellation/README.md). + +### Context Usage in Python + +The context is available optionally in both incoming and outgoing request scenarios: + +#### Incoming Requests +For incoming requests, the generate method may optionally accept a `context` argument after the `request` argument. If the `context` parameter is specified in the method signature, it will receive the context object of the incoming request. Request handlers can: + +- Check for cancellation synchronously using `context.is_stopped()` before beginning expensive operations +- Listen for cancellation asynchronously using `await context.async_killed_or_stopped()` + +Example: +```python +async def generate(self, request, context): + for i in range(1000): + # Check for cancellation before expensive work + if context.is_stopped(): + raise asyncio.CancelledError + + # Perform work... 
+ await expensive_computation() + yield result +``` + +#### Outgoing Requests +For outgoing requests, Python scripts may optionally provide a context object to outgoing runtime endpoint client router operations (such as `generate`, `round_robin`, `random`, `direct` methods) as a keyword argument. The script can cancel the outgoing request via the provided context object. + +This is especially useful when child outgoing requests need to be cancelled when the parent incoming request is cancelled. In such cases, the script can simply pass the incoming context object to the outgoing request, automatically linking the cancellation behavior. + +Example: +```python +async def generate(self, request, context): + # Forward the incoming context to outgoing request + # If the incoming request is cancelled, the outgoing request will be too + stream = await self.client.generate(request, context=context) + async for response in stream: + yield response +``` + +This design enables seamless cancellation propagation through multi-tier request chains, ensuring that when a client cancels a request, all associated sub-requests are automatically cancelled, saving computational resources across the entire request pipeline. diff --git a/fern/pages/fault-tolerance/request-migration.md b/fern/pages/fault-tolerance/request-migration.md new file mode 100644 index 00000000000..394b5a11f79 --- /dev/null +++ b/fern/pages/fault-tolerance/request-migration.md @@ -0,0 +1,140 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Request Migration Architecture" +--- + +This document describes how Dynamo implements request migration to handle worker failures gracefully during LLM text generation. Request migration allows in-progress requests to continue on different workers when the original worker becomes unavailable, providing fault tolerance and improved user experience. 
+ +## Overview + +Request migration is implemented through a Migration operator that sits in the LLM processing pipeline between the Backend operator and the service backend. When a worker fails during request processing, the migration system preserves the partial generation state and recreates the request on a new worker to continue from where the previous worker left off. + +## Architecture Components + +### Migrator + +The migration system is integrated into the LLM processing pipeline between the frontend preprocessing and the actual service backends. This positioning allows it to intercept all communication flows and manage failure scenarios transparently. + +Key responsibilities: +- Intercepts all requests and responses flowing through the pipeline +- Detects worker failure scenarios through error pattern matching +- Manages retry logic with configurable migration limits +- Tracks partial response state for seamless continuation + +### Migration Limit Configuration + +Each model can be configured with a migration limit parameter that specifies the maximum number of times a request can be migrated to another worker: + +- Default behavior: no migration allowed +- Can be set independently for different engine types +- Applicable to LLM worker nodes that perform inference +- Allows engines to override user-specified limits for compatibility + +## Token State Tracking and Request Migration + +The core of the migration system is the ability to preserve and continue partial generations through token state management. This ensures that when a worker fails mid-generation, the new worker can seamlessly continue from the exact point of failure. + +### Token Accumulation Process + +When a request is being processed and responses are flowing back from a worker, the migration system tracks every token that has been successfully generated: + +1. **Initial Request State**: The system starts with the original preprocessed request containing the initial prompt tokens. + +2. 
**Response Tracking**: As each response arrives from the worker, the migration system extracts the newly generated tokens and appends them to the request's token sequence. This creates a complete record of all tokens that have been generated. + +3. **Token Count Management**: The system also updates the remaining token budget to reflect the number of tokens already generated, ensuring that the total generation stays within the originally requested limits. + +### Migration Trigger Scenarios + +The migration system handles two distinct failure scenarios: + +#### 1. New Request Migration (Initial Connection Failure) + +**Scenario**: Worker is unreachable when creating the initial connection. + +**Error Pattern**: Communication system reports chosen worker instance is unavailable. + +**Migration Process**: +- Detects connection failure during initial stream setup +- Decrements migration retry count +- Attempts to create a new stream with the original request +- No partial state to preserve since generation hasn't started + +#### 2. Ongoing Request Migration (Mid-Stream Disconnection) + +**Scenario**: Connection lost during active generation after partial responses have been received. + +**Error Pattern**: Stream termination detected before generation completion. + +**Migration Process**: + +1. **Failure Detection**: The system detects the stream disconnection through error monitoring. + +2. **State Preservation**: At this point, the request's token sequence contains both the original prompt tokens and all successfully generated tokens from the failed worker. + +3. **New Stream Creation**: A fresh stream is created with the accumulated request state, ensuring the new worker has complete context. + +4. **Continuation**: The new worker receives the request with the full token context and continues generation from the exact point where the previous worker left off. 
+ +### Seamless Token Flow and Request State Evolution + +From the client's perspective, the token stream appears continuous and uninterrupted. The client receives tokens from the first worker until failure occurs, then seamlessly continues receiving tokens from the backup worker without any indication of the underlying migration. + +The request state evolves dynamically during processing. Initially, the request contains only the original prompt tokens. As generation proceeds, each successfully generated token is appended to the request's token sequence, creating a growing record of the complete conversation context. + +When a migration occurs, this accumulated state is transferred to the new worker, which uses it to reconstruct the complete context. The new worker then continues generation as if it had been processing the request from the beginning, but starting from the current position in the sequence. + +The migration is transparent because: +1. No tokens are lost or duplicated during the transition +2. The new worker has complete context via the accumulated token sequence +3. Generation continues from the exact failure point +4. Response streaming maintains consistent format and timing + +This token accumulation mechanism ensures that migrations are truly seamless, preserving all computational work and maintaining generation quality across worker transitions. + +## Benefits + +1. **Fault Tolerance**: System continues operating during individual worker failures +2. **Resource Efficiency**: Partial generations are preserved rather than restarted +3. **Seamless User Experience**: Users experience no interruption during worker failures +4. **Configurable Behavior**: Migration limits allow tuning based on deployment requirements +5. 
**No Token Loss**: Complete preservation of generation state across migrations + +## Design Considerations + +The migration system is designed with several important architectural considerations: + +**Engine Compatibility**: Different LLM engines may have varying capabilities for handling migrated requests. The system allows engines to override migration settings to ensure compatibility and correctness. + +**Multi-Model Support**: Since a frontend may serve multiple models simultaneously, migration limits can be configured at the engine level, providing flexibility for different model types with varying reliability characteristics. + +**State Management**: The system carefully tracks not only token sequences but also metadata such as remaining token budgets, stop conditions, and sampling parameters to ensure complete state preservation. + +**Error Handling**: The migration system distinguishes between different types of failures and applies appropriate recovery strategies for each scenario. + +## Monitoring and Metrics + +The migration system exposes Prometheus metrics to monitor migration activity. 
These metrics are available on the frontend's `/metrics` endpoint (default port 8000): + +- `dynamo_frontend_model_migration_total`: Counter tracking the total number of request migrations + - Labels: + - `model`: The model name being served + - `migration_type`: Either `new_request` (initial connection failure) or `ongoing_request` (mid-stream disconnection) + +**Example metrics output:** +``` +dynamo_frontend_model_migration_total{migration_type="ongoing_request",model="Qwen/Qwen3-0.6B"} 3 +dynamo_frontend_model_migration_total{migration_type="new_request",model="Qwen/Qwen3-0.6B"} 1 +``` + +These metrics can be used to: +- Monitor worker reliability and failure patterns +- Alert on excessive migration rates indicating infrastructure issues +- Track the effectiveness of fault tolerance mechanisms + +For more information on Dynamo metrics, see the [Metrics documentation](../observability/metrics.md). + +## Operational Impact + +Request migration fundamentally changes how the system handles failures, moving from a "fail-fast" approach to a "graceful degradation" model. This architectural shift enables higher availability and better resource utilization while maintaining the same external API contract for clients. diff --git a/fern/pages/fault-tolerance/request-rejection.md b/fern/pages/fault-tolerance/request-rejection.md new file mode 100644 index 00000000000..7c7a8ac1bc6 --- /dev/null +++ b/fern/pages/fault-tolerance/request-rejection.md @@ -0,0 +1,315 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Request Rejection (Load Shedding)" +--- + +This document describes how Dynamo implements request rejection to prevent system overload and maintain service stability under high load conditions. + +## Overview + +Request rejection (also known as load shedding) is a fault tolerance mechanism that proactively rejects new requests when workers are overloaded. 
This prevents: + +- Cascading failures from resource exhaustion +- Degraded latency for all requests +- Out-of-memory conditions on GPU workers + +When all workers exceed their configured busy thresholds, new requests receive an HTTP 503 (Service Unavailable) response, signaling clients to retry later. + +## Architecture + +``` + ┌─────────────────┐ + │ Worker Monitor │ + │ (Background) │ + └────────┬────────┘ + │ Updates busy list + ▼ +┌──────────┐ ┌──────────┐ ┌─────────────────────┐ ┌──────────┐ +│ Client │───▶│ Frontend │───▶│ Push Router │───▶│ Worker │ +└──────────┘ └──────────┘ │ (checks busy list) │ └──────────┘ + └─────────────────────┘ + │ + │ If all workers busy + ▼ + ┌─────────────────────┐ + │ HTTP 503 Error │ + │ "All workers busy" │ + └─────────────────────┘ +``` + +## Configuration + +### Frontend Arguments + +Configure busy thresholds when starting the frontend: + +```bash +python -m dynamo.frontend \ + --active-decode-blocks-threshold 0.85 \ + --active-prefill-tokens-threshold 10000 +``` + +| Argument | Type | Description | +|----------|------|-------------| +| `--active-decode-blocks-threshold` | float (0.0-1.0) | KV cache block utilization threshold | +| `--active-prefill-tokens-threshold` | int | Prefill token count threshold | + +### Dynamic Configuration via API + +Thresholds can be adjusted at runtime via the `/busy_threshold` endpoint: + +#### Set Thresholds + +```bash +curl -X POST http://localhost:8000/busy_threshold \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "active_decode_blocks_threshold": 0.85, + "active_prefill_tokens_threshold": 10000 + }' +``` + +#### Get Current Thresholds + +```bash +curl http://localhost:8000/busy_threshold +``` + +Response: +```json +{ + "thresholds": [ + { + "model": "Qwen/Qwen3-0.6B", + "active_decode_blocks_threshold": 0.85, + "active_prefill_tokens_threshold": 10000 + } + ] +} +``` + +## Busy Detection Logic + +Workers are marked as "busy" based on a dual-threshold 
system. A worker is considered busy when **either** threshold is exceeded. + +### KV Cache Block Threshold + +Monitors the percentage of KV cache blocks in use: + +``` +busy = active_decode_blocks / kv_total_blocks > threshold +``` + +Example: With `active_decode_blocks_threshold=0.85`, a worker using 87% of its KV cache blocks is marked busy. + +### Prefill Token Threshold + +Monitors the number of tokens currently being prefilled: + +``` +busy = active_prefill_tokens > threshold +``` + +Example: With `active_prefill_tokens_threshold=10000`, a worker prefilling 12,000 tokens is marked busy. + +### Data-Parallel Rank Aggregation + +For workers with multiple data-parallel ranks (tensor parallelism), the worker is only marked busy if **ALL** ranks are busy: + +```python +def is_busy(worker): + return all(rank.is_busy() for rank in worker.dp_ranks) +``` + +This prevents false positives when only some ranks are temporarily loaded. + +## Worker Load Monitoring + +The `KvWorkerMonitor` runs as a background task that: + +1. Subscribes to KV cache metrics events from workers +2. Maintains load state for each worker instance +3. Recalculates busy instances when metrics change +4. Updates the router with the current busy list + +### Metrics Collected + +Workers publish these metrics for monitoring: + +| Metric | Description | +|--------|-------------| +| `active_decode_blocks` | Number of KV cache blocks currently in use | +| `kv_total_blocks` | Total KV cache blocks available | +| `active_prefill_tokens` | Number of tokens currently being prefilled | + +## Rejection Behavior + +### Request Flow + +1. Request arrives at frontend +2. Push router checks if busy threshold is configured +3. If configured, router retrieves list of free (non-busy) instances +4. 
If no free instances exist (but instances are registered): + - Request is rejected with `PipelineError::ServiceOverloaded` + - HTTP 503 response is returned to client + +### Error Response + +When requests are rejected, clients receive: + +```http +HTTP/1.1 503 Service Unavailable +Content-Type: application/json + +{ + "message": "Service temporarily unavailable: All workers are busy, please retry later", + "type": "service_unavailable", + "code": 503 +} +``` + +### Client Retry Strategy + +Clients should implement exponential backoff when receiving 503 responses: + +```python +import time +import random + +def send_with_retry(request, max_retries=5): + for attempt in range(max_retries): + response = send_request(request) + if response.status_code != 503: + return response + + # Exponential backoff with jitter + wait_time = min(60, (2 ** attempt) + random.uniform(0, 1)) + time.sleep(wait_time) + + raise Exception("Max retries exceeded") +``` + +## Monitoring + +### Prometheus Metrics + +Track rejection behavior with these metrics: + +| Metric | Type | Description | +|--------|------|-------------| +| `dynamo_tasks_rejected_total` | Counter | Total number of rejected tasks | +| `dynamo_queued_requests` | Gauge | Requests waiting in HTTP queue | + +### Example Prometheus Queries + +```promql +# Rejection rate over 5 minutes +rate(dynamo_tasks_rejected_total[5m]) + +# Percentage of requests rejected +sum(rate(dynamo_tasks_rejected_total[5m])) / +sum(rate(dynamo_tasks_issued_total[5m])) * 100 +``` + +### Grafana Alerting + +Example alert for high rejection rate: + +```yaml +alert: HighRequestRejectionRate +expr: | + sum(rate(dynamo_tasks_rejected_total[5m])) / + sum(rate(dynamo_tasks_issued_total[5m])) > 0.1 +for: 5m +labels: + severity: warning +annotations: + summary: "High request rejection rate" + description: "More than 10% of requests are being rejected" +``` + +## Tuning Thresholds + +### Conservative Settings (Latency-Focused) + +For applications prioritizing 
low latency: + +```bash +--active-decode-blocks-threshold 0.70 +--active-prefill-tokens-threshold 5000 +``` + +- Rejects earlier, before workers become fully loaded +- Maintains lower queue depths +- Better tail latencies + +### Aggressive Settings (Throughput-Focused) + +For applications prioritizing throughput: + +```bash +--active-decode-blocks-threshold 0.95 +--active-prefill-tokens-threshold 20000 +``` + +- Allows higher worker utilization +- May increase latency variability +- Better overall throughput + +### Disabled (No Rejection) + +To disable request rejection entirely: + +```bash +# Simply don't set the threshold arguments +python -m dynamo.frontend +``` + +Without thresholds configured, all requests are accepted regardless of worker load. + +## Best Practices + +### 1. Start Conservative, Then Tune + +Begin with conservative thresholds and increase based on observed behavior: + +```bash +# Start here +--active-decode-blocks-threshold 0.75 + +# Increase if rejection rate is too high +--active-decode-blocks-threshold 0.85 +``` + +### 2. Monitor Before Enabling + +Observe worker load patterns before setting thresholds: + +```bash +# Watch KV cache utilization +watch -n 1 'curl -s localhost:8000/metrics | grep kv_blocks' +``` + +### 3. Use Both Thresholds for Disaggregated Serving + +In disaggregated deployments: +- Use `active_prefill_tokens_threshold` for prefill workers +- Use `active_decode_blocks_threshold` for decode workers + +### 4. 
Coordinate with Autoscaling + +If using Kubernetes HPA, ensure rejection thresholds trigger before autoscaling: + +```yaml +# HPA triggers at 70% utilization +# Rejection at 85% provides buffer +--active-decode-blocks-threshold 0.85 +``` + +## Related Documentation + +- [Health Checks](../observability/health-checks.md) - Worker health monitoring +- [Metrics](../observability/metrics.md) - Available Prometheus metrics +- [Request Migration](request-migration.md) - Handling failed requests diff --git a/fern/pages/fault-tolerance/testing.md b/fern/pages/fault-tolerance/testing.md new file mode 100644 index 00000000000..4def19cbb2c --- /dev/null +++ b/fern/pages/fault-tolerance/testing.md @@ -0,0 +1,490 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Fault Tolerance Testing" +--- + +This document describes the test infrastructure for validating Dynamo's fault tolerance mechanisms. The testing framework supports request cancellation, migration, etcd HA, and hardware fault injection scenarios. 
+ +## Overview + +Dynamo's fault tolerance test suite is located in `tests/fault_tolerance/` and includes: + +| Test Category | Location | Purpose | +|---------------|----------|---------| +| Cancellation | `cancellation/` | Request cancellation during in-flight operations | +| Migration | `migration/` | Request migration when workers fail | +| etcd HA | `etcd_ha/` | etcd failover and recovery | +| Hardware | `hardware/` | GPU and network fault injection | +| Deployment | `deploy/` | End-to-end deployment testing | + +## Test Directory Structure + +``` +tests/fault_tolerance/ +├── cancellation/ +│ ├── test_vllm.py +│ ├── test_trtllm.py +│ ├── test_sglang.py +│ └── utils.py +├── migration/ +│ ├── test_vllm.py +│ ├── test_trtllm.py +│ ├── test_sglang.py +│ └── utils.py +├── etcd_ha/ +│ ├── test_vllm.py +│ ├── test_trtllm.py +│ ├── test_sglang.py +│ └── utils.py +├── hardware/ +│ └── fault_injection_service/ +│ ├── api_service/ +│ └── agents/ +├── deploy/ +│ ├── test_deployment.py +│ ├── scenarios.py +│ ├── base_checker.py +│ └── ... +└── client.py +``` + +## Request Cancellation Tests + +Test that in-flight requests can be properly canceled. 
+ +### Running Cancellation Tests + +```bash +# Run all cancellation tests +pytest tests/fault_tolerance/cancellation/ -v + +# Run for specific backend +pytest tests/fault_tolerance/cancellation/test_vllm.py -v +``` + +### Cancellation Test Utilities + +The `cancellation/utils.py` module provides: + +#### CancellableRequest + +Thread-safe request cancellation via TCP socket manipulation: + +```python +from tests.fault_tolerance.cancellation.utils import CancellableRequest + +request = CancellableRequest() + +# Send request in separate thread +thread = Thread(target=send_request, args=(request,)) +thread.start() + +# Cancel after some time +time.sleep(1) +request.cancel() # Closes underlying socket +``` + +#### send_completion_request / send_chat_completion_request + +Send cancellable completion requests: + +```python +from tests.fault_tolerance.cancellation.utils import ( + send_completion_request, + send_chat_completion_request +) + +# Non-streaming +response = send_completion_request( + base_url="http://localhost:8000", + model="Qwen/Qwen3-0.6B", + prompt="Hello, world!", + max_tokens=100 +) + +# Streaming with cancellation +responses = send_chat_completion_request( + base_url="http://localhost:8000", + model="Qwen/Qwen3-0.6B", + messages=[{"role": "user", "content": "Hello!"}], + stream=True, + cancellable_request=request +) +``` + +#### poll_for_pattern + +Wait for specific patterns in logs: + +```python +from tests.fault_tolerance.cancellation.utils import poll_for_pattern + +# Wait for cancellation confirmation +found = poll_for_pattern( + log_file="/var/log/dynamo/worker.log", + pattern="Request cancelled", + timeout=30, + interval=0.5 +) +``` + +## Migration Tests + +Test that requests migrate to healthy workers when failures occur. 
+ +### Running Migration Tests + +```bash +# Run all migration tests +pytest tests/fault_tolerance/migration/ -v + +# Run for specific backend +pytest tests/fault_tolerance/migration/test_vllm.py -v +``` + +### Migration Test Utilities + +The `migration/utils.py` module provides: + +- Frontend wrapper with configurable request planes +- Long-running request spawning for migration scenarios +- Health check disabling for controlled testing + +### Example Migration Test + +```python +def test_migration_on_worker_failure(): + # Start deployment with 2 workers + deployment = start_deployment(workers=2) + + # Send long-running request + request_thread = spawn_long_request(max_tokens=1000) + + # Kill one worker mid-generation + kill_worker(deployment.workers[0]) + + # Verify request completes on remaining worker + response = request_thread.join() + assert response.status_code == 200 + assert len(response.tokens) > 0 +``` + +## etcd HA Tests + +Test system behavior during etcd failures and recovery. + +### Running etcd HA Tests + +```bash +pytest tests/fault_tolerance/etcd_ha/ -v +``` + +### Test Scenarios + +- **Leader failover**: etcd leader node fails, cluster elects new leader +- **Network partition**: etcd node becomes unreachable +- **Recovery**: System recovers after etcd becomes available + +## Hardware Fault Injection + +The fault injection service enables testing under simulated hardware failures. 
+ +### Fault Injection Service + +Located at `tests/fault_tolerance/hardware/fault_injection_service/`, this FastAPI service orchestrates fault injection: + +```bash +# Start the fault injection service +cd tests/fault_tolerance/hardware/fault_injection_service +python -m api_service.main +``` + +### Supported Fault Types + +#### GPU Faults + +| Fault Type | Description | +|------------|-------------| +| `XID_ERROR` | Simulate GPU XID error (various codes) | +| `THROTTLE` | GPU thermal throttling | +| `MEMORY_PRESSURE` | GPU memory exhaustion | +| `OVERHEAT` | GPU overheating condition | +| `COMPUTE_OVERLOAD` | GPU compute saturation | + +#### Network Faults + +| Fault Type | Description | +|------------|-------------| +| `FRONTEND_WORKER` | Partition between frontend and workers | +| `WORKER_NATS` | Partition between workers and NATS | +| `WORKER_WORKER` | Partition between workers | +| `CUSTOM` | Custom network partition | + +### Fault Injection API + +#### Inject GPU Fault + +```bash +curl -X POST http://localhost:8080/api/v1/faults/gpu/inject \ + -H "Content-Type: application/json" \ + -d '{ + "target_pod": "vllm-worker-0", + "fault_type": "XID_ERROR", + "severity": "HIGH" + }' +``` + +#### Inject Specific XID Error + +```bash +# Inject XID 79 (GPU memory page fault) +curl -X POST http://localhost:8080/api/v1/faults/gpu/inject/xid-79 \ + -H "Content-Type: application/json" \ + -d '{"target_pod": "vllm-worker-0"}' +``` + +Supported XID codes: 43, 48, 74, 79, 94, 95, 119, 120 + +#### Inject Network Partition + +```bash +curl -X POST http://localhost:8080/api/v1/faults/network/inject \ + -H "Content-Type: application/json" \ + -d '{ + "partition_type": "FRONTEND_WORKER", + "duration_seconds": 30 + }' +``` + +#### Recover from Fault + +```bash +curl -X POST http://localhost:8080/api/v1/faults/{fault_id}/recover +``` + +#### List Active Faults + +```bash +curl http://localhost:8080/api/v1/faults +``` + +### GPU Fault Injector Agent + +The GPU fault injector runs as 
a DaemonSet on worker nodes: + +```yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gpu-fault-injector +spec: + selector: + matchLabels: + app: gpu-fault-injector + template: + spec: + containers: + - name: agent + image: dynamo/gpu-fault-injector:latest + securityContext: + privileged: true + volumeMounts: + - name: dev + mountPath: /dev +``` + +The agent injects fake XID messages via `/dev/kmsg` to trigger NVSentinel detection. + +## Deployment Testing Framework + +The `deploy/` directory contains an end-to-end testing framework. + +### Test Phases + +Tests run through three phases: + +| Phase | Description | +|-------|-------------| +| `STANDARD` | Baseline performance under normal conditions | +| `OVERFLOW` | System behavior during fault/overload | +| `RECOVERY` | System recovery after fault resolution | + +### Scenario Configuration + +Define test scenarios in `scenarios.py`: + +```python +from tests.fault_tolerance.deploy.scenarios import Scenario, Load, Failure + +scenario = Scenario( + name="worker_failure_migration", + backend="vllm", + load=Load( + clients=10, + requests_per_client=100, + max_tokens=256 + ), + failure=Failure( + type="pod_kill", + target="vllm-worker-0", + trigger_after_requests=50 + ) +) +``` + +### Running Deployment Tests + +```bash +# Run all deployment tests +pytest tests/fault_tolerance/deploy/test_deployment.py -v + +# Run specific scenario +pytest tests/fault_tolerance/deploy/test_deployment.py::test_worker_failure -v +``` + +### Validation Checkers + +The framework includes pluggable validators: + +```python +from tests.fault_tolerance.deploy.base_checker import BaseChecker, ValidationContext + +class MigrationChecker(BaseChecker): + def check(self, context: ValidationContext) -> bool: + # Verify migrations occurred + migrations = context.metrics.get("migrations_total", 0) + return migrations > 0 +``` + +### Results Parsing + +Parse test results for analysis: + +```python +from 
tests.fault_tolerance.deploy.parse_results import process_overflow_recovery_test + +results = process_overflow_recovery_test(log_dir="/path/to/logs") +print(f"Success rate: {results['success_rate']}") +print(f"P99 latency: {results['p99_latency_ms']}ms") +``` + +## Client Utilities + +The `client.py` module provides shared client functionality: + +### Multi-Threaded Load Generation + +```python +from tests.fault_tolerance.client import client + +# Generate load with multiple clients +results = client( + base_url="http://localhost:8000", + num_clients=10, + requests_per_client=100, + model="Qwen/Qwen3-0.6B", + max_tokens=256, + log_dir="/tmp/test_logs" +) +``` + +### Request Options + +| Parameter | Description | +|-----------|-------------| +| `base_url` | Frontend URL | +| `num_clients` | Number of concurrent clients | +| `requests_per_client` | Requests per client | +| `model` | Model name | +| `max_tokens` | Max tokens per request | +| `log_dir` | Directory for client logs | +| `endpoint` | `completions` or `chat/completions` | + +## Running the Full Test Suite + +### Prerequisites + +1. Kubernetes cluster with GPU nodes +2. Dynamo deployment +3. etcd cluster (for HA tests) +4. Fault injection service (for hardware tests) + +### Environment Setup + +```bash +export KUBECONFIG=/path/to/kubeconfig +export DYNAMO_NAMESPACE=dynamo-test +export FRONTEND_URL=http://localhost:8000 +``` + +### Run All Tests + +```bash +# Install test dependencies +pip install pytest pytest-asyncio + +# Run all fault tolerance tests +pytest tests/fault_tolerance/ -v --tb=short + +# Run with specific markers +pytest tests/fault_tolerance/ -v -m "not slow" +``` + +### Test Markers + +| Marker | Description | +|--------|-------------| +| `slow` | Long-running tests (> 5 minutes) | +| `gpu` | Requires GPU resources | +| `k8s` | Requires Kubernetes cluster | +| `etcd_ha` | Requires multi-node etcd | + +## Best Practices + +### 1. 
Isolate Test Environments + +Run fault tolerance tests in dedicated namespaces: + +```bash +kubectl create namespace dynamo-fault-test +``` + +### 2. Clean Up After Tests + +Ensure fault injection is recovered: + +```bash +# List and recover all active faults +curl http://localhost:8080/api/v1/faults | jq -r '.[].id' | \ + xargs -I {} curl -X POST http://localhost:8080/api/v1/faults/{}/recover +``` + +### 3. Collect Logs + +Preserve logs for debugging: + +```bash +pytest tests/fault_tolerance/ -v \ + --log-dir=/tmp/fault_test_logs \ + --capture=no +``` + +### 4. Monitor During Tests + +Watch system state during tests: + +```bash +# Terminal 1: Watch pods +watch kubectl get pods -n dynamo-test + +# Terminal 2: Watch metrics +watch 'curl -s localhost:8000/metrics | grep -E "(migration|rejection)"' +``` + +## Related Documentation + +- [Request Migration](request-migration.md) - Migration implementation details +- [Request Cancellation](request-cancellation.md) - Cancellation implementation +- [Health Checks](../observability/health-checks.md) - Health monitoring +- [Metrics](../observability/metrics.md) - Available metrics for monitoring diff --git a/fern/pages/frontends/kserve.md b/fern/pages/frontends/kserve.md new file mode 100644 index 00000000000..9455285b120 --- /dev/null +++ b/fern/pages/frontends/kserve.md @@ -0,0 +1,103 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KServe gRPC frontend" +--- + +## Motivation + +[KServe v2 API](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2) is one of the industry standard protocol for machine learning model inference. Triton inference server is one of the inference solutions that comply with KServe v2 API and it has gained a lot of adoption. To quickly enable Triton users to explore with Dynamo benefits, Dynamo provides a KServe gRPC frontend. 
+ +This documentation assumes readers are familiar with the usage of KServe v2 API and focuses on explaining the Dynamo parts that work together to support KServe API and how users may migrate existing KServe deployment to Dynamo. + +## Supported Endpoints + +* `ModelInfer` endpoint: KServe Standard endpoint as described [here](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#inference-1) +* `ModelStreamInfer` endpoint: Triton extension endpoint that provide bi-directional streaming version of the inference RPC to allow a sequence of inference requests/responses to be sent over a GRPC stream, as described [here](https://github.com/triton-inference-server/common/blob/main/protobuf/grpc_service.proto#L84-L92) +* `ModelMetadata` endpoint: KServe standard endpoint as described [here](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#model-metadata-1) +* `ModelConfig` endpoint: Triton extension endpoint as described [here](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_configuration.md) + +## Starting the Frontend + +To start the KServe frontend, run the below command +``` +python -m dynamo.frontend --kserve-grpc-server +``` + +## Registering a Backend + +Similar to HTTP frontend, the registered backend will be auto-discovered and added to the frontend list of serving model. To register a backend, the same `register_llm()` API will be used. Currently the frontend support serving of the following model type and model input combination: +* `ModelType::Completions` and `ModelInput::Text`: Combination for LLM backend that uses custom preprocessor +* `ModelType::Completions` and `ModelInput::Token`: Combination for LLM backend that uses Dynamo preprocessor (i.e. 
Dynamo vLLM / SGLang / TRTLLM backend) +* `ModelType::TensorBased` and `ModelInput::Tensor`: Combination for backend that is used for generic tensor based inference + +The first two combinations are backed by OpenAI Completions API, see [OpenAI Completions section](#openai-completions) for more detail. Whereas the last combination is most aligned with KServe API and the users can replace existing deployment with Dynamo once their backends implements adaptor for `NvCreateTensorRequest/NvCreateTensorResponse`, see [Tensor section](#tensor) for more detail: + +### OpenAI Completions + +Most of the Dynamo features are tailored for LLM inference and the combinations that are backed by OpenAI API can enable those features and are best suited for exploring those Dynamo features. However, this implies specific conversion between generic tensor based messages and OpenAI message and imposes specific structure of the KServe request message. + +#### Model Metadata / Config + +The metadata and config endpoint will report the registered backend to have the below, note that this is not the exact response. 
+``` +{ + name: $MODEL_NAME, + version: 1, + platform: "dynamo", + backend: "dynamo", # model config specific + inputs: [ + { + name: "text_input", + datatype: "BYTES", + shape: [1] + }, + { + name: "streaming", + datatype: "BOOL", + shape: [1], + optional: true + } + ] + outputs: [ + { + name: "text_output", + datatype: "BYTES", + shape: [-1] + }, + { + name: "finish_reason", + datatype: "BYTES", + shape: [-1], + optional: true + } + ] +} +``` + +#### Inference + +On receiving inference request, the following conversion will be performed: +* `text_input`: the element is expected to contain the user prompt string and will be converted to `prompt` field in OpenAI Completion request +* `streaming`: the element will be converted to `stream` field in OpenAI Completion request +On receiving model response, the following conversion will be performed: +* `text_output`: each element corresponds to one choice in OpenAI Completion response, and the content will be set to `text` of the choice. +* `finish_reason`: each element corresponds to one choice in OpenAI Completion response, and the content will be set to `finish_reason` of the choice. + +### Tensor + +This combination is used when the user is migrating an existing KServe based backend into Dynamo ecosystem. + +#### Model Metadata / Config + +When registering the backend, the backend must provide the model's metadata as tensor based deployment is generic and the frontend can't make any assumptions like for OpenAI Completions model. There are two methods to provide model metadata: +* [TensorModelConfig](https://github.com/ai-dynamo/dynamo/blob/main/lib/llm/src/protocols/tensor.rs): This is Dynamo defined structure for model metadata, the backend can provide the model metadata as shown in this [example](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/tests/test_tensor.py). 
For metadata provided in such way, the following field will be set to a fixed value: `version: 1`, `platform: "dynamo"`, `backend: "dynamo"`. Note that for model config endpoint, the rest of the fields will be set to their default values. +* [triton_model_config](https://github.com/ai-dynamo/dynamo/blob/main/lib/llm/src/protocols/tensor.rs): For users that already have Triton model config and require the full config to be returned for client side logic, they can set the config in `TensorModelConfig::triton_model_config` which will supersedes other fields in `TensorModelConfig` and be used for endpoint responses. `triton_model_config` is expected to be the serialized string of the `ModelConfig` protobuf message, see [echo_tensor_worker.py](https://github.com/ai-dynamo/dynamo/blob/main/tests/frontend/grpc/echo_tensor_worker.py) for example. + +#### Inference + +When receiving inference request, the backend will receive [NvCreateTensorRequest](https://github.com/ai-dynamo/dynamo/blob/main/lib/llm/src/protocols/tensor.rs) and be expected to return [NvCreateTensorResponse](https://github.com/ai-dynamo/dynamo/blob/main/lib/llm/src/protocols/tensor.rs), which are the mapping of ModelInferRequest / ModelInferResponse protobuf message in Dynamo. + +## Python Bindings + +The frontend may be started via Python binding, this is useful when integrating Dynamo in existing system that desire the frontend to be run in the same process with other components. See [server.py](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/kserve_grpc_service/server.py) for example. diff --git a/fern/pages/getting-started/examples.md b/fern/pages/getting-started/examples.md new file mode 100644 index 00000000000..f73cdc7ba8d --- /dev/null +++ b/fern/pages/getting-started/examples.md @@ -0,0 +1,50 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Examples" +--- + +Explore practical examples to get started with NVIDIA Dynamo. + +## Quick Start Examples + +The [examples directory](https://github.com/ai-dynamo/dynamo/tree/main/examples) in the Dynamo repository contains ready-to-run examples for various use cases. + +### Backend Examples + +| Backend | Description | Link | +|---------|-------------|------| +| **vLLM** | Run inference with vLLM backend | [examples/backends/vllm](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm) | +| **SGLang** | Run inference with SGLang backend | [examples/backends/sglang](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang) | +| **TensorRT-LLM** | Run inference with TensorRT-LLM backend | [examples/backends/trtllm](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm) | + +### Deployment Examples + +| Example | Description | Link | +|---------|-------------|------| +| **Basic Deployment** | Simple single-node deployment | [examples/basics](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics) | +| **Kubernetes** | Deploy on Kubernetes | [examples/deployments](https://github.com/ai-dynamo/dynamo/tree/main/examples/deployments) | +| **Multimodal** | Vision and multimodal models | [examples/multimodal](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal) | + +### Custom Backend Examples + +Learn how to create custom backends: + +| Example | Description | Link | +|---------|-------------|------| +| **Custom Backend** | Build your own backend | [examples/custom_backend](https://github.com/ai-dynamo/dynamo/tree/main/examples/custom_backend) | + +## Running Examples + +Most examples can be run directly after installing Dynamo: + +```bash +# Clone the repository +git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo + +# Navigate to an example +cd examples/backends/sglang + +# Follow the README in each example directory +``` diff --git 
a/fern/pages/getting-started/installation.md b/fern/pages/getting-started/installation.md new file mode 100644 index 00000000000..d85ad3c4a97 --- /dev/null +++ b/fern/pages/getting-started/installation.md @@ -0,0 +1,45 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Installation" +--- + +## Pip (PyPI) + +Install a pre-built wheel from PyPI. + +```bash +# Create a virtual environment and activate it +uv venv venv +source venv/bin/activate + +# Install Dynamo from PyPI (choose one backend extra) +uv pip install "ai-dynamo[sglang]" # or [vllm], [trtllm] +``` + +## Pip from source + +Install directly from a local checkout for development. + +```bash +# Clone the repository +git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo + +# Create a virtual environment and activate it +uv venv venv +source venv/bin/activate +uv pip install ".[sglang]" # or [vllm], [trtllm] +``` + +## Docker + +Pull and run prebuilt images from NVIDIA NGC (`nvcr.io`). + +```bash +# Run a container (mount your workspace if needed) +docker run --rm -it \ + --gpus all \ + --network host \ + nvcr.io/nvidia/ai-dynamo/sglang-runtime:latest # or vllm, tensorrtllm +``` diff --git a/fern/pages/getting-started/quickstart.md b/fern/pages/getting-started/quickstart.md new file mode 100644 index 00000000000..df05b40a03b --- /dev/null +++ b/fern/pages/getting-started/quickstart.md @@ -0,0 +1,97 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Welcome to NVIDIA Dynamo" +--- + +The NVIDIA Dynamo Platform is a high-performance, low-latency inference framework designed to serve all AI models—across any framework, architecture, or deployment scale. + + +**Discover the Latest Developments!** + +This guide is a snapshot of a specific point in time. 
For the latest information, examples, and Release Assets, see the [Dynamo GitHub repository](https://github.com/ai-dynamo/dynamo/releases/latest). + + +## Quickstart + +Get started with Dynamo locally in just a few commands: + +### 1. Install Dynamo + +```bash +# Install uv (recommended Python package manager) +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Create virtual environment and install Dynamo +uv venv venv +source venv/bin/activate +# Use prerelease flag to install RC versions of flashinfer and/or other dependencies +uv pip install --prerelease=allow "ai-dynamo[sglang]" # or [vllm], [trtllm] +``` + +### 2. Start etcd/NATS + +```bash +# Fetch and start etcd and NATS using Docker Compose +VERSION=$(uv pip show ai-dynamo | grep Version | cut -d' ' -f2) +curl -fsSL -o docker-compose.yml https://raw.githubusercontent.com/ai-dynamo/dynamo/refs/tags/v${VERSION}/deploy/docker-compose.yml +docker compose -f docker-compose.yml up -d +``` + +### 3. Run Dynamo + +```bash +# Start the OpenAI compatible frontend (default port is 8000) +python -m dynamo.frontend + +# In another terminal, start an SGLang worker +python -m dynamo.sglang --model-path Qwen/Qwen3-0.6B +``` + +### 4. 
Test Your Deployment + +```bash +curl localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50}' +``` + +## Key Features + +| Feature | Description | +|---------|-------------| +| **Multi-Backend Support** | vLLM, SGLang, and TensorRT-LLM backends | +| **Disaggregated Serving** | Separate prefill and decode for optimal performance | +| **KV Cache Routing** | Intelligent request routing based on KV cache state | +| **Kubernetes Native** | Full operator and Helm chart support | +| **Observability** | Prometheus metrics, Grafana dashboards, and tracing | + +## Documentation Overview + +### Backends +- [vLLM Backend](../backends/vllm/README.md) - High-throughput serving with vLLM +- [SGLang Backend](../backends/sglang/README.md) - Fast inference with SGLang +- [TensorRT-LLM Backend](../backends/trtllm/README.md) - Optimized inference with TensorRT-LLM + +### Kubernetes Deployment +- [Installation Guide updated](../kubernetes/installation-guide.md) - Deploy Dynamo on Kubernetes +- [Operator Guide](../kubernetes/dynamo-operator.md) - Using the Dynamo Operator +- [Autoscaling](../kubernetes/autoscaling.md) - Automatic scaling configuration + +### Architecture +- [System Architecture](../design-docs/architecture.md) - Overall system design +- [Disaggregated Serving](../design-docs/disagg-serving.md) - P/D separation architecture +- [Distributed Runtime](../design-docs/distributed-runtime.md) - Runtime internals + +### Performance & Tuning +- [Performance Tuning](../performance/tuning.md) - Optimize your deployment +- [Benchmarking](../benchmarks/benchmarking.md) - Measure and compare performance +- [AI Configurator](../performance/aiconfigurator.md) - Automated configuration + +## Getting Help + +- **GitHub Issues**: [Report bugs or request features](https://github.com/ai-dynamo/dynamo/issues) +- **Discussions**: [Ask questions and share 
ideas](https://github.com/ai-dynamo/dynamo/discussions) +- **Reference**: [CLI Reference](../reference/cli.md) | [Glossary](../reference/glossary.md) | [Support Matrix](./support-matrix.md) diff --git a/fern/pages/getting-started/support-matrix.md b/fern/pages/getting-started/support-matrix.md new file mode 100644 index 00000000000..86c4356f9d2 --- /dev/null +++ b/fern/pages/getting-started/support-matrix.md @@ -0,0 +1,123 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Support Matrix" +--- + +This document provides the support matrix for Dynamo, including hardware, software and build instructions. + +## Hardware Compatibility + +| **CPU Architecture** | **Status** | +| :------------------- | :----------- | +| **x86_64** | Supported | +| **ARM64** | Supported | + + +### GPU Compatibility + +If you are using a **GPU**, the following GPU models and architectures are supported: + +| **GPU Architecture** | **Status** | +| :----------------------------------- | :--------- | +| **NVIDIA Blackwell Architecture** | Supported | +| **NVIDIA Hopper Architecture** | Supported | +| **NVIDIA Ada Lovelace Architecture** | Supported | +| **NVIDIA Ampere Architecture** | Supported | + +## Platform Architecture Compatibility + +**Dynamo** is compatible with the following platforms: + +| **Operating System** | **Version** | **Architecture** | **Status** | +| :------------------- | :---------- | :--------------- | :----------- | +| **Ubuntu** | 22.04 | x86_64 | Supported | +| **Ubuntu** | 24.04 | x86_64 | Supported | +| **Ubuntu** | 24.04 | ARM64 | Supported | +| **CentOS Stream** | 9 | x86_64 | Experimental | + + +Wheels are built using a manylinux_2_28-compatible environment and they have been validated on CentOS 9 and Ubuntu (22.04, 24.04). +Compatibility with other Linux distributions is expected but has not been officially verified yet. 
+ + + +KV Block Manager is supported only with Python 3.12. Python 3.12 support is currently limited to Ubuntu 24.04. + + +## Software Compatibility + +### Runtime Dependency + +| **Python Package** | **Version** | glibc version | CUDA Version | +| :----------------- | :---------- | :------------------------------------ | :----------- | +| ai-dynamo | 0.8.0 | >=2.28 | | +| ai-dynamo-runtime | 0.8.0 | >=2.28 (Python 3.12 has known issues) | | +| NIXL | 0.8.0 | >=2.27 | >=11.8 | + +### Build Dependency + +The following table shows the dependency versions included with each Dynamo release: + +| **Dependency** | **main (ToT)** | **v0.8.0 (unreleased)** | **v0.7.1** | **v0.7.0.post1** | **v0.7.0** | +| :------------- | :------------- | :---------------------- | :--------- | :--------------- | :--------- | +| SGLang | 0.5.7 | 0.5.7 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4| +| TensorRT-LLM | 1.2.0rc6 | 1.2.0rc6 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 | +| vLLM | 0.13.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 | +| NIXL | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | + + +**main (ToT)** reflects the current development branch. **v0.8.0** is the upcoming release (planned for January 14, 2025) and not yet available. + + + + +Specific versions of TensorRT-LLM supported by Dynamo are subject to change. Currently TensorRT-LLM does not support Python 3.11 so installation of the ai-dynamo[trtllm] will fail. 
+ + +### CUDA Support by Framework +| **Dynamo Version** | **SGLang** | **TensorRT-LLM** | **vLLM** | +| :------------------- | :-----------------------| :-----------------------| :-----------------------| +| **Dynamo 0.7.1** | CUDA 12.8 | CUDA 13.0 | CUDA 12.9 | + +## Cloud Service Provider Compatibility + +### AWS + +| **Host Operating System** | **Version** | **Architecture** | **Status** | +| :------------------------ | :---------- | :--------------- | :--------- | +| **Amazon Linux** | 2023 | x86_64 | Supported¹ | + + +There is a known issue with the TensorRT-LLM framework when running the AL2023 container locally with `docker run --network host ...` due to a [bug](https://github.com/mpi4py/mpi4py/discussions/491#discussioncomment-12660609) in mpi4py. To avoid this issue, replace the `--network host` flag with more precise networking configuration by mapping only the necessary ports (e.g., 4222 for nats, 2379/2380 for etcd, 8000 for frontend). + + +## Build Support + +**Dynamo** currently provides build support in the following ways: + +- **Wheels**: We distribute Python wheels of Dynamo and KV Block Manager: + - [ai-dynamo](https://pypi.org/project/ai-dynamo/) + - [ai-dynamo-runtime](https://pypi.org/project/ai-dynamo-runtime/) + - **New as of Dynamo v0.7.0:** [kvbm](https://pypi.org/project/kvbm/) as a standalone implementation. 
+ +- **Dynamo Runtime Images**: We distribute multi-arch images (x86 & ARM64 compatible) of the Dynamo Runtime for each of the LLM inference frameworks on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo): + - [SGLang](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/sglang-runtime) + - [TensorRT-LLM](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/tensorrtllm-runtime) + - [vLLM](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/vllm-runtime) + +- **Dynamo Kubernetes Operator Images**: We distribute multi-arch images (x86 & ARM64 compatible) of the Dynamo Operator on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo): + - [kubernetes-operator](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/kubernetes-operator) to simplify deployments of Dynamo Graphs. + +- **Helm Charts**: [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo) hosts the helm charts supporting Kubernetes deployments of Dynamo: + - [Dynamo CRDs](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/helm-charts/dynamo-crds) + - [Dynamo Platform](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/helm-charts/dynamo-platform) + - [Dynamo Graph](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/helm-charts/dynamo-graph) + +- **Rust Crates**: + - [dynamo-runtime](https://crates.io/crates/dynamo-runtime/) + - [dynamo-async-openai](https://crates.io/crates/dynamo-async-openai/) + - [dynamo-parsers](https://crates.io/crates/dynamo-parsers/) + - [dynamo-llm](https://crates.io/crates/dynamo-llm/) + +Once you've confirmed that your platform and architecture are compatible, you can install **Dynamo** by following the instructions in the [Quick Start Guide](https://github.com/ai-dynamo/dynamo/blob/main/README.md#installation). 
diff --git a/fern/pages/guides/jail-stream-readme.md b/fern/pages/guides/jail-stream-readme.md new file mode 100644 index 00000000000..4c0ac54c0fc --- /dev/null +++ b/fern/pages/guides/jail-stream-readme.md @@ -0,0 +1,132 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "JailedStream Implementation" +--- + +## Overview + +The `JailedStream` is a standalone implementation for handling "jail" detection in token streams. It provides a clean, builder-based API for accumulating tokens when certain sequences are detected, then releasing them as a single chunk when the jail ends. + +## Key Features + +- **Builder Pattern**: Clean configuration API using the builder pattern +- **Configurable Sequences**: Support for multiple start/end jail sequences +- **Tool Call Parsing**: Integrated tool call detection and parsing +- **Stream Macro**: Uses `async-stream::stream!` for clean async implementation +- **Standalone**: Completely independent of existing code +- **Annotations**: Preserves annotations for observability + +## Implementation + +### Location +- Main implementation: `lib/llm/src/protocols/openai/chat_completions/jail.rs` +- Examples: `lib/llm/src/protocols/openai/chat_completions/jail_example.rs` + +### Usage + +```rust +use crate::protocols::openai::chat_completions::jail::JailedStream; +use dynamo_runtime::engine::{AsyncEngineContextProvider, ResponseStream}; + +// Get your ResponseStream with context +let response_stream: Pin>> = get_stream_from_engine(); + +// Extract context BEFORE passing to apply +let context = response_stream.context(); + +// Apply jail transformation (ResponseStream implements Stream) +let jail = JailedStream::builder() + .tool_call_parser("nemotron_deci") + .build(); + +let jailed_stream = jail.apply(response_stream); + +// Re-wrap with context when needed for engine consumption +let final_stream = 
ResponseStream::new(Box::pin(jailed_stream), context); +``` + +### Advanced Configuration + +```rust +// With custom jail sequences +let jail = JailedStream::builder() + .jail_start_sequence("") + .jail_end_sequence("") + .tool_call_parser("nemotron_deci") + .build(); + +// With multiple sequences +let jail = JailedStream::builder() + .jail_start_sequences(vec!["", ""]) + .jail_end_sequences(vec!["", ""]) + .tool_call_parser("harmony") + .build(); +``` + +## How It Works + +1. **Detection**: When a jail start sequence (or tool call start) is detected, the stream enters "jail" mode +2. **Accumulation**: While jailed, tokens are accumulated in memory instead of being yielded +3. **Annotations**: Empty chunks with annotations are sent downstream for observability +4. **Release**: When a jail end sequence is detected OR the stream ends: + - Accumulated content is parsed for tool calls + - A single chunk with the parsed content is yielded +5. **Pass-through**: Non-jailed content passes through unchanged + +## Testing + +The implementation includes comprehensive tests: + +- `test_jailed_stream_with_start_end_sequences`: Tests explicit jail sequences +- `test_jailed_stream_with_tool_calls`: Tests tool call detection and parsing +- `test_jailed_stream_no_jailing`: Tests normal pass-through behavior + +Run tests with: +```bash +cargo test -p dynamo-llm jail --lib +``` + +## Benefits + +1. **Standalone**: No modifications to existing code required +2. **Clean API**: Builder pattern makes configuration intuitive +3. **Flexible**: Supports multiple jail detection strategies +4. **Maintainable**: Uses `stream!` macro for cleaner async code +5. **Testable**: Comprehensive test suite with shared utilities +6. **Efficient**: No unnecessary boxing or context handling in the library +7. 
**Composable**: Can chain multiple stream transformers before re-adding context + +## Performance Optimizations + +- **No Boxing in Library**: Returns `impl Stream` instead of `Pin>` +- **Stack Pinning**: Uses `tokio::pin!()` instead of `Box::pin()` for better performance +- **No Context Overhead**: JailedStream doesn't manage AsyncEngineContext +- **Lazy Evaluation**: Only processes what's needed +- **Efficient State Management**: Minimal cloning, only when entering jail state + +## Integration Options + +To replace the existing `apply_tool_calling_jail_internal` function: + +```rust +// In preprocessor.rs +pub fn apply_tool_calling_jail_with_parser( + &self, + stream: ManyOut>, +) -> ManyOut> { + let jail = JailedStream::builder() + .tool_call_parser(self.tool_call_parser.clone()) + .build(); + + jail.apply(stream) +} +``` + +## Future Enhancements + +- Add support for regex patterns for jail sequences +- Add metrics/telemetry for jail detection +- Support for partial sequence matching across chunk boundaries +- Configurable accumulation limits +- Support for nested jails \ No newline at end of file diff --git a/fern/pages/guides/request-plane.md b/fern/pages/guides/request-plane.md new file mode 100644 index 00000000000..2ff6bcba3a4 --- /dev/null +++ b/fern/pages/guides/request-plane.md @@ -0,0 +1,293 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Request Planes User Guide" +--- + +## Overview + +Dynamo supports multiple transport mechanisms for its request plane (the communication layer between services). You can choose from three different request plane modes based on your deployment requirements: + +- **TCP** (default): Direct TCP connection for optimal performance +- **NATS**: Message broker-based request plane +- **HTTP**: HTTP/2-based request plane + +This guide explains how to configure and use request plane in your Dynamo deployment. 
+ +## What is a Request Plane? + +The request plane is the transport layer that handles communication between Dynamo services (e.g., frontend to backend, worker to worker). Different request planes offer different trade-offs: + +| Request Plane | Suitable For | Characteristics | +|--------------|----------|-----------------| +| **NATS** | Production deployments with KV routing | Requires NATS infrastructure, provides pub/sub patterns, highest flexibility | +| **TCP** | Low-latency direct communication | Direct connections, minimal overhead | +| **HTTP** | Standard deployments, debugging | HTTP/2 protocol, easier observability with standard tools, widely compatible | + +## Request Plane vs KV Event Plane + +Dynamo has **two independent communication planes**: + +- **Request plane** (**`DYN_REQUEST_PLANE`**): how **RPC requests** flow between components (frontend → router → worker), via `tcp`, `http`, or `nats`. +- **KV event plane** (currently only **NATS** is supported): how **KV cache events** (and optional router replica sync) are distributed/persisted for KV-aware routing. + +**Note:** if you are using `tcp` or `http` request plane and choose to use NATS for KV events, you must still configure NATS server using `NATS_SERVER` environment variable, e.g. `NATS_SERVER=nats://nats-hostname:port`. + +Because they are independent, you can mix them. + +For example, a deployment with TCP request plane can use different KV event planes: +- **JetStream KV events**: requests use TCP, KV routing still uses NATS JetStream + object store for persistence. +- **NATS Core KV events (local indexer)**: requests use TCP, KV events use NATS Core pub/sub and persistence lives on workers. +- **no KV events**: requests use TCP and KV routing runs without events (no NATS required, but no event-backed persistence). 
+ +## Configuration + +### Environment Variable + +Set the request plane mode using the `DYN_REQUEST_PLANE` environment variable: + +```bash +export DYN_REQUEST_PLANE= +``` + +Where `` is one of: +- `tcp` (default) +- `nats` +- `http` + +The value is case-insensitive. + +### Default Behavior + +If `DYN_REQUEST_PLANE` is not set or contains an invalid value, Dynamo defaults to `tcp`. + +## Usage Examples + +### Using TCP (Default) + +TCP is the default request plane and provides direct, low-latency communication between services. + +**Configuration:** + +```bash +# TCP is the default, so no need to set DYN_REQUEST_PLANE explicitly +# But you can explicitly set it if desired: +export DYN_REQUEST_PLANE=tcp + +# Optional: Configure TCP server host and port +export DYN_TCP_RPC_HOST=0.0.0.0 # Default host +# export DYN_TCP_RPC_PORT=9999 # Optional: specify a fixed port + +# Run your Dynamo service +DYN_REQUEST_PLANE=tcp python -m dynamo.frontend --http-port=8000 & +DYN_REQUEST_PLANE=tcp python -m dynamo.vllm --model Qwen/Qwen3-0.6B +``` + +**Note:** By default, TCP uses an OS-assigned free port (port 0). This is ideal for environments where multiple services may run on the same machine or when you want to avoid port conflicts. If you need a specific port (e.g., for firewall rules), set `DYN_TCP_RPC_PORT` explicitly. + +**When to use TCP:** +- Simple deployments with direct service-to-service communication (e.g. frontend to backend) +- Minimal infrastructure requirements (**no NATS needed unless you enable KV-event-backed routing/replica sync**) +- Low-latency requirements + +**TCP Configuration Options:** + +Additional TCP-specific environment variables: +- `DYN_TCP_RPC_HOST`: Server host address (default: auto-detected) +- `DYN_TCP_RPC_PORT`: Server port. If not set, the OS assigns a free port automatically (recommended for most deployments). Set explicitly only if you need a specific port for firewall rules. 
+- `DYN_TCP_MAX_MESSAGE_SIZE`: Maximum message size for TCP client (default: 32MB) +- `DYN_TCP_REQUEST_TIMEOUT`: Request timeout for TCP client (default: 10 seconds) +- `DYN_TCP_POOL_SIZE`: Connection pool size for TCP client (default: 50) +- `DYN_TCP_CONNECT_TIMEOUT`: Connect timeout for TCP client (default: 3 seconds) +- `DYN_TCP_CHANNEL_BUFFER`: Request channel buffer size for TCP client (default: 100) + +### Using HTTP + +HTTP/2 provides a standards-based request plane that's easy to debug and widely compatible. + +**Configuration:** + +```bash +# Optional: Configure HTTP server host and port +export DYN_HTTP_RPC_HOST=0.0.0.0 # Default host +export DYN_HTTP_RPC_PORT=8888 # Default port +export DYN_HTTP_RPC_ROOT_PATH=/v1/rpc # Default path + +# Run your Dynamo service +DYN_REQUEST_PLANE=http python -m dynamo.frontend --http-port=8000 & +DYN_REQUEST_PLANE=http python -m dynamo.vllm --model Qwen/Qwen3-0.6B +``` + +**When to use HTTP:** +- Standard deployments requiring HTTP compatibility +- Debugging scenarios (use curl, browser tools, etc.) 
+- Integration with HTTP-based infrastructure +- Load balancers and proxies that work with HTTP + +**HTTP Configuration Options:** + +Additional HTTP-specific environment variables: +- `DYN_HTTP_RPC_HOST`: Server host address (default: auto-detected) +- `DYN_HTTP_RPC_PORT`: Server port (default: 8888) +- `DYN_HTTP_RPC_ROOT_PATH`: Root path for RPC endpoints (default: /v1/rpc) + +`DYN_HTTP2_*`: Various HTTP/2 client configuration options +- `DYN_HTTP2_MAX_FRAME_SIZE`: Maximum frame size for HTTP client (default: 1MB) +- `DYN_HTTP2_MAX_CONCURRENT_STREAMS`: Maximum concurrent streams for HTTP client (default: 1000) +- `DYN_HTTP2_POOL_MAX_IDLE_PER_HOST`: Maximum idle connections per host for HTTP client (default: 100) +- `DYN_HTTP2_POOL_IDLE_TIMEOUT_SECS`: Idle timeout for HTTP client (default: 90 seconds) +- `DYN_HTTP2_KEEP_ALIVE_INTERVAL_SECS`: Keep-alive interval for HTTP client (default: 30 seconds) +- `DYN_HTTP2_KEEP_ALIVE_TIMEOUT_SECS`: Keep-alive timeout for HTTP client (default: 10 seconds) +- `DYN_HTTP2_ADAPTIVE_WINDOW`: Enable adaptive flow control (default: true) + +### Using NATS + +NATS provides durable JetStream messaging for the request plane and can be used for KV events (and router replica sync). + +**Prerequisites:** +- NATS server must be running and accessible +- Configure NATS connection via standard Dynamo NATS environment variables + +```bash +# Explicitly set to NATS +export DYN_REQUEST_PLANE=nats + +# Run your Dynamo service +DYN_REQUEST_PLANE=nats python -m dynamo.frontend --http-port=8000 & +DYN_REQUEST_PLANE=nats python -m dynamo.vllm --model Qwen/Qwen3-0.6B +``` + +**When to use NATS:** +- Production deployments with service discovery +- Currently KV based routing requires NATS.
If you want to completely disable NATS, KV based routing won't be available +- Need for message replay and persistence features + +Limitations: +- NATS does not support payloads beyond 16MB (use TCP for larger payloads) + +## Complete Example + +Here's a complete example showing how to launch a Dynamo deployment with different request planes: + +See [`examples/backends/vllm/launch/agg_request_planes.sh`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/launch/agg_request_planes.sh) for a complete working example that demonstrates launching Dynamo with TCP, HTTP, or NATS request planes. + + +## Real-World Example + +The Dynamo repository includes a complete example demonstrating all three request planes: + +**Location:** `examples/backends/vllm/launch/agg_request_planes.sh` + +```bash +cd examples/backends/vllm/launch + +# Run with TCP +./agg_request_planes.sh --tcp + +# Run with HTTP +./agg_request_planes.sh --http + +# Run with NATS +./agg_request_planes.sh --nats +``` + +## Architecture Details + +### Network Manager + +The request plane implementation is centralized in the Network Manager (`lib/runtime/src/pipeline/network/manager.rs`), which: + +1. Reads the `DYN_REQUEST_PLANE` environment variable at startup +2. Creates the appropriate server and client implementations +3. Provides a transport-agnostic interface to the rest of the codebase +4. Manages all network configuration and lifecycle + +### Transport Abstraction + +All request plane implementations conform to common trait interfaces: +- `RequestPlaneServer`: Server-side interface for receiving requests +- `RequestPlaneClient`: Client-side interface for sending requests + +This abstraction means your application code doesn't need to change when switching request planes. + +### Configuration Loading + +Request plane configuration is loaded from environment variables at startup and cached globally. The configuration hierarchy is: + +1. 
**Mode Selection**: `DYN_REQUEST_PLANE` (defaults to `tcp`) +2. **Transport-Specific Config**: Mode-specific environment variables (e.g., `DYN_TCP_*`, `DYN_HTTP2_*`) + +## Migration Guide + +### From NATS to TCP + +1. Stop your Dynamo services +2. Set environment variable `DYN_REQUEST_PLANE=tcp` +3. Optionally configure TCP-specific settings (e.g., `DYN_TCP_RPC_HOST`). Note: `DYN_TCP_RPC_PORT` is optional; if not set, an OS-assigned free port is used automatically. +4. Restart your services + + +### From NATS to HTTP + +1. Stop your Dynamo services +2. Set environment variable `DYN_REQUEST_PLANE=http` +3. Optionally configure HTTP-specific settings (`DYN_HTTP_RPC_PORT`, etc.) +4. Restart your services + +### Testing the Migration + +After switching request planes, verify your deployment: + +```bash +# Test with a simple request +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + +## Troubleshooting + +### Issue: Services Can't Communicate + +**Symptoms:** Requests timeout or fail to reach the backend + +**Solutions:** +- Verify all services use the same `DYN_REQUEST_PLANE` setting +- Check that server ports are not blocked by k8s network policies or firewalls +- For TCP/HTTP: Ensure host/port configurations are correct and accessible +- For NATS: Verify NATS server is running and accessible + +### Issue: "Invalid request plane mode" Error + +**Symptoms:** Service fails to start with configuration error + +**Solutions:** +- Check `DYN_REQUEST_PLANE` spelling (valid values: `nats`, `tcp`, `http`) +- Value is case-insensitive but must be one of the three options +- If not set, defaults to `tcp` + +### Issue: Port Conflicts + +**Symptoms:** Server fails to start due to "address already in use" + +**Solutions:** +- TCP: By default, TCP uses an OS-assigned free port, so port conflicts should be rare. 
If you explicitly set `DYN_TCP_RPC_PORT` to a specific port and get conflicts, either change the port or remove the setting to use automatic port assignment. +- HTTP default port: 8888 (adjust environment variable `DYN_HTTP_RPC_PORT`) + +## Performance Considerations + +### Latency + +- **TCP**: Lowest latency due to direct connections and binary serialization +- **HTTP**: Moderate latency with HTTP/2 overhead +- **NATS**: Moderate latency due to NATS JetStream persistence + + +### Resource Usage + +- **TCP**: Minimal infrastructure (no additional services required) +- **HTTP**: Minimal infrastructure (no additional services required) +- **NATS**: Requires running NATS server (additional memory/CPU) diff --git a/fern/pages/kubernetes/README.md b/fern/pages/kubernetes/README.md new file mode 100644 index 00000000000..a689a6f9c21 --- /dev/null +++ b/fern/pages/kubernetes/README.md @@ -0,0 +1,241 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Deploying Dynamo on Kubernetes" +--- + +[Link to installation](../getting-started/installation.md) + +High-level guide to Dynamo Kubernetes deployments. Start here, then dive into specific guides. + +## Important Terminology + +**Kubernetes Namespace**: The K8s namespace where your DynamoGraphDeployment resource is created. +- Used for: Resource isolation, RBAC, organizing deployments +- Example: `dynamo-system`, `team-a-namespace` + +**Dynamo Namespace**: The logical namespace used by Dynamo components for [service discovery](service-discovery.md). +- Used for: Runtime component communication, service discovery +- Specified in: `.spec.services.<service-name>.dynamoNamespace` field +- Example: `my-llm`, `production-model`, `dynamo-dev` + +These are independent. A single Kubernetes namespace can host multiple Dynamo namespaces, and vice versa.
+ +## Prerequisites + +Before you begin, ensure you have the following tools installed: + +| Tool | Minimum Version | Installation Guide | +|------|-----------------|-------------------| +| **kubectl** | v1.24+ | [Install kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) | +| **Helm** | v3.0+ | [Install Helm](https://helm.sh/docs/intro/install/) | + +Verify your installation: +```bash +kubectl version --client # Should show v1.24+ +helm version # Should show v3.0+ +``` + +For detailed installation instructions, see the [Prerequisites section](installation-guide.md#prerequisites) in the Installation Guide. + +## Pre-deployment Checks + +Before deploying the platform, run the pre-deployment checks to ensure the cluster is ready: + +```bash +./deploy/pre-deployment/pre-deployment-check.sh +``` + +This validates kubectl connectivity, StorageClass configuration, and GPU availability. See [pre-deployment checks](https://github.com/ai-dynamo/dynamo/tree/main/deploy/pre-deployment/README.md) for more details. + +## 1. Install Platform First + +```bash +# 1. Set environment +export NAMESPACE=dynamo-system +export RELEASE_VERSION=0.x.x # any version of Dynamo 0.3.2+ listed at https://github.com/ai-dynamo/dynamo/releases + +# 2. Install CRDs (skip if on shared cluster where CRDs already exist) +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz +helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default + +# 3. 
Install Platform +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz +helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace +``` + +**For Shared/Multi-Tenant Clusters:** + +If your cluster has namespace-restricted Dynamo operators, add this flag to step 3: +```bash +--set dynamo-operator.namespaceRestriction.enabled=true +``` + +For more details or customization options (including multinode deployments), see **[Installation Guide for Dynamo Kubernetes Platform](installation-guide.md)**. + +## 2. Choose Your Backend + +Each backend has deployment examples and configuration options: + +| Backend | Aggregated | Aggregated + Router | Disaggregated | Disaggregated + Router | Disaggregated + Planner | Disaggregated Multi-node | +|--------------|:----------:|:-------------------:|:-------------:|:----------------------:|:-----------------------:|:------------------------:| +| **[SGLang](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/deploy/README.md)** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| **[TensorRT-LLM](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/deploy/README.md)** | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ | +| **[vLLM](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/README.md)** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +## 3. 
Deploy Your First Model + +```bash +export NAMESPACE=dynamo-system +kubectl create namespace ${NAMESPACE} + +# to pull model from HF +export HF_TOKEN= +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="$HF_TOKEN" \ + -n ${NAMESPACE}; + +# Deploy any example (this uses vLLM with Qwen model using aggregated serving) +kubectl apply -f examples/backends/vllm/deploy/agg.yaml -n ${NAMESPACE} + +# Check status +kubectl get dynamoGraphDeployment -n ${NAMESPACE} + +# Test it +kubectl port-forward svc/vllm-agg-frontend 8000:8000 -n ${NAMESPACE} +curl http://localhost:8000/v1/models +``` + +For SLA-based autoscaling, see [SLA Planner Quick Start Guide](../planner/sla-planner-quickstart.md). + +## Understanding Dynamo's Custom Resources + +Dynamo provides two main Kubernetes Custom Resources for deploying models: + +### DynamoGraphDeploymentRequest (DGDR) - Simplified SLA-Driven Configuration + +The **recommended approach** for generating optimal configurations. DGDR provides a high-level interface where you specify: +- Model name and backend framework +- SLA targets (latency requirements) +- GPU type (optional) + +Dynamo automatically handles profiling and generates an optimized DGD spec in the status. Perfect for: +- SLA-driven configuration generation +- Automated resource optimization +- Users who want simplicity over control + +**Note**: DGDR generates a DGD spec which you can then use to deploy. + +### DynamoGraphDeployment (DGD) - Direct Configuration + +A lower-level interface that defines your complete inference pipeline: +- Model configuration +- Resource allocation (GPUs, memory) +- Scaling policies +- Frontend/backend connections + +Use this when you need fine-grained control or have already completed profiling. + +Refer to the [API Reference and Documentation](api-reference.md) for more details. 
+ +## 📖 API Reference & Documentation + +For detailed technical specifications of Dynamo's Kubernetes resources: + +- **[API Reference](api-reference.md)** - Complete CRD field specifications for all Dynamo resources +- **[Create Deployment](deployment/create-deployment.md)** - Step-by-step deployment creation with DynamoGraphDeployment +- **[Operator Guide](dynamo-operator.md)** - Dynamo operator configuration and management + +### Choosing Your Architecture Pattern + +When creating a deployment, select the architecture pattern that best fits your use case: + +- **Development / Testing** - Use `agg.yaml` as the base configuration +- **Production with Load Balancing** - Use `agg_router.yaml` to enable scalable, load-balanced inference +- **High Performance / Disaggregated** - Use `disagg_router.yaml` for maximum throughput and modular scalability + +### Frontend and Worker Components + +You can run the Frontend on one machine (e.g., a CPU node) and workers on different machines (GPU nodes). 
The Frontend serves as a framework-agnostic HTTP entry point that: + +- Provides OpenAI-compatible `/v1/chat/completions` endpoint +- Auto-discovers backend workers via [service discovery](service-discovery.md) (Kubernetes-native by default) +- Routes requests and handles load balancing +- Validates and preprocesses requests + +### Customizing Your Deployment + +Example structure: +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-llm +spec: + services: + Frontend: + dynamoNamespace: my-llm + componentType: frontend + replicas: 1 + extraPodSpec: + mainContainer: + image: your-image + VllmDecodeWorker: # or SGLangDecodeWorker, TrtllmDecodeWorker + dynamoNamespace: dynamo-dev + componentType: worker + replicas: 1 + envFromSecret: hf-token-secret # for HuggingFace models + resources: + limits: + gpu: "1" + extraPodSpec: + mainContainer: + image: your-image + command: ["/bin/sh", "-c"] + args: + - python3 -m dynamo.vllm --model YOUR_MODEL [--your-flags] +``` + +Worker command examples per backend: +```yaml +# vLLM worker +args: + - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B + +# SGLang worker +args: + - >- + python3 -m dynamo.sglang + --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B + --tp 1 + --trust-remote-code + +# TensorRT-LLM worker +args: + - python3 -m dynamo.trtllm + --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B + --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B + --extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml +``` + +Key customization points include: +- **Model Configuration**: Specify model in the args command +- **Resource Allocation**: Configure GPU requirements under `resources.limits` +- **Scaling**: Set `replicas` for number of worker instances +- **Routing Mode**: Enable KV-cache routing by setting `DYN_ROUTER_MODE=kv` in Frontend envs +- **Worker Specialization**: Add `--is-prefill-worker` flag for disaggregated prefill 
workers + +## Additional Resources + +- **[Examples](../getting-started/examples.md)** - Complete working examples +- **[Create Custom Deployments](deployment/create-deployment.md)** - Build your own CRDs +- **[Managing Models with DynamoModel](deployment/dynamomodel-guide.md)** - Deploy LoRA adapters and manage models +- **[Operator Documentation](dynamo-operator.md)** - How the platform works +- **[Service Discovery](service-discovery.md)** - Discovery backends and configuration +- **[Helm Charts](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/README.md)** - For advanced users +- **[GitOps Deployment with FluxCD](fluxcd.md)** - For advanced users +- **[Logging](observability/logging.md)** - For logging setup +- **[Multinode Deployment](deployment/multinode-deployment.md)** - For multinode deployment +- **[Grove](grove.md)** - For grove details and custom installation +- **[Monitoring](observability/metrics.md)** - For monitoring setup +- **[Model Caching with Fluid](model-caching-with-fluid.md)** - For model caching with Fluid diff --git a/fern/pages/kubernetes/api-reference.md b/fern/pages/kubernetes/api-reference.md new file mode 100644 index 00000000000..5756851f723 --- /dev/null +++ b/fern/pages/kubernetes/api-reference.md @@ -0,0 +1,1093 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "API Reference" +--- + + +This documentation is automatically generated from source code. +Do not edit this file directly. + + +## Packages +- [nvidia.com/v1alpha1](#nvidiacomv1alpha1) + + +## nvidia.com/v1alpha1 + +Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group. + +This package defines the DynamoGraphDeploymentRequest (DGDR) custom resource, which provides +a high-level, SLA-driven interface for deploying machine learning models on Dynamo. 
+ +Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group. + +### Resource Types +- [DynamoComponentDeployment](#dynamocomponentdeployment) +- [DynamoGraphDeployment](#dynamographdeployment) +- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest) +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) +- [DynamoModel](#dynamomodel) + + + +#### Autoscaling + + + +Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter +with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md +for migration guidance. This field will be removed in a future API version. + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `enabled` _boolean_ | Deprecated: This field is ignored. | | | +| `minReplicas` _integer_ | Deprecated: This field is ignored. | | | +| `maxReplicas` _integer_ | Deprecated: This field is ignored. | | | +| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. | | | +| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. | | | + + + + +#### ComponentKind + +_Underlying type:_ _string_ + +ComponentKind represents the type of underlying Kubernetes resource. + +_Validation:_ +- Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet] + +_Appears in:_ +- [ServiceReplicaStatus](#servicereplicastatus) + +| Field | Description | +| --- | --- | +| `PodClique` | ComponentKindPodClique represents a PodClique resource.
| +| `PodCliqueScalingGroup` | ComponentKindPodCliqueScalingGroup represents a PodCliqueScalingGroup resource.
| +| `Deployment` | ComponentKindDeployment represents a Deployment resource.
| +| `LeaderWorkerSet` | ComponentKindLeaderWorkerSet represents a LeaderWorkerSet resource.
| + + +#### ConfigMapKeySelector + + + +ConfigMapKeySelector selects a specific key from a ConfigMap. +Used to reference external configuration data stored in ConfigMaps. + + + +_Appears in:_ +- [ProfilingConfigSpec](#profilingconfigspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\}
| +| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | | + + +#### DeploymentOverridesSpec + + + +DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments. +When autoApply is enabled, these overrides are applied to the generated DGD resource. + + + +_Appears in:_ +- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR name. | | Optional: \{\}
| +| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace. | | Optional: \{\}
| +| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.
These are merged with auto-generated labels from the profiling process. | | Optional: \{\}
| +| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\}
| +| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\}
| + + +#### DeploymentStatus + + + +DeploymentStatus tracks the state of an auto-created DynamoGraphDeployment. +This status is populated when autoApply is enabled and a DGD is created. + + + +_Appears in:_ +- [DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name is the name of the created DynamoGraphDeployment. | | | +| `namespace` _string_ | Namespace is the namespace of the created DynamoGraphDeployment. | | | +| `state` _string_ | State is the current state of the DynamoGraphDeployment.
This value is mirrored from the DGD's status.state field. | | | +| `created` _boolean_ | Created indicates whether the DGD has been successfully created.
Used to prevent recreation if the DGD is manually deleted by users. | | | + + + + +#### DynamoComponentDeployment + + + +DynamoComponentDeployment is the Schema for the dynamocomponentdeployments API + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | | +| `kind` _string_ | `DynamoComponentDeployment` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)_ | Spec defines the desired state for this Dynamo component deployment. | | | + + +#### DynamoComponentDeploymentSharedSpec + + + + + + + +_Appears in:_ +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) +- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component
(such as Pod, Service, and Ingress when applicable). | | | +| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. | | | +| `serviceName` _string_ | The name of the component | | | +| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | | +| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | | +| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| +| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | +| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | +| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version. | | | +| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | +| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | +| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | +| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). | | | +| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves
When specified, a headless service will be created for endpoint discovery | | | +| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). | | | +| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. | | | +| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.
It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field
that allows overriding the main container configuration. | | | +| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | | +| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | | +| `replicas` _integer_ | Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled, this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0
| +| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | | +| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled, replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly. | | | + + +#### DynamoComponentDeploymentSpec + + + +DynamoComponentDeploymentSpec defines the desired state of DynamoComponentDeployment + + + +_Appears in:_ +- [DynamoComponentDeployment](#dynamocomponentdeployment) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") | | Enum: [sglang vllm trtllm]
| +| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component
(such as Pod, Service, and Ingress when applicable). | | | +| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. | | | +| `serviceName` _string_ | The name of the component | | | +| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | | +| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | | +| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| +| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | +| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | +| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version. | | | +| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | +| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | +| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | +| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). | | | +| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves
When specified, a headless service will be created for endpoint discovery | | | +| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). | | | +| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. | | | +| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.
It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field
that allows overriding the main container configuration. | | | +| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | | +| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | | +| `replicas` _integer_ | Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled, this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0
| +| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | | +| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled, replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly. | | | + + +#### DynamoGraphDeployment + + + +DynamoGraphDeployment is the Schema for the dynamographdeployments API. + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | | +| `kind` _string_ | `DynamoGraphDeployment` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[DynamoGraphDeploymentSpec](#dynamographdeploymentspec)_ | Spec defines the desired state for this graph deployment. | | | +| `status` _[DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)_ | Status reflects the current observed state of this graph deployment. | | | + + +#### DynamoGraphDeploymentRequest + + + +DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API. +It serves as the primary interface for users to request model deployments with +specific performance and resource constraints, enabling SLA-driven deployments. + +Lifecycle: + 1. Initial → Pending: Validates spec and prepares for profiling + 2. Pending → Profiling: Creates and runs profiling job (online or AIC) + 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes + 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready + 5. Ready: Terminal state when DGD is operational or spec is available + 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted + +The spec becomes immutable once profiling starts. Users must delete and recreate +the DGDR to modify configuration after this point. 
+ + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | | +| `kind` _string_ | `DynamoGraphDeploymentRequest` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)_ | Spec defines the desired state for this deployment request. | | | +| `status` _[DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)_ | Status reflects the current observed state of this deployment request. | | | + + +#### DynamoGraphDeploymentRequestSpec + + + +DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest. +This CRD serves as the primary interface for users to request model deployments with +specific performance constraints and resource requirements, enabling SLA-driven deployments. + + + +_Appears in:_ +- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\}
| +| `backend` _string_ | Backend specifies the inference backend for profiling.
The controller automatically sets this value in profilingConfig.config.engine.backend.
Profiling runs on real GPUs or via AIC simulation to collect performance data. | | Enum: [vllm sglang trtllm]
Required: \{\}
| +| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of
a real backend deployment. When true, the deployment uses simulated engines that
don't require GPUs, using the profiling data to simulate realistic timing behavior.
Mocker is available in all backend images and useful for large-scale experiments.
Profiling still runs against the real backend (specified above) to collect performance data. | false | | +| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\}
| +| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level
model and backend fields and should not be specified in this config. | | Required: \{\}
| +| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment
after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec. | false | | +| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.
Only applicable when AutoApply is true. | | Optional: \{\}
| + + +#### DynamoGraphDeploymentRequestStatus + + + +DynamoGraphDeploymentRequestStatus represents the observed state of a DynamoGraphDeploymentRequest. +The controller updates this status as the DGDR progresses through its lifecycle. + + + +_Appears in:_ +- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
Empty string ("") represents the initial state before initialization. | | | +| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.
This field is populated by the controller and shown in kubectl output. | | Optional: \{\}
| +| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.
Used to detect spec changes and enforce immutability after profiling starts. | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.
Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
Conditions are merged by type on patch updates. | | | +| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.
Format: "configmap/\<name\>" | | Optional: \{\}
| +| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification
including metadata, based on profiling results. Users can extract this to create
a DGD manually, or it's used automatically when autoApply is true.
Stored as RawExtension to preserve all fields including metadata.
For mocker backends, this contains the mocker DGD spec. | | EmbeddedResource: \{\}
Optional: \{\}
| +| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\}
| + + +#### DynamoGraphDeploymentScalingAdapter + + + +DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services +within a DynamoGraphDeployment. It implements the Kubernetes scale +subresource, enabling integration with HPA, KEDA, and custom autoscalers. + +The adapter acts as an intermediary between autoscalers and the DGD, +ensuring that only the adapter controller modifies the DGD's service replicas. +This prevents conflicts when multiple autoscaling mechanisms are in play. + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | | +| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ | | | | +| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ | | | | + + +#### DynamoGraphDeploymentScalingAdapterSpec + + + +DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.
This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. | | Minimum: 0
Required: \{\}
| +| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. | | Required: \{\}
| + + +#### DynamoGraphDeploymentScalingAdapterStatus + + + +DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `replicas` _integer_ | Replicas is the current number of replicas for the target service.
This is synced from the DGD's service replicas and is required for the scale subresource. | | | +| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.
Required for HPA compatibility via the scale subresource. | | | +| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. | | | + + +#### DynamoGraphDeploymentServiceRef + + + +DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1
Required: \{\}
| +| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1
Required: \{\}
| + + +#### DynamoGraphDeploymentSpec + + + +DynamoGraphDeploymentSpec defines the desired state of DynamoGraphDeployment. + + + +_Appears in:_ +- [DynamoGraphDeployment](#dynamographdeployment) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.
Each PVC must have a unique name that can be referenced in component specifications. | | MaxItems: 100
Optional: \{\}
| +| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | MaxProperties: 25
Optional: \{\}
| +| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless
overridden by service-specific configuration. | | Optional: \{\}
| +| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm]
| + + +#### DynamoGraphDeploymentStatus + + + +DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment. + + + +_Appears in:_ +- [DynamoGraphDeployment](#dynamographdeployment) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `state` _string_ | State is a high-level textual status of the graph deployment lifecycle. | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.
The slice is merged by type on patch updates. | | | +| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.
The map key is the service name from spec.services. | | | + + +#### DynamoModel + + + +DynamoModel is the Schema for the dynamo models API + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | | +| `kind` _string_ | `DynamoModel` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[DynamoModelSpec](#dynamomodelspec)_ | | | | +| `status` _[DynamoModelStatus](#dynamomodelstatus)_ | | | | + + +#### DynamoModelSpec + + + +DynamoModelSpec defines the desired state of DynamoModel + + + +_Appears in:_ +- [DynamoModel](#dynamomodel) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") | | Required: \{\}
| +| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label
This is used to discover endpoints via headless services | | Required: \{\}
| +| `modelType` _string_ | ModelType specifies the type of model (e.g., "base", "lora", "adapter") | base | Enum: [base lora adapter]
| +| `source` _[ModelSource](#modelsource)_ | Source specifies the model source location (only applicable for lora model type) | | | + + +#### DynamoModelStatus + + + +DynamoModelStatus defines the observed state of DynamoModel + + + +_Appears in:_ +- [DynamoModel](#dynamomodel) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `endpoints` _[EndpointInfo](#endpointinfo) array_ | Endpoints is the current list of all endpoints for this model | | | +| `readyEndpoints` _integer_ | ReadyEndpoints is the count of endpoints that are ready | | | +| `totalEndpoints` _integer_ | TotalEndpoints is the total count of endpoints | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represents the latest available observations of the model's state | | | + + +#### EndpointInfo + + + +EndpointInfo represents a single endpoint (pod) serving the model + + + +_Appears in:_ +- [DynamoModelStatus](#dynamomodelstatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `address` _string_ | Address is the full address of the endpoint (e.g., "http://10.0.1.5:9090") | | | +| `podName` _string_ | PodName is the name of the pod serving this endpoint | | | +| `ready` _boolean_ | Ready indicates whether the endpoint is ready to serve traffic
For LoRA models: true if the POST /loras request succeeded with a 2xx status code
For base models: always false (no probing performed) | | | + + +#### ExtraPodMetadata + + + + + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `annotations` _object (keys:string, values:string)_ | | | | +| `labels` _object (keys:string, values:string)_ | | | | + + +#### ExtraPodSpec + + + + + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `mainContainer` _[Container](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#container-v1-core)_ | | | | + + +#### IngressSpec + + + + + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `enabled` _boolean_ | Enabled exposes the component through an ingress or virtual service when true. | | | +| `host` _string_ | Host is the base host name to route external traffic to this component. | | | +| `useVirtualService` _boolean_ | UseVirtualService indicates whether to configure a service-mesh VirtualService instead of a standard Ingress. | | | +| `virtualServiceGateway` _string_ | VirtualServiceGateway optionally specifies the gateway name to attach the VirtualService to. | | | +| `hostPrefix` _string_ | HostPrefix is an optional prefix added before the host. | | | +| `annotations` _object (keys:string, values:string)_ | Annotations to set on the generated Ingress/VirtualService resources. | | | +| `labels` _object (keys:string, values:string)_ | Labels to set on the generated Ingress/VirtualService resources. 
| | | +| `tls` _[IngressTLSSpec](#ingresstlsspec)_ | TLS holds the TLS configuration used by the Ingress/VirtualService. | | | +| `hostSuffix` _string_ | HostSuffix is an optional suffix appended after the host. | | | +| `ingressControllerClassName` _string_ | IngressControllerClassName selects the ingress controller class (e.g., "nginx"). | | | + + +#### IngressTLSSpec + + + + + + + +_Appears in:_ +- [IngressSpec](#ingressspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `secretName` _string_ | SecretName is the name of a Kubernetes Secret containing the TLS certificate and key. | | | + + + + +#### ModelReference + + + +ModelReference identifies a model served by this component + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") | | Required: \{\}
| +| `revision` _string_ | Revision is the model revision/version (optional) | | | + + +#### ModelSource + + + +ModelSource defines the source location of a model + + + +_Appears in:_ +- [DynamoModelSpec](#dynamomodelspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `uri` _string_ | URI is the model source URI
Supported formats:
- S3: s3://bucket/path/to/model
- HuggingFace: hf://org/model@revision_sha | | Required: \{\}
| + + +#### MultinodeSpec + + + + + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `nodeCount` _integer_ | Indicates the number of nodes to deploy for multinode components.
Total number of GPUs is NumberOfNodes * GPU limit.
Must be greater than 1. | 2 | Minimum: 2
| + + +#### PVC + + + + + + + +_Appears in:_ +- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `create` _boolean_ | Create indicates to create a new PVC | | | +| `name` _string_ | Name is the name of the PVC | | Required: \{\}
| +| `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | | +| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | | +| `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | | + + +#### ProfilingConfigSpec + + + +ProfilingConfigSpec defines configuration for the profiling process. +This structure maps directly to the profile_sla.py config format. +See benchmarks/profiler/utils/profiler_argparse.py for the complete schema. + + + +_Appears in:_ +- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.
The profiler will validate the configuration and report any errors. | | Optional: \{\}
Type: object
| +| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment
base config file (disagg.yaml). This is separate from the profiling config above.
The path to this config will be set as engine.config in the profiling config. | | Optional: \{\}
| +| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\}
| +| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
If specified, all profiling artifacts (logs, plots, configs, raw data) will be written
to this PVC instead of an ephemeral emptyDir volume. This allows users to access
complete profiling results after the job completes by mounting the PVC.
The PVC must exist in the same namespace as the DGDR.
If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.
Note: ConfigMaps are still created regardless of this setting for planner integration. | | Optional: \{\}
| +| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.
If not specified, no resource requests or limits are set. | | Optional: \{\}
| +| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.
For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. | | Optional: \{\}
| + + +#### ResourceItem + + + + + + + +_Appears in:_ +- [Resources](#resources) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `cpu` _string_ | CPU specifies the CPU resource request/limit (e.g., "1000m", "2") | | | +| `memory` _string_ | Memory specifies the memory resource request/limit (e.g., "4Gi", "8Gi") | | | +| `gpu` _string_ | GPU indicates the number of GPUs to request.
Total number of GPUs is NumberOfNodes * GPU in case of multinode deployment. | | | +| `gpuType` _string_ | GPUType can specify a custom GPU type, e.g. "gpu.intel.com/xe"
By default if not specified, the GPU type is "nvidia.com/gpu" | | | +| `custom` _object (keys:string, values:string)_ | Custom specifies additional custom resource requests/limits | | | + + +#### Resources + + + +Resources defines requests and limits for a component, including CPU, memory, +GPUs/devices, and any runtime-specific resources. + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `requests` _[ResourceItem](#resourceitem)_ | Requests specifies the minimum resources required by the component | | | +| `limits` _[ResourceItem](#resourceitem)_ | Limits specifies the maximum resources allowed for the component | | | +| `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation | | | + + +#### ScalingAdapter + + + +ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter +for replica management. When enabled, the DGDSA owns the replicas field and +external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource. + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `enabled` _boolean_ | Enabled indicates whether the ScalingAdapter should be enabled for this service.
When true, a DGDSA is created and owns the replicas field.
When false (default), no DGDSA is created and replicas can be modified directly in the DGD. | false | | + + +#### ServiceReplicaStatus + + + +ServiceReplicaStatus contains replica information for a single service. + + + +_Appears in:_ +- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `componentKind` _[ComponentKind](#componentkind)_ | ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet"). | | Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet]
| +| `componentName` _string_ | ComponentName is the name of the underlying resource. | | | +| `replicas` _integer_ | Replicas is the total number of non-terminated replicas.
Required for all component kinds. | | Minimum: 0
| +| `updatedReplicas` _integer_ | UpdatedReplicas is the number of replicas at the current/desired revision.
Required for all component kinds. | | Minimum: 0
| +| `readyReplicas` _integer_ | ReadyReplicas is the number of ready replicas.
Populated for PodClique, Deployment, and LeaderWorkerSet.
Not available for PodCliqueScalingGroup.
When nil, the field is omitted from the API response. | | Minimum: 0
| +| `availableReplicas` _integer_ | AvailableReplicas is the number of available replicas.
For Deployment: replicas ready for >= minReadySeconds.
For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.
Not available for PodClique or LeaderWorkerSet.
When nil, the field is omitted from the API response. | | Minimum: 0
| + + +#### SharedMemorySpec + + + + + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `disabled` _boolean_ | | | | +| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | | + + +#### VolumeMount + + + +VolumeMount references a PVC defined at the top level for volumes to be mounted by the component + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\}
| +| `mountPoint` _string_ | MountPoint specifies where to mount the volume.
If useAsCompilationCache is true and mountPoint is not specified,
a backend-specific default will be used. | | | +| `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.
When true, backend-specific environment variables will be set and default mount points may be used. | false | | + + +# Operator Default Values Injection + +The Dynamo operator automatically applies default values to various fields when they are not explicitly specified in your deployments. These defaults include: + +- **Health Probes**: Startup, liveness, and readiness probes are configured differently for frontend, worker, and planner components. For example, worker components receive a startup probe with a 2-hour timeout (720 failures × 10 seconds) to accommodate long model loading times. + +- **Security Context**: All components receive `fsGroup: 1000` by default to ensure proper file permissions for mounted volumes. This can be overridden via the `extraPodSpec.securityContext` field. + +- **Shared Memory**: All components receive an 8Gi shared memory volume mounted at `/dev/shm` by default (can be disabled or resized via the `sharedMemory` field). + +- **Environment Variables**: Components automatically receive environment variables like `DYN_NAMESPACE`, `DYN_PARENT_DGD_K8S_NAME`, `DYNAMO_PORT`, and backend-specific variables. + +- **Pod Configuration**: Default `terminationGracePeriodSeconds` of 60 seconds and `restartPolicy: Always`. + +- **Autoscaling**: When enabled without explicit metrics, defaults to CPU-based autoscaling with 80% target utilization. + +- **Backend-Specific Behavior**: For multinode deployments, probes are automatically modified or removed for worker nodes depending on the backend framework (VLLM, SGLang, or TensorRT-LLM). 
+ +## Pod Specification Defaults + +All components receive the following pod-level defaults unless overridden: + +- **`terminationGracePeriodSeconds`**: `60` seconds +- **`restartPolicy`**: `Always` + +## Security Context + +The operator automatically applies default security context settings to all components to ensure proper file permissions, particularly for mounted volumes: + +- **`fsGroup`**: `1000` - Sets the group ownership of mounted volumes and any files created in those volumes + +This default ensures that non-root containers can write to mounted volumes (like model caches or persistent storage) without permission issues. The `fsGroup` setting is particularly important for: +- Model downloads and caching +- Compilation cache directories +- Persistent volume claims (PVCs) +- SSH key generation in multinode deployments + +### Overriding Security Context + +To override the default security context, specify your own `securityContext` in the `extraPodSpec` of your component: + +```yaml +services: + YourWorker: + extraPodSpec: + securityContext: + fsGroup: 2000 # Custom group ID + runAsUser: 1000 + runAsGroup: 1000 + runAsNonRoot: true +``` + +**Important**: When you provide *any* `securityContext` object in `extraPodSpec`, the operator will not inject any defaults. This gives you complete control over the security context, including the ability to run as root (by omitting `runAsNonRoot` or setting it to `false`). 
+ +### OpenShift and Security Context Constraints + +In OpenShift environments with Security Context Constraints (SCCs), you may need to omit explicit UID/GID values to allow OpenShift's admission controllers to assign them dynamically: + +```yaml +services: + YourWorker: + extraPodSpec: + securityContext: + # Omit fsGroup to let OpenShift assign it based on SCC + # OpenShift will inject the appropriate UID range +``` + +Alternatively, if you want to keep the default `fsGroup: 1000` behavior and are certain your cluster allows it, you don't need to specify anything - the operator defaults will work. + +## Shared Memory Configuration + +Shared memory is enabled by default for all components: + +- **Enabled**: `true` (unless explicitly disabled via `sharedMemory.disabled`) +- **Size**: `8Gi` +- **Mount Path**: `/dev/shm` +- **Volume Type**: `emptyDir` with `memory` medium + +To disable shared memory or customize the size, use the `sharedMemory` field in your component specification. + +## Health Probes by Component Type + +The operator applies different default health probes based on the component type. 
+ +### Frontend Components + +Frontend components receive the following probe configurations: + +**Liveness Probe:** +- **Type**: HTTP GET +- **Path**: `/health` +- **Port**: `http` (8000) +- **Initial Delay**: 60 seconds +- **Period**: 60 seconds +- **Timeout**: 30 seconds +- **Failure Threshold**: 10 + +**Readiness Probe:** +- **Type**: Exec command +- **Command**: `curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""` +- **Initial Delay**: 60 seconds +- **Period**: 60 seconds +- **Timeout**: 30 seconds +- **Failure Threshold**: 10 + +### Worker Components + +Worker components receive the following probe configurations: + +**Liveness Probe:** +- **Type**: HTTP GET +- **Path**: `/live` +- **Port**: `system` (9090) +- **Period**: 5 seconds +- **Timeout**: 30 seconds +- **Failure Threshold**: 1 + +**Readiness Probe:** +- **Type**: HTTP GET +- **Path**: `/health` +- **Port**: `system` (9090) +- **Period**: 10 seconds +- **Timeout**: 30 seconds +- **Failure Threshold**: 60 + +**Startup Probe:** +- **Type**: HTTP GET +- **Path**: `/live` +- **Port**: `system` (9090) +- **Period**: 10 seconds +- **Timeout**: 5 seconds +- **Failure Threshold**: 720 (allows up to 2 hours for startup: 10s × 720 = 7200s) + + +**For larger models (typically >70B parameters) or slower storage systems, you may need to increase the `failureThreshold` to allow more time for model loading. Calculate the required threshold based on your expected startup time: `failureThreshold = (expected_startup_seconds / period)`. 
Override the startup probe in your component specification if the default 2-hour window is insufficient.** + + +### Multinode Deployment Probe Modifications + +For multinode deployments, the operator modifies probes based on the backend framework and node role: + +#### VLLM Backend + +The operator automatically selects between two deployment modes based on parallelism configuration: + +**Tensor/Pipeline Parallel Mode** (when `world_size > GPUs_per_node`): +- Uses Ray for distributed execution (`--distributed-executor-backend ray`) +- **Leader nodes**: Starts Ray head and runs vLLM; all probes remain active +- **Worker nodes**: Run Ray agents only; all probes (liveness, readiness, startup) are removed + +**Data Parallel Mode** (when `world_size × data_parallel_size > GPUs_per_node`): +- **Worker nodes**: All probes (liveness, readiness, startup) are removed +- **Leader nodes**: All probes remain active + +#### SGLang Backend +- **Worker nodes**: All probes (liveness, readiness, startup) are removed + +#### TensorRT-LLM Backend +- **Leader nodes**: All probes remain unchanged +- **Worker nodes**: + - Liveness and startup probes are removed + - Readiness probe is replaced with a TCP socket check on SSH port (2222): + - **Initial Delay**: 20 seconds + - **Period**: 20 seconds + - **Timeout**: 5 seconds + - **Failure Threshold**: 10 + +## Environment Variables + +The operator automatically injects environment variables based on component type and configuration: + +### All Components + +- **`DYN_NAMESPACE`**: The Dynamo namespace for the component +- **`DYN_PARENT_DGD_K8S_NAME`**: The parent DynamoGraphDeployment Kubernetes resource name +- **`DYN_PARENT_DGD_K8S_NAMESPACE`**: The parent DynamoGraphDeployment Kubernetes namespace + +### Frontend Components + +- **`DYNAMO_PORT`**: `8000` +- **`DYN_HTTP_PORT`**: `8000` + +### Worker Components + +- **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server) +- **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: 
`["generate"]` +- **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older) + +### Planner Components + +- **`PLANNER_PROMETHEUS_PORT`**: `9085` + +### VLLM Backend (with compilation cache) + +When a volume mount is configured with `useAsCompilationCache: true`: +- **`VLLM_CACHE_ROOT`**: Set to the mount point of the cache volume + +## Service Account + +Planner components automatically receive the following service account: + +- **`serviceAccountName`**: `planner-serviceaccount` + +## Image Pull Secrets + +The operator automatically discovers and injects image pull secrets for container images. When a component specifies a container image, the operator: + +1. Scans all Kubernetes secrets of type `kubernetes.io/dockerconfigjson` in the component's namespace +2. Extracts the docker registry server URLs from each secret's authentication configuration +3. Matches the container image's registry host against the discovered registry URLs +4. Automatically injects matching secrets as `imagePullSecrets` in the pod specification + +This eliminates the need to manually specify image pull secrets for each component. The operator maintains an internal index of docker secrets and their associated registries, refreshing this index periodically. 
+ +**To disable automatic image pull secret discovery** for a specific component, add the following annotation: + +```yaml +annotations: + nvidia.com/disable-image-pull-secret-discovery: "true" +``` + +## Autoscaling Defaults + +When autoscaling is enabled but no metrics are specified, the operator applies: + +- **Default Metric**: CPU utilization +- **Target Average Utilization**: `80%` + +## Port Configurations + +Default container ports are configured based on component type: + +### Frontend Components +- **Port**: 8000 +- **Protocol**: TCP +- **Name**: `http` + +### Worker Components +- **Port**: 9090 +- **Protocol**: TCP +- **Name**: `system` + +### Planner Components +- **Port**: 9085 +- **Protocol**: TCP +- **Name**: `metrics` + +## Backend-Specific Configurations + +### VLLM +- **Ray Head Port**: 6379 (for Ray cluster coordination in multinode TP/PP deployments) +- **Data Parallel RPC Port**: 13445 (for data parallel multinode deployments) + +### SGLang +- **Distribution Init Port**: 29500 (for multinode deployments) + +### TensorRT-LLM +- **SSH Port**: 2222 (for multinode MPI communication) +- **OpenMPI Environment**: `OMPI_MCA_orte_keep_fqdn_hostnames=1` + +## Implementation Reference + +For users who want to understand the implementation details or contribute to the operator, the default values described in this document are set in the following source files: + +- **Health Probes, Security Context & Pod Specifications**: [`internal/dynamo/graph.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/graph.go) - Contains the main logic for applying default probes, security context, environment variables, shared memory, and pod configurations +- **Component-Specific Defaults**: + - [`internal/dynamo/component_frontend.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_frontend.go) + - 
[`internal/dynamo/component_worker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_worker.go) + - [`internal/dynamo/component_planner.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_planner.go) +- **Image Pull Secrets**: [`internal/secrets/docker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/secrets/docker.go) - Implements the docker secret indexer and automatic discovery +- **Backend-Specific Behavior**: + - [`internal/dynamo/backend_vllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_vllm.go) + - [`internal/dynamo/backend_sglang.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_sglang.go) + - [`internal/dynamo/backend_trtllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_trtllm.go) +- **Constants & Annotations**: [`internal/consts/consts.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/consts/consts.go) - Defines annotation keys and other constants + +## Notes + +- All these defaults can be overridden by explicitly specifying values in your DynamoComponentDeployment or DynamoGraphDeployment resources +- User-specified probes (via `livenessProbe`, `readinessProbe`, or `startupProbe` fields) take precedence over operator defaults +- For security context, if you provide *any* `securityContext` in `extraPodSpec`, no defaults will be injected, giving you full control +- For multinode deployments, some defaults are modified or removed as described above to accommodate distributed execution patterns +- The `extraPodSpec.mainContainer` field can be used to override probe configurations set by the operator diff --git a/fern/pages/kubernetes/autoscaling.md b/fern/pages/kubernetes/autoscaling.md new file mode 100644 index 00000000000..2d76006ce01 --- /dev/null +++ b/fern/pages/kubernetes/autoscaling.md 
@@ -0,0 +1,738 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Autoscaling" +--- + +This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services using the `sglang-agg` example from `examples/backends/sglang/deploy/agg.yaml`. + +## Example DGD + +All examples in this guide use the following DGD: + +```yaml +# examples/backends/sglang/deploy/agg.yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg + namespace: default +spec: + services: + Frontend: + dynamoNamespace: sglang-agg + componentType: frontend + replicas: 1 + + decode: + dynamoNamespace: sglang-agg + componentType: worker + replicas: 1 + resources: + limits: + gpu: "1" +``` + +**Key identifiers:** +- **DGD name**: `sglang-agg` +- **Namespace**: `default` +- **Services**: `Frontend`, `decode` +- **dynamo_namespace label**: `default-sglang-agg` (used for metric filtering) + +## Overview + +Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) resource. When you deploy a DGD, the operator automatically creates one adapter per service (unless explicitly disabled). 
These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with: + +| Autoscaler | Description | Best For | +|------------|-------------|----------| +| **KEDA** | Event-driven autoscaling (recommended) | Most use cases | +| **Kubernetes HPA** | Native horizontal pod autoscaling | Simple CPU/memory-based scaling | +| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads | +| **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements | + + +**Deprecation Notice:** The `spec.services[X].autoscaling` field in DGD is **deprecated and ignored**. Use DGDSA with HPA, KEDA, or Planner instead. If you have existing DGDs with `autoscaling` configured, you'll see a warning. Remove the field to silence the warning. + + +## Architecture + +``` +┌──────────────────────────────────┐ ┌─────────────────────────────────────┐ +│ DynamoGraphDeployment │ │ Scaling Adapters (auto-created) │ +│ "sglang-agg" │ │ (one per service) │ +├──────────────────────────────────┤ ├─────────────────────────────────────┤ +│ │ │ │ +│ spec.services: │ │ ┌─────────────────────────────┐ │ ┌──────────────────┐ +│ │ │ │ sglang-agg-frontend │◄───┼──────│ Autoscalers │ +│ ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 1 │ │ │ │ +│ │ Frontend: 1 replica │ │ │ └─────────────────────────────┘ │ │ • KEDA │ +│ └────────────────────────┘ │ │ │ │ • HPA │ +│ │ │ ┌─────────────────────────────┐ │ │ • Planner │ +│ ┌────────────────────────┐◄───┼──────────┼──│ sglang-agg-decode │◄───┼──────│ • Custom │ +│ │ decode: 1 replica │ │ │ │ spec.replicas: 1 │ │ │ │ +│ └────────────────────────┘ │ │ └─────────────────────────────┘ │ └──────────────────┘ +│ │ │ │ +└──────────────────────────────────┘ └─────────────────────────────────────┘ +``` + +**How it works:** + +1. 
You deploy a DGD with services (Frontend, decode) +2. The operator auto-creates one DGDSA per service +3. Autoscalers (KEDA, HPA, Planner) target the adapters via `/scale` subresource +4. Adapter controller syncs replica changes to the DGD +5. DGD controller reconciles the underlying pods + +## Viewing Scaling Adapters + +After deploying the `sglang-agg` DGD, verify the auto-created adapters: + +```bash +kubectl get dgdsa -n default + +# Example output: +# NAME DGD SERVICE REPLICAS AGE +# sglang-agg-frontend sglang-agg Frontend 1 5m +# sglang-agg-decode sglang-agg decode 1 5m +``` + +## Replica Ownership Model + +When DGDSA is enabled, it becomes the **source of truth** for replica counts. This follows the same pattern as Kubernetes Deployments owning ReplicaSets. + +### How It Works + +1. **DGDSA owns replicas**: Autoscalers (HPA, KEDA, Planner) update the DGDSA's `spec.replicas` +2. **DGDSA syncs to DGD**: The DGDSA controller writes the replica count to the DGD's service +3. **Direct DGD edits blocked**: A validating webhook prevents users from directly editing `spec.services[X].replicas` in the DGD +4. **Controllers allowed**: Only authorized controllers (operator, Planner) can modify DGD replicas + +### Manual Scaling with DGDSA Enabled + +When DGDSA is enabled, use `kubectl scale` on the adapter (not the DGD): + +```bash +# ✅ Correct - scale via DGDSA +kubectl scale dgdsa sglang-agg-decode --replicas=3 + +# ❌ Blocked - direct DGD edit rejected by webhook +kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}' +# Error: spec.services[decode].replicas cannot be modified directly when scaling adapter is enabled; +# use 'kubectl scale dgdsa/sglang-agg-decode --replicas=3' or update the DynamoGraphDeploymentScalingAdapter instead +``` + +## Enabling DGDSA for a Service + +By default, no DGDSA is created for services, allowing direct replica management via the DGD.
To enable autoscaling via HPA, KEDA, or Planner, explicitly enable the scaling adapter: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg +spec: + services: + Frontend: + replicas: 2 # ← No DGDSA by default, direct edits allowed + + decode: + replicas: 1 + scalingAdapter: + enabled: true # ← DGDSA created, managed via adapter +``` + +**When to enable DGDSA:** +- You want to use HPA, KEDA, or Planner for autoscaling +- You want a clear separation between "desired scale" (adapter) and "deployment config" (DGD) +- You want protection against accidental direct replica edits + +**When to keep DGDSA disabled (default):** +- You want simple, manual replica management +- You don't need autoscaling for that service +- You prefer direct DGD edits over adapter-based scaling + +## Autoscaling with Dynamo Planner + +The Dynamo Planner is an LLM-aware autoscaler that optimizes scaling decisions based on inference-specific metrics like Time To First Token (TTFT), Inter-Token Latency (ITL), and KV cache utilization. + +**When to use Planner:** +- You want LLM-optimized autoscaling out of the box +- You need coordinated scaling across prefill/decode services +- You want SLA-driven scaling (e.g., target TTFT < 500ms) + +**How Planner works:** + +Planner is deployed as a service component within your DGD. It: +1. Queries Prometheus for frontend metrics (request rate, latency, etc.) +2. Uses profiling data to predict optimal replica counts +3. Scales prefill/decode workers to meet SLA targets + +**Deployment:** + +The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR). See the [SLA Planner Quick Start](../planner/sla-planner-quickstart.md) for complete instructions. 
+ +Example configurations with Planner: +- `examples/backends/vllm/deploy/disagg_planner.yaml` +- `examples/backends/sglang/deploy/disagg_planner.yaml` +- `examples/backends/trtllm/deploy/disagg_planner.yaml` + +For more details, see the [SLA Planner documentation](../planner/sla-planner.md). + +## Autoscaling with Kubernetes HPA + +The Horizontal Pod Autoscaler (HPA) is Kubernetes' native autoscaling solution. + +**When to use HPA:** +- You have simple, predictable scaling requirements +- You want to use standard Kubernetes tooling +- You need CPU or memory-based scaling + +> **Note**: For custom metrics (like TTFT or queue depth), consider using [KEDA](#autoscaling-with-keda-recommended) instead - it's simpler to configure. + +### Basic HPA (CPU-based) + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: sglang-agg-frontend-hpa + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-frontend + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 0 +``` + +### HPA with Dynamo Metrics + +Dynamo exports several metrics useful for autoscaling. These are available at the `/metrics` endpoint on each frontend pod. + +> **See also**: For a complete list of all Dynamo metrics, see the [Metrics Reference](../observability/metrics.md). For Prometheus and Grafana setup, see the [Prometheus and Grafana Setup Guide](../observability/prometheus-grafana.md). 
+ +#### Available Dynamo Metrics + +| Metric | Type | Description | Good for scaling | +|--------|------|-------------|------------------| +| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Workers | +| `dynamo_frontend_inflight_requests` | Gauge | Concurrent requests to engine | ✅ All services | +| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Workers | +| `dynamo_frontend_inter_token_latency_seconds` | Histogram | ITL latency | ✅ Decode | +| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | ⚠️ General | +| `kvstats_gpu_cache_usage_percent` | Gauge | GPU KV cache usage (0-1) | ✅ Decode | + +#### Metric Labels + +Dynamo metrics include these labels for filtering: + +| Label | Description | Example | +|-------|-------------|---------| +| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dynamoNamespace}`) | `default-sglang-agg` | +| `model` | Model being served | `Qwen/Qwen3-0.6B` | + +> **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD. + +#### Example: Scale Decode Service Based on TTFT + +Using HPA with Prometheus Adapter requires configuring external metrics. 
+ +**Step 1: Configure Prometheus Adapter** + +Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`): + +```yaml +# prometheus-adapter-values.yaml +prometheus: + url: http://prometheus-kube-prometheus-prometheus.monitoring.svc + port: 9090 + +rules: + external: + # TTFT p95 from frontend - used to scale decode + - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + name: + as: "dynamo_ttft_p95_seconds" + metricsQuery: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) + by (le, namespace, dynamo_namespace) + ) +``` + +**Step 2: Install Prometheus Adapter** + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update + +helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter \ + -n monitoring --create-namespace \ + -f prometheus-adapter-values.yaml +``` + +**Step 3: Verify the metric is available** + +```bash +kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dynamo_ttft_p95_seconds" | jq +``` + +**Step 4: Create the HPA** + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: sglang-agg-decode-hpa +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode # ← DGD name + service name (lowercase) + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: External + external: + metric: + name: dynamo_ttft_p95_seconds + selector: + matchLabels: + dynamo_namespace: "default-sglang-agg" # ← {namespace}-{dynamoNamespace} + target: + type: Value + value: "500m" # Scale up when TTFT p95 > 500ms + behavior: + scaleDown: + stabilizationWindowSeconds: 60 # Wait 1 min before scaling down + policies: + - type: Pods + value: 1 + periodSeconds: 30 + scaleUp: + stabilizationWindowSeconds: 0 # Scale up
immediately + policies: + - type: Pods + value: 2 + periodSeconds: 30 +``` + +**How it works:** +1. Frontend pods export `dynamo_frontend_time_to_first_token_seconds` histogram +2. Prometheus Adapter calculates p95 TTFT per `dynamo_namespace` +3. HPA monitors this metric filtered by `dynamo_namespace: "default-sglang-agg"` +4. When TTFT p95 > 500ms, HPA scales up the `sglang-agg-decode` adapter +5. Adapter controller syncs the replica count to the DGD's `decode` service +6. More decode workers are created, reducing TTFT + +#### Example: Scale Based on Queue Depth + +Add this rule to your `prometheus-adapter-values.yaml` (alongside the TTFT rule): + +```yaml +# Add to rules.external in prometheus-adapter-values.yaml +- seriesQuery: 'dynamo_frontend_queued_requests{namespace!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + name: + as: "dynamo_queued_requests" + metricsQuery: | + sum(<<.Series>>{<<.LabelMatchers>>}) by (namespace, dynamo_namespace) +``` + +Then create the HPA: + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: sglang-agg-decode-queue-hpa + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: External + external: + metric: + name: dynamo_queued_requests + selector: + matchLabels: + dynamo_namespace: "default-sglang-agg" + target: + type: Value + value: "10" # Scale up when queue > 10 requests +``` + +## Autoscaling with KEDA (Recommended) + +KEDA (Kubernetes Event-driven Autoscaling) extends Kubernetes with event-driven autoscaling, supporting 50+ scalers including Prometheus. 
+ +**Advantages over HPA + Prometheus Adapter:** +- No Prometheus Adapter configuration needed +- PromQL queries are defined in the ScaledObject itself (declarative, per-deployment) +- Easy to update - just `kubectl apply` the ScaledObject +- Can scale to zero when idle +- Supports multiple triggers per object + +**When to use KEDA:** +- You want simpler configuration (no Prometheus Adapter to manage) +- You need event-driven scaling (e.g., queue depth, Kafka, etc.) +- You want to scale to zero when idle + +### Installing KEDA + +```bash +# Add KEDA Helm repo +helm repo add kedacore https://kedacore.github.io/charts +helm repo update + +# Install KEDA +helm install keda kedacore/keda \ + --namespace keda \ + --create-namespace + +# Verify installation +kubectl get pods -n keda +``` + +> **Note**: If you have Prometheus Adapter installed, either uninstall it first (`helm uninstall prometheus-adapter -n monitoring`) or install KEDA with `--set metricsServer.enabled=false` to avoid API conflicts. 
+ +### Example: Scale Decode Based on TTFT + +Using the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`: + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: sglang-agg-decode-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode + minReplicaCount: 1 + maxReplicaCount: 10 + pollingInterval: 15 # Check metrics every 15 seconds + cooldownPeriod: 60 # Wait 60s before scaling down + triggers: + - type: prometheus + metadata: + # Update this URL to match your Prometheus service + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + metricName: dynamo_ttft_p95 + query: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) + by (le) + ) + threshold: "0.5" # Scale up when TTFT p95 > 500ms (0.5 seconds) + activationThreshold: "0.1" # Start scaling when TTFT > 100ms +``` + +Apply it: + +```bash +kubectl apply -f sglang-agg-decode-scaler.yaml +``` + +### Verify KEDA Scaling + +```bash +# Check ScaledObject status +kubectl get scaledobject -n default + +# KEDA creates an HPA under the hood - you can see it +kubectl get hpa -n default + +# Example output: +# NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS +# keda-hpa-sglang-agg-decode-scaler DynamoGraphDeploymentScalingAdapter/sglang-agg-decode 45m/500m 1 10 1 + +# Get detailed status +kubectl describe scaledobject sglang-agg-decode-scaler -n default +``` + +### Example: Scale Based on Queue Depth + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: sglang-agg-decode-queue-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode + minReplicaCount: 1 + maxReplicaCount: 10 + pollingInterval: 15 + cooldownPeriod: 60 + triggers: + - type: prometheus + 
metadata: + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + metricName: dynamo_queued_requests + query: | + sum(dynamo_frontend_queued_requests{dynamo_namespace="default-sglang-agg"}) + threshold: "10" # Scale up when queue > 10 requests +``` + +### How KEDA Works + +KEDA creates and manages an HPA under the hood: + +``` +┌──────────────────────────────────────────────────────────────────────┐ +│ You create: ScaledObject │ +│ - scaleTargetRef: sglang-agg-decode │ +│ - triggers: prometheus query │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ KEDA Operator automatically creates: HPA │ +│ - name: keda-hpa-sglang-agg-decode-scaler │ +│ - scaleTargetRef: sglang-agg-decode │ +│ - metrics: External (from KEDA metrics server) │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ DynamoGraphDeploymentScalingAdapter: sglang-agg-decode │ +│ - spec.replicas: updated by HPA │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ DynamoGraphDeployment: sglang-agg │ +│ - spec.services.decode.replicas: synced from adapter │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +## Mixed Autoscaling + +For disaggregated deployments (prefill + decode), you can use different autoscaling strategies for different services: + +```yaml +--- +# HPA for Frontend (CPU-based) +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: sglang-agg-frontend-hpa + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-frontend + minReplicas: 1 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu 
+ target: + type: Utilization + averageUtilization: 70 + +--- +# KEDA for Decode (TTFT-based) +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: sglang-agg-decode-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode + minReplicaCount: 1 + maxReplicaCount: 10 + triggers: + - type: prometheus + metadata: + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + query: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) + by (le) + ) + threshold: "0.5" +``` + +## Manual Scaling + +### With DGDSA Enabled (Default) + +When DGDSA is enabled (the default), scale via the adapter: + +```bash +kubectl scale dgdsa sglang-agg-decode -n default --replicas=3 +``` + +Verify the scaling: + +```bash +kubectl get dgdsa sglang-agg-decode -n default + +# Output: +# NAME DGD SERVICE REPLICAS AGE +# sglang-agg-decode sglang-agg decode 3 10m +``` + +> **Note**: If an autoscaler (KEDA, HPA, Planner) is managing the adapter, your change will be overwritten on the next evaluation cycle. + +### With DGDSA Disabled + +If you've disabled the scaling adapter for a service, edit the DGD directly: + +```bash +kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}' +``` + +Or edit the YAML (no `scalingAdapter.enabled: true` means direct edits are allowed): + +```yaml +spec: + services: + decode: + replicas: 3 + # No scalingAdapter.enabled means replicas can be edited directly +``` + +## Best Practices + +### 1. 
Choose One Autoscaler Per Service + +Avoid configuring multiple autoscalers for the same service: + +| Configuration | Status | +|---------------|--------| +| HPA for frontend, Planner for prefill/decode | ✅ Good | +| KEDA for all services | ✅ Good | +| Planner only (default) | ✅ Good | +| HPA + Planner both targeting decode | ❌ Bad - they will fight | + +### 2. Use Appropriate Metrics + +| Service Type | Recommended Metrics | Dynamo Metric | +|--------------|---------------------|---------------| +| Frontend | CPU utilization, request rate | `dynamo_frontend_requests_total` | +| Prefill | Queue depth, TTFT | `dynamo_frontend_queued_requests`, `dynamo_frontend_time_to_first_token_seconds` | +| Decode | KV cache utilization, ITL | `kvstats_gpu_cache_usage_percent`, `dynamo_frontend_inter_token_latency_seconds` | + +### 3. Configure Stabilization Windows + +Prevent thrashing with appropriate stabilization: + +```yaml +# HPA +behavior: + scaleDown: + stabilizationWindowSeconds: 300 # Wait 5 min before scaling down + scaleUp: + stabilizationWindowSeconds: 0 # Scale up immediately + +# KEDA +spec: + cooldownPeriod: 300 +``` + +### 4. 
Set Sensible Min/Max Replicas + +Always configure minimum and maximum replicas in your HPA/KEDA to prevent: +- Scaling to zero (unless intentional) +- Unbounded scaling that exhausts cluster resources + +## Troubleshooting + +### Adapters Not Created + +```bash +# Check DGD status +kubectl describe dgd sglang-agg -n default + +# Check operator logs +kubectl logs -n dynamo-system deployment/dynamo-operator +``` + +### Scaling Not Working + +```bash +# Check adapter status +kubectl describe dgdsa sglang-agg-decode -n default + +# Check HPA/KEDA status +kubectl describe hpa sglang-agg-decode-hpa -n default +kubectl describe scaledobject sglang-agg-decode-scaler -n default + +# Verify metrics are available in Kubernetes metrics API +kubectl get --raw /apis/external.metrics.k8s.io/v1beta1 +``` + +### Metrics Not Available + +If HPA/KEDA shows `&lt;unknown&gt;` for metrics: + +```bash +# Check if Dynamo metrics are being scraped +kubectl port-forward -n default svc/sglang-agg-frontend 8000:8000 +curl http://localhost:8000/metrics | grep dynamo_frontend + +# Example output: +# dynamo_frontend_queued_requests{model="Qwen/Qwen3-0.6B"} 2 +# dynamo_frontend_inflight_requests{model="Qwen/Qwen3-0.6B"} 5 + +# Verify Prometheus is scraping the metrics +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 +# Then query: dynamo_frontend_time_to_first_token_seconds_bucket + +# Check KEDA operator logs +kubectl logs -n keda deployment/keda-operator +``` + +### Rapid Scaling Up and Down + +If you see unstable scaling: + +1. Check if multiple autoscalers are targeting the same adapter +2. Increase `cooldownPeriod` in KEDA ScaledObject +3.
Increase `stabilizationWindowSeconds` in HPA behavior + +## References + +- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) +- [KEDA Documentation](https://keda.sh/) +- [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter) +- [Planner Documentation](../planner/sla-planner.md) +- [Dynamo Metrics Reference](../observability/metrics.md) +- [Prometheus and Grafana Setup](../observability/prometheus-grafana.md) + diff --git a/fern/pages/kubernetes/deployment/create-deployment.md b/fern/pages/kubernetes/deployment/create-deployment.md new file mode 100644 index 00000000000..5d7b2934087 --- /dev/null +++ b/fern/pages/kubernetes/deployment/create-deployment.md @@ -0,0 +1,263 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Creating Kubernetes Deployments" +--- + +The scripts in the `examples//launch` folder like [agg.sh](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/launch/agg.sh) demonstrate how you can serve your models locally. +The corresponding YAML files like [agg.yaml](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/agg.yaml) show you how you could create a Kubernetes deployment for your inference graph. + +This guide explains how to create your own deployment files. + +## Step 1: Choose Your Architecture Pattern + +Before choosing a template, understand the different architecture patterns: + +### Aggregated Serving (agg.yaml) + +**Pattern**: Prefill and decode on the same GPU in a single process. 
+ +**Suggested to use for**: +- Small to medium models (under 70B parameters) +- Development and testing +- Low to moderate traffic +- Simplicity is prioritized over maximum throughput + +**Tradeoffs**: +- Simpler setup and debugging +- Lower operational complexity +- GPU utilization may not be optimal (prefill and decode compete for resources) +- Lower throughput ceiling compared to disaggregated + +**Example**: [`agg.yaml`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/agg.yaml) + +### Aggregated + Router (agg_router.yaml) + +**Pattern**: Load balancer routing across multiple aggregated worker instances. + +**Suggested to use for**: +- Medium traffic requiring high availability +- Need horizontal scaling +- Want some load balancing without disaggregation complexity + +**Tradeoffs**: +- Better scalability than plain aggregated +- High availability through multiple replicas +- Still has GPU underutilization issues of aggregated serving +- More complex than plain aggregated but simpler than disaggregated + +**Example**: [`agg_router.yaml`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/agg_router.yaml) + +### Disaggregated Serving (disagg_router.yaml) + +**Pattern**: Separate prefill and decode workers with specialized optimization. + +**Suggested to use for**: +- Production-style deployments +- High throughput requirements +- Large models (70B+ parameters) +- Maximum GPU utilization needed + +**Tradeoffs**: +- Maximum performance and throughput +- Better GPU utilization (prefill and decode specialized) +- Independent scaling of prefill and decode +- More complex setup and debugging +- Requires understanding of prefill/decode separation + +**Example**: [`disagg_router.yaml`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/disagg_router.yaml) + +### Quick Selection Guide + +Select the architecture pattern as your template that best fits your use case. 
+ +For example, when using the `vLLM` backend: + +- **Development / Testing**: Use [`agg.yaml`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/agg.yaml) as the base configuration. + +- **Production with Load Balancing**: Use [`agg_router.yaml`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/agg_router.yaml) to enable scalable, load-balanced inference. + +- **High Performance / Disaggregated Deployment**: Use [`disagg_router.yaml`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/disagg_router.yaml) for maximum throughput and modular scalability. + + +## Step 2: Customize the Template + +You can run the Frontend on one machine, for example a CPU node, and the worker on a different machine (a GPU node). +The Frontend serves as a framework-agnostic HTTP entry point and is likely not to need many changes. + +It serves the following roles: +1. OpenAI-Compatible HTTP Server + * Provides `/v1/chat/completions` endpoint + * Handles HTTP request/response formatting + * Supports streaming responses + * Validates incoming requests + +2. Service Discovery and Routing + * Auto-discovers backend workers via etcd + * Routes requests to the appropriate Processor/Worker components + * Handles load balancing between multiple workers + +3. Request Preprocessing + * Initial request validation + * Model name verification + * Request format standardization + +You should then pick a worker and specialize the config. 
For example, + +```yaml +VllmWorker: # vLLM-specific config + enforce-eager: true + enable-prefix-caching: true + +SglangWorker: # SGLang-specific config + router-mode: kv + disagg-mode: true + +TrtllmWorker: # TensorRT-LLM-specific config + engine-config: ./engine.yaml + kv-cache-transfer: ucx +``` + +Here's a template structure based on the examples: + +```yaml + YourWorker: + dynamoNamespace: your-namespace + componentType: worker + replicas: N + envFromSecret: your-secrets # e.g., hf-token-secret + # Health checks for worker initialization + readinessProbe: + exec: + command: ["/bin/sh", "-c", 'grep "Worker.*initialized" /tmp/worker.log'] + resources: + requests: + gpu: "1" # GPU allocation + extraPodSpec: + mainContainer: + image: your-image + command: + - /bin/sh + - -c + args: + - python -m dynamo.YOUR_INFERENCE_ENGINE --model YOUR_MODEL --your-flags +``` + +Consult the corresponding sh file. Each of the python commands to launch a component will go into your yaml spec under the +`extraPodSpec: -> mainContainer: -> args:` + +The front end is launched with "python3 -m dynamo.frontend [--http-port 8000] [--router-mode kv]" +Each worker will launch the `python -m dynamo.YOUR_INFERENCE_BACKEND --model YOUR_MODEL --your-flags` command. +If you are a Dynamo contributor, see the [dynamo run guide](../../reference/cli.md) for details on how to run this command.
+ + +## Step 3: Key Customization Points + +### Model Configuration + +```yaml + args: + - "python -m dynamo.YOUR_INFERENCE_BACKEND --model YOUR_MODEL --your-flag" +``` + +### Resource Allocation + +```yaml + resources: + requests: + cpu: "N" + memory: "NGi" + gpu: "N" +``` + +### Scaling + +```yaml + replicas: N # Number of worker instances +``` + +### Routing Mode +```yaml + args: + - --router-mode + - kv # Enable KV-cache routing +``` + +### Worker Specialization + +```yaml + args: + - --is-prefill-worker # For disaggregated prefill workers +``` + +### Image Pull Secret Configuration + +#### Automatic Discovery and Injection + +By default, the Dynamo operator automatically discovers and injects image pull secrets based on container registry host matching. The operator scans Docker config secrets within the same namespace and matches their registry hostnames to the container image URLs, automatically injecting the appropriate secrets into the pod's `imagePullSecrets`. + +**Disabling Automatic Discovery:** +To disable this behavior for a component and manually control image pull secrets: + +```yaml + YourWorker: + dynamoNamespace: your-namespace + componentType: worker + annotations: + nvidia.com/disable-image-pull-secret-discovery: "true" +``` + +When disabled, you can manually specify secrets as you would for a normal pod spec via: +```yaml + YourWorker: + dynamoNamespace: your-namespace + componentType: worker + annotations: + nvidia.com/disable-image-pull-secret-discovery: "true" + extraPodSpec: + imagePullSecrets: + - name: my-registry-secret + - name: another-secret + mainContainer: + image: your-image +``` + +This automatic discovery eliminates the need to manually configure image pull secrets for each deployment. + +## Step 6: Deploy LoRA Adapters (Optional) + +After your base model deployment is running, you can deploy LoRA adapters using the `DynamoModel` custom resource. 
This allows you to fine-tune and extend your models without modifying the base deployment. + +To add a LoRA adapter to your deployment, link it using `modelRef` in your worker configuration: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment +spec: + services: + Worker: + modelRef: + name: Qwen/Qwen3-0.6B # Base model identifier + componentType: worker + # ... rest of worker config +``` + +Then create a `DynamoModel` resource for your LoRA: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: my-lora +spec: + modelName: my-custom-lora + baseModelName: Qwen/Qwen3-0.6B # Must match modelRef.name above + modelType: lora + source: + uri: s3://my-bucket/loras/my-lora +``` + +**For complete details on managing models and LoRA adapters, see:** +📖 **[Managing Models with DynamoModel Guide](dynamomodel-guide.md)** diff --git a/fern/pages/kubernetes/deployment/dynamomodel-guide.md b/fern/pages/kubernetes/deployment/dynamomodel-guide.md new file mode 100644 index 00000000000..14061a39ef0 --- /dev/null +++ b/fern/pages/kubernetes/deployment/dynamomodel-guide.md @@ -0,0 +1,629 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Managing Models with DynamoModel" +--- + +## Overview + +`DynamoModel` is a Kubernetes Custom Resource that represents a machine learning model deployed on Dynamo. It enables you to: + +- **Deploy LoRA adapters** on top of running base models +- **Track model endpoints** and their readiness across your cluster +- **Manage model lifecycle** declaratively with Kubernetes + +DynamoModel works alongside `DynamoGraphDeployment` (DGD) or `DynamoComponentDeployment` (DCD) resources. While DGD/DCD deploy the inference infrastructure (pods, services), DynamoModel handles model-specific operations like loading LoRA adapters. 
+ +## Quick Start + +### Prerequisites + +Before creating a DynamoModel, you need: + +1. A running `DynamoGraphDeployment` or `DynamoComponentDeployment` +2. Components configured with `modelRef` pointing to your base model +3. Pods are ready and serving your base model + +For complete setup including DGD configuration, see [Integration with DynamoGraphDeployment](#integration-with-dynamographdeployment). + +### Deploy a LoRA Adapter + +**1. Create your DynamoModel:** + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: my-lora + namespace: dynamo-system +spec: + modelName: my-custom-lora + baseModelName: Qwen/Qwen3-0.6B # Must match modelRef.name in your DGD + modelType: lora + source: + uri: s3://my-bucket/loras/my-lora +``` + +**2. Apply and verify:** + +```bash +# Apply the DynamoModel +kubectl apply -f my-lora.yaml + +# Check status +kubectl get dynamomodel my-lora +``` + +**Expected output:** +``` +NAME TOTAL READY AGE +my-lora 2 2 30s +``` + +That's it! The operator automatically discovers endpoints and loads the LoRA. + +For detailed status monitoring, see [Monitoring & Operations](#monitoring--operations). + +## Understanding DynamoModel + +### Model Types + +DynamoModel supports three model types: + +| Type | Description | Use Case | +|------|-------------|----------| +| **`base`** | Reference to an existing base model | Tracking endpoints for a base model (default) | +| **`lora`** | LoRA adapter that extends a base model | Deploy fine-tuned adapters on existing models | +| **`adapter`** | Generic model adapter | Future extensibility for other adapter types | + +Most users will use **`lora`** to deploy fine-tuned models on top of their base model deployments. + +### How It Works + +When you create a DynamoModel, the operator: + +1. **Discovers endpoints**: Finds all pods running your `baseModelName` (by matching `modelRef.name` in DGD/DCD) +2. 
**Creates service**: Automatically creates a Kubernetes Service to track these pods +3. **Loads LoRA**: Calls the LoRA load API on each endpoint (for `lora` type) +4. **Updates status**: Reports which endpoints are ready + +**Key linkage:** +```yaml +# DGD modelRef.name ↔ DynamoModel baseModelName must match +Worker: + modelRef: + name: Qwen/Qwen3-0.6B +--- +spec: + baseModelName: Qwen/Qwen3-0.6B +``` + +## Configuration Overview + +DynamoModel requires just a few key fields to deploy a model or adapter: + +| Field | Required | Purpose | Example | +|-------|----------|---------|---------| +| `modelName` | Yes | Model identifier | `my-custom-lora` | +| `baseModelName` | Yes | Links to DGD modelRef | `Qwen/Qwen3-0.6B` | +| `modelType` | No | Type: base/lora/adapter | `lora` (default: `base`) | +| `source.uri` | For LoRA | Model location | `s3://bucket/path` or `hf://org/model` | + +**Example minimal LoRA configuration:** +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: my-lora +spec: + modelName: my-custom-lora + baseModelName: Qwen/Qwen3-0.6B + modelType: lora + source: + uri: s3://my-bucket/my-lora +``` + +**For complete field specifications, validation rules, and all options, see:** +📖 [DynamoModel API Reference](../api-reference.md#dynamomodel) + +### Status Summary + +The status shows discovered endpoints and their readiness: + +```bash +kubectl get dynamomodel my-lora +``` + +**Key status fields:** +- `totalEndpoints` / `readyEndpoints`: Counts of discovered vs ready endpoints +- `endpoints[]`: List with addresses, pod names, and ready status +- `conditions`: Standard Kubernetes conditions (EndpointsReady, ServicesFound) + +For detailed status usage, see the [Monitoring & Operations](#monitoring--operations) section below + +## Common Use Cases + +### Use Case 1: S3-Hosted LoRA Adapter + +Deploy a LoRA adapter stored in an S3 bucket. 
+ +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: customer-support-lora + namespace: production +spec: + modelName: customer-support-adapter-v1 + baseModelName: meta-llama/Llama-3.3-70B-Instruct + modelType: lora + source: + uri: s3://my-models-bucket/loras/customer-support/v1 +``` + +**Prerequisites:** +- S3 bucket accessible from your pods (IAM role or credentials) +- Base model `meta-llama/Llama-3.3-70B-Instruct` running via DGD/DCD + +**Verification:** +```bash +# Check LoRA is loaded +kubectl get dynamomodel customer-support-lora -o jsonpath='{.status.readyEndpoints}' +# Should output: 2 (or your number of replicas) + +# View which pods are serving +kubectl get dynamomodel customer-support-lora -o jsonpath='{.status.endpoints[*].podName}' +``` + +### Use Case 2: HuggingFace-Hosted LoRA + +Deploy a LoRA adapter from HuggingFace Hub. + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: multilingual-lora + namespace: dynamo-system +spec: + modelName: multilingual-adapter + baseModelName: Qwen/Qwen3-0.6B + modelType: lora + source: + uri: hf://myorg/qwen-multilingual-lora@v1.0.0 # Optional: @revision +``` + +**Prerequisites:** +- HuggingFace Hub accessible from your pods +- If private repo: HF token configured as secret and mounted in pods +- Base model `Qwen/Qwen3-0.6B` running via DGD/DCD + +**With HuggingFace token:** +```yaml +# In your DGD/DCD +spec: + services: + worker: + envFromSecret: hf-token-secret # Provides HF_TOKEN env var + modelRef: + name: Qwen/Qwen3-0.6B + # ... rest of config +``` + +### Use Case 3: Multiple LoRAs on Same Base Model + +Deploy multiple LoRA adapters on the same base model deployment. 
+ +```yaml +--- +# LoRA for customer support +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: support-lora +spec: + modelName: support-adapter + baseModelName: Qwen/Qwen3-0.6B + modelType: lora + source: + uri: s3://models/support-lora + +--- +# LoRA for code generation +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: code-lora +spec: + modelName: code-adapter + baseModelName: Qwen/Qwen3-0.6B # Same base model + modelType: lora + source: + uri: s3://models/code-lora +``` + +Both LoRAs will be loaded on all pods serving `Qwen/Qwen3-0.6B`. Your application can then route requests to the appropriate adapter. + +## Monitoring & Operations + +### Checking Status + +**Quick status check:** +```bash +kubectl get dynamomodel +``` + +**Example output:** +``` +NAME TOTAL READY AGE +my-lora 2 2 5m +customer-lora 4 3 2h +``` + +**Detailed status:** +```bash +kubectl describe dynamomodel my-lora +``` + +**Example output:** +``` +Name: my-lora +Namespace: dynamo-system +Spec: + Model Name: my-custom-lora + Base Model Name: Qwen/Qwen3-0.6B + Model Type: lora + Source: + Uri: s3://my-bucket/my-lora +Status: + Ready Endpoints: 2 + Total Endpoints: 2 + Endpoints: + Address: http://10.0.1.5:9090 + Pod Name: worker-0 + Ready: true + Address: http://10.0.1.6:9090 + Pod Name: worker-1 + Ready: true + Conditions: + Type: EndpointsReady + Status: True + Reason: EndpointsDiscovered +Events: + Type Reason Message + ---- ------ ------- + Normal EndpointsReady Discovered 2 ready endpoints for base model Qwen/Qwen3-0.6B +``` + +### Understanding Readiness + +An endpoint is **ready** when: +1. The pod is running and healthy +2. 
The LoRA load API call succeeded + +**Condition states:** +- `EndpointsReady=True`: All endpoints are ready (full availability) +- `EndpointsReady=False, Reason=NotReady`: Not all endpoints ready (check message for counts) +- `EndpointsReady=False, Reason=NoEndpoints`: No endpoints found + +When `readyEndpoints < totalEndpoints`, the operator automatically retries loading every 30 seconds. + +### Viewing Endpoints + +**Get endpoint addresses:** +```bash +kubectl get dynamomodel my-lora -o jsonpath='{.status.endpoints[*].address}' | tr ' ' '\n' +``` + +**Output:** +``` +http://10.0.1.5:9090 +http://10.0.1.6:9090 +``` + +**Get endpoint pod names:** +```bash +kubectl get dynamomodel my-lora -o jsonpath='{.status.endpoints[*].podName}' | tr ' ' '\n' +``` + +**Check readiness of each endpoint:** +```bash +kubectl get dynamomodel my-lora -o json | jq '.status.endpoints[] | {podName, ready}' +``` + +**Output:** +```json +{ + "podName": "worker-0", + "ready": true +} +{ + "podName": "worker-1", + "ready": true +} +``` + +### Updating a Model + +To update a LoRA (e.g., deploy a new version): + +```bash +# Edit the source URI +kubectl edit dynamomodel my-lora + +# Or apply an updated YAML +kubectl apply -f my-lora-v2.yaml +``` + +The operator will detect the change and reload the LoRA on all endpoints. + +### Deleting a Model + +```bash +kubectl delete dynamomodel my-lora +``` + +For LoRA models, the operator will: +1. Unload the LoRA from all endpoints +2. Clean up associated resources +3. Remove the DynamoModel CR + +The base model deployment (DGD/DCD) continues running normally. + +## Troubleshooting + +### No Endpoints Found + +**Symptom:** +```yaml +status: + totalEndpoints: 0 + readyEndpoints: 0 + conditions: + - type: EndpointsReady + status: "False" + reason: NoEndpoints + message: "No endpoint slices found for base model Qwen/Qwen3-0.6B" +``` + +**Common Causes:** + +1. 
**Base model deployment not running** + ```bash + # Check if pods exist + kubectl get pods -l nvidia.com/dynamo-component-type=worker + ``` + **Solution:** Deploy your DGD/DCD first, wait for pods to be ready. + +2. **`baseModelName` mismatch** + ```bash + # Check modelRef in your DGD + kubectl get dynamographdeployment my-deployment -o yaml | grep -A2 modelRef + ``` + **Solution:** Ensure `baseModelName` in DynamoModel exactly matches `modelRef.name` in DGD. + +3. **Pods not ready** + ```bash + # Check pod status + kubectl get pods -l nvidia.com/dynamo-component-type=worker + ``` + **Solution:** Wait for pods to reach `Running` and `Ready` state. + +4. **Wrong namespace** + **Solution:** Ensure DynamoModel is in the same namespace as your DGD/DCD. + +### LoRA Load Failures + +**Symptom:** +```yaml +status: + totalEndpoints: 2 + readyEndpoints: 0 # ← No endpoints ready despite pods existing + conditions: + - type: EndpointsReady + status: "False" + reason: NoReadyEndpoints +``` + +**Common Causes:** + +1. **Source URI not accessible** + ```bash + # Check operator logs + kubectl logs -n dynamo-system deployment/dynamo-operator-controller-manager -f | grep "Failed to load" + ``` + **Solution:** + - For S3: Verify bucket permissions, IAM role, credentials + - For HuggingFace: Verify token is valid, repo exists and is accessible + +2. **Invalid LoRA format** + **Solution:** Ensure your LoRA weights are in the format expected by your backend framework (vLLM, SGLang, etc.) + +3. **Endpoint API errors** + ```bash + # Check operator logs for HTTP errors + kubectl logs -n dynamo-system deployment/dynamo-operator-controller-manager | grep "error" + ``` + **Solution:** Check the backend framework's logs in the worker pods: + ```bash + kubectl logs worker-0 + ``` + +4. **Out of memory** + **Solution:** LoRA adapters require additional memory. 
Increase memory limits in your DGD: + ```yaml + resources: + limits: + memory: "32Gi" # Increase if needed + ``` + +### Status Shows Not Ready + +**Symptom:** +Some endpoints remain not ready for extended periods. + +**Diagnosis:** +```bash +# Check which endpoints are not ready +kubectl get dynamomodel my-lora -o json | jq '.status.endpoints[] | select(.ready == false)' + +# View operator logs for that specific pod +kubectl logs -n dynamo-system deployment/dynamo-operator-controller-manager | grep "worker-0" + +# Check the worker pod logs +kubectl logs worker-0 | tail -50 +``` + +**Common Causes:** + +1. **Network issues**: Pod can't reach S3/HuggingFace +2. **Resource constraints**: Pod is OOMing or being throttled +3. **API endpoint not responding**: Backend framework isn't serving the LoRA API + +**When to wait vs investigate:** +- **Wait**: If readyEndpoints is increasing over time (LoRAs loading progressively) +- **Investigate**: If stuck at same readyEndpoints for >5 minutes + +### Viewing Events and Logs + +**Check events:** +```bash +kubectl describe dynamomodel my-lora | tail -20 +``` + +**View operator logs:** +```bash +# Follow logs +kubectl logs -n dynamo-system deployment/dynamo-operator-controller-manager -f + +# Filter for specific model +kubectl logs -n dynamo-system deployment/dynamo-operator-controller-manager | grep "my-lora" +``` + +**Common events and messages:** + +| Event/Message | Meaning | Action | +|---------------|---------|--------| +| `EndpointsReady` | All endpoints are ready | ✅ Good - full service availability | +| `NotReady` | Not all endpoints ready | ⚠️ Check readyEndpoints count - operator will retry | +| `PartialEndpointFailure` | Some endpoints failed to load | Check logs for errors | +| `NoEndpointsFound` | No pods discovered | Verify DGD running and modelRef matches | +| `EndpointDiscoveryFailed` | Can't query endpoints | Check operator RBAC permissions | +| `Successfully reconciled` | Reconciliation complete | ✅ Good | + 
+## Integration with DynamoGraphDeployment + +This section shows the complete end-to-end workflow for deploying base models and LoRA adapters together. + +DynamoModel and DynamoGraphDeployment work together to provide complete model deployment: + +- **DGD**: Deploys the infrastructure (pods, services, resources) +- **DynamoModel**: Manages model-specific operations (LoRA loading) + +### Linking Models to Components + +The connection is established through the `modelRef` field in your DGD: + +**Complete example:** + +```yaml +--- +# 1. Deploy the base model infrastructure +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment +spec: + backendFramework: vllm + services: + Frontend: + componentType: frontend + replicas: 1 + dynamoNamespace: my-app + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + + Worker: + # This modelRef creates the link to DynamoModel + modelRef: + name: Qwen/Qwen3-0.6B # ← Key linking field + + componentType: worker + replicas: 2 + resources: + limits: + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + args: + - --model + - Qwen/Qwen3-0.6B + - --tensor-parallel-size + - "1" + +--- +# 2. Deploy LoRA adapters on top +apiVersion: nvidia.com/v1alpha1 +kind: DynamoModel +metadata: + name: my-lora +spec: + modelName: my-custom-lora + baseModelName: Qwen/Qwen3-0.6B # ← Must match modelRef.name above + modelType: lora + source: + uri: s3://my-bucket/loras/my-lora +``` + +### Deployment Workflow + +**Recommended order:** + +```bash +# 1. Deploy base model infrastructure +kubectl apply -f my-deployment.yaml + +# 2. Wait for pods to be ready +kubectl wait --for=condition=ready pod -l nvidia.com/dynamo-component-type=worker --timeout=5m + +# 3. Deploy LoRA adapters +kubectl apply -f my-lora.yaml + +# 4. 
Verify LoRA is loaded +kubectl get dynamomodel my-lora +``` + +**What happens behind the scenes:** + +| Step | DGD | DynamoModel | +|------|-----|-------------| +| 1 | Creates pods with modelRef | - | +| 2 | Pods become running and ready | - | +| 3 | - | CR created, discovers endpoints via auto-created Service | +| 4 | - | Calls LoRA load API on each endpoint | +| 5 | - | All endpoints ready ✓ | + +The operator automatically handles all service discovery - you don't configure services, labels, or selectors manually. + +## API Reference + +For complete field specifications, validation rules, and detailed type definitions, see: + +**📖 [Dynamo CRD API Reference](../api-reference.md#dynamomodel)** + +## Summary + +DynamoModel provides declarative model management for Dynamo deployments: + +✅ **Simple**: 2-step deployment of LoRA adapters +✅ **Automatic**: Endpoint discovery and loading handled by operator +✅ **Observable**: Rich status reporting and conditions +✅ **Integrated**: Works seamlessly with DynamoGraphDeployment + +**Next Steps:** +- Try the [Quick Start](#quick-start) example +- Explore [Common Use Cases](#common-use-cases) +- Check the [API Reference](../api-reference.md#dynamomodel) for advanced configuration + diff --git a/fern/pages/kubernetes/deployment/minikube-setup.md b/fern/pages/kubernetes/deployment/minikube-setup.md new file mode 100644 index 00000000000..e67eee80254 --- /dev/null +++ b/fern/pages/kubernetes/deployment/minikube-setup.md @@ -0,0 +1,50 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Minikube Setup Guide" +--- + +Don't have a Kubernetes cluster? No problem! You can set up a local development environment using Minikube. This guide walks through the set up of everything you need to run Dynamo Kubernetes Platform locally. + +## 1. Install Minikube +First things first! Start by installing Minikube. 
Follow the official [Minikube installation guide](https://minikube.sigs.k8s.io/docs/start/) for your operating system. + +## 2. Configure GPU Support (Optional) +Planning to use GPU-accelerated workloads? You'll need to configure GPU support in Minikube. Follow the [Minikube GPU guide](https://minikube.sigs.k8s.io/docs/tutorials/nvidia/) to set up NVIDIA GPU support before proceeding. + + +Make sure to configure GPU support before starting Minikube if you plan to use GPU workloads! + + + +## 3. Start Minikube +Time to launch your local cluster! + +```bash +# Start Minikube with GPU support (if configured) +minikube start --driver docker --container-runtime docker --gpus all --memory=16000mb --cpus=8 + +# Enable required addons +minikube addons enable istio-provisioner +minikube addons enable istio +minikube addons enable storage-provisioner-rancher +``` + +## 4. Verify Installation +Let's make sure everything is working correctly! + +```bash +# Check Minikube status +minikube status + +# Verify Istio installation +kubectl get pods -n istio-system + +# Verify storage class +kubectl get storageclass +``` + +## Next Steps + +Once your local environment is set up, you can proceed with the [Dynamo Kubernetes Platform installation guide](../installation-guide.md) to deploy the platform to your local cluster. + diff --git a/fern/pages/kubernetes/deployment/multinode-deployment.md b/fern/pages/kubernetes/deployment/multinode-deployment.md new file mode 100644 index 00000000000..a87533cf319 --- /dev/null +++ b/fern/pages/kubernetes/deployment/multinode-deployment.md @@ -0,0 +1,312 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Multinode Deployment Guide" +--- + +This guide explains how to deploy Dynamo workloads across multiple nodes. 
Multinode deployments enable you to scale compute-intensive LLM workloads across multiple physical machines, maximizing GPU utilization and supporting larger models. + +## Overview + +Dynamo supports multinode deployments through the `multinode` section in resource specifications. This allows you to: + +- Distribute workloads across multiple physical nodes +- Scale GPU resources beyond a single machine +- Support large models requiring extensive tensor parallelism +- Achieve high availability and fault tolerance + +## Basic requirements + +- **Kubernetes Cluster**: Version 1.24 or later +- **GPU Nodes**: Multiple nodes with NVIDIA GPUs +- **High-Speed Networking**: InfiniBand, RoCE, or high-bandwidth Ethernet (recommended for optimal performance) + + +### Advanced Multinode Orchestration + +#### Using Grove (default) + +For sophisticated multinode deployments, Dynamo integrates with advanced Kubernetes orchestration systems: + +- **[Grove](https://github.com/NVIDIA/grove)**: Network topology-aware gang scheduling and auto-scaling for AI workloads +- **[KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler)**: Kubernetes native scheduler optimized for AI workloads at scale + +These systems provide enhanced scheduling capabilities including topology-aware placement, gang scheduling, and coordinated auto-scaling across multiple nodes. + +**Features Enabled with Grove:** +- Declarative composition of AI workloads +- Multi-level horizontal auto-scaling +- Custom startup ordering for components +- Resource-aware rolling updates + + +[KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler) is a Kubernetes native scheduler optimized for AI workloads at large scale. 
+ +**Features Enabled with KAI-Scheduler:** +- Gang scheduling +- Network topology-aware pod placement +- AI workload-optimized scheduling algorithms +- GPU resource awareness and allocation +- Support for complex scheduling constraints +- Integration with Grove for enhanced capabilities +- Performance optimizations for large-scale deployments + + +##### Prerequisites + +- [Grove](https://github.com/NVIDIA/grove/blob/main/docs/installation.md) installed on the cluster +- (Optional) [KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler) installed on the cluster with the default queue name `dynamo` created. If no queue annotation is specified on the DGD resource, the operator uses the `dynamo` queue by default. Custom queue names can be specified via the `nvidia.com/kai-scheduler-queue` annotation, but the queue must exist in the cluster before deployment. + +KAI-Scheduler is optional but recommended for advanced scheduling capabilities. + +#### Using LWS and Volcano + +LWS is a simple multinode deployment mechanism that allows you to deploy a workload across multiple nodes. + +- **LWS**: [LWS Installation](https://github.com/kubernetes-sigs/lws#installation) +- **Volcano**: [Volcano Installation](https://volcano.sh/en/docs/installation/) + +Volcano is a Kubernetes native scheduler optimized for AI workloads at scale. It is used in conjunction with LWS to provide gang scheduling support. 
+ + +## Core Concepts + +### Orchestrator Selection Algorithm + +Dynamo automatically selects the best available orchestrator for multinode deployments using the following logic: + +#### When Both Grove and LWS are Available: +- **Grove is selected by default** (recommended for advanced AI workloads) +- **LWS is selected** if you explicitly set `nvidia.com/enable-grove: "false"` annotation on your DGD resource + +#### When Only One Orchestrator is Available: +- The installed orchestrator (Grove or LWS) is automatically selected + +#### Scheduler Integration: +- **With Grove**: Automatically integrates with [KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler) when available, providing: + - Advanced queue management via `nvidia.com/kai-scheduler-queue` annotation + - AI-optimized scheduling policies + - Resource-aware workload placement +- **With LWS**: Uses Volcano scheduler for gang scheduling and resource coordination + +#### Configuration Examples: + +**Default (Grove with KAI-Scheduler):** +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-multinode-deployment + annotations: + nvidia.com/kai-scheduler-queue: "dynamo" +spec: + # ... your deployment spec +``` + +> **Note:** The `nvidia.com/kai-scheduler-queue` annotation defaults to `"dynamo"`. If you specify a custom queue name, ensure the queue exists in your cluster before deploying. You can verify available queues with `kubectl get queues`. + +**Force LWS usage:** +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-multinode-deployment + annotations: + nvidia.com/enable-grove: "false" +spec: + # ... your deployment spec +``` + + +### The `multinode` Section + +The `multinode` section in a resource specification defines how many physical nodes the workload should span: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-multinode-deployment +spec: + # ... 
your deployment spec + services: + my-service: + ... + multinode: + nodeCount: 2 + resources: + limits: + gpu: "2" # 2 GPUs per node +``` + +### GPU Distribution + +The relationship between `multinode.nodeCount` and `gpu` is multiplicative: + +- **`multinode.nodeCount`**: Number of physical nodes +- **`gpu`**: Number of GPUs per node +- **Total GPUs**: `multinode.nodeCount × gpu` + +**Example:** +- `multinode.nodeCount: "2"` + `gpu: "4"` = 8 total GPUs (4 GPUs per node across 2 nodes) +- `multinode.nodeCount: "4"` + `gpu: "8"` = 32 total GPUs (8 GPUs per node across 4 nodes) + +### Tensor Parallelism Alignment + +The tensor parallelism (`tp-size` or `--tp`) in your command/args must match the total number of GPUs: + +```yaml +# Example: 2 multinode.nodeCount × 4 GPUs = 8 total GPUs +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-multinode-deployment +spec: + # ... your deployment spec + services: + my-service: + ... + multinode: + nodeCount: 2 + resources: + limits: + gpu: "4" + extraPodSpec: + mainContainer: + ... + args: + # Command args must use tp-size=8 + - "--tp-size" + - "8" # Must equal multinode.nodeCount × gpu + +``` + + +## Backend-Specific Operator Behavior + +When you deploy a multinode workload, the Dynamo operator automatically applies backend-specific configurations to enable distributed execution. Understanding these automatic modifications helps troubleshoot issues and optimize your deployments. + +### vLLM Backend + +For vLLM multinode deployments, the operator automatically selects and configures the appropriate distributed execution mode based on your parallelism settings: + +#### Deployment Modes + +The operator automatically determines the deployment mode based on your parallelism configuration: + +**1. 
Tensor/Pipeline Parallelism Mode (Single model across nodes)** +- **When used**: When `world_size > GPUs_per_node` where `world_size = tensor_parallel_size × pipeline_parallel_size` +- **Use case**: Distributing a single model instance across multiple nodes using tensor or pipeline parallelism + +The operator uses Ray for multi-node tensor/pipeline parallel deployments. Ray provides automatic placement group management and worker spawning across nodes. + +**Leader Node:** +- **Command**: `ray start --head --port=6379`, followed by the vLLM command with `--distributed-executor-backend ray` +- **Behavior**: Starts Ray head node, then runs vLLM which creates a placement group spanning all Ray workers +- **Probes**: All health probes remain active (liveness, readiness, startup) + +**Worker Nodes:** +- **Command**: `ray start --address=<leader-address>:6379 --block` +- **Behavior**: Joins Ray cluster and blocks; vLLM on leader spawns Ray actors to these workers +- **Probes**: All probes (liveness, readiness, startup) are automatically removed + +> **Note**: vLLM's Ray executor automatically creates a placement group and spawns workers across the cluster. The `--nnodes` flag is NOT used with Ray - it's only compatible with the `mp` backend. + +**2.
**Data Parallel Mode (Multiple model instances across nodes)** +- **When used**: When `world_size × data_parallel_size > GPUs_per_node` +- **Use case**: Running multiple independent model instances across nodes with data parallelism (e.g., MoE models with expert parallelism) + +**All Nodes (Leader and Workers):** +- **Injected Flags**: + - `--data-parallel-address <leader-address>` - Address of the coordination server + - `--data-parallel-size-local <N>` - Number of data parallel workers per node + - `--data-parallel-rpc-port 13445` - RPC port for data parallel coordination + - `--data-parallel-start-rank <rank>` - Starting rank for this node (calculated automatically) +- **Probes**: Worker probes are removed; leader probes remain active + +**Note**: The operator intelligently injects these flags into your command regardless of command structure (direct Python commands or shell wrappers) + +#### Why Ray for Multi-Node TP/PP? + +vLLM supports two distributed executor backends: `ray` and `mp`. For multi-node deployments: + +- **Ray executor**: vLLM creates a placement group and spawns Ray actors across the cluster. Workers don't run vLLM directly - the leader's vLLM process manages everything. +- **mp executor**: Each node must run its own vLLM process with `--nnodes`, `--node-rank`, `--master-addr`, `--master-port`. This approach is more complex to orchestrate. + +The Dynamo operator uses Ray because: +1. It aligns with vLLM's official multi-node documentation (see `multi-node-serving.sh`) +2. Simpler orchestration - only the leader runs vLLM, workers just need Ray agents +3.
vLLM automatically handles placement group creation and worker management + +#### Compilation Cache Support +When a volume mount is configured with `useAsCompilationCache: true`, the operator automatically sets: +- **`VLLM_CACHE_ROOT`**: Environment variable pointing to the cache mount point + +### SGLang Backend + +For SGLang multinode deployments, the operator injects distributed training parameters: + +#### Leader Node +- **Distributed Flags**: Injects `--dist-init-addr <leader-address>:29500 --nnodes <N> --node-rank 0` +- **Probes**: All health probes remain active + +#### Worker Nodes +- **Distributed Flags**: Injects `--dist-init-addr <leader-address>:29500 --nnodes <N> --node-rank <rank>` + - The `node-rank` is automatically determined from the pod's stateful identity +- **Probes**: All probes (liveness, readiness, startup) are automatically removed + +**Note:** The operator intelligently injects these flags regardless of your command structure (direct Python commands or shell wrappers). + +### TensorRT-LLM Backend + +For TensorRT-LLM multinode deployments, the operator configures MPI-based communication: + +#### Leader Node +- **SSH Configuration**: Automatically sets up SSH keys and configuration from a Kubernetes secret +- **MPI Command**: Wraps your command in an `mpirun` command with: + - Proper host list including all worker nodes + - SSH configuration for passwordless authentication on port 2222 + - Environment variable propagation to all nodes + - Activation of the Dynamo virtual environment +- **Probes**: All health probes remain active + +#### Worker Nodes +- **SSH Daemon**: Replaces your command with SSH daemon setup and execution + - Generates host keys in user-writable directories (non-privileged) + - Configures SSH daemon to listen on port 2222 + - Sets up authorized keys for leader access +- **Probes**: + - **Liveness and Startup**: Removed (workers run SSH daemon, not the main application) + - **Readiness**: Replaced with TCP socket check on SSH port 2222 + - Initial Delay: 20 seconds +
- Period: 20 seconds + - Timeout: 5 seconds + - Failure Threshold: 10 + +#### Additional Configuration +- **Environment Variable**: `OMPI_MCA_orte_keep_fqdn_hostnames=1` is added to all nodes +- **SSH Volume**: Automatically mounts the SSH keypair secret (typically named `mpirun-ssh-key-<deployment-name>`) + +**Important:** TensorRT-LLM requires an SSH keypair secret to be created before deployment. The secret name follows the pattern `mpirun-ssh-key-<deployment-name>`. + +### Compilation Cache Configuration + +The operator supports compilation cache volumes for backend-specific optimization: + +| Backend | Support Level | Environment Variables | Default Mount Point | +|---------|--------------|----------------------|---------------------| +| vLLM | Fully Supported | `VLLM_CACHE_ROOT` | User-specified | +| SGLang | Partial Support | _None (pending upstream)_ | User-specified | +| TensorRT-LLM | Partial Support | _None (pending upstream)_ | User-specified | + +To enable compilation cache, add a volume mount with `useAsCompilationCache: true` in your component specification. For vLLM, the operator will automatically configure the necessary environment variables. For other backends, volume mounts are created, but additional environment configuration may be required until upstream support is added. + +## Next Steps + +For additional support and examples, see the working multinode configurations in: + +- **SGLang**: [examples/backends/sglang/deploy/](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/deploy/) +- **TensorRT-LLM**: [examples/backends/trtllm/deploy/](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/deploy/) +- **vLLM**: [examples/backends/vllm/deploy/](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/) + +These examples demonstrate proper usage of the `multinode` section with corresponding `gpu` limits and correct `tp-size` configuration.
diff --git a/fern/pages/kubernetes/dynamo-operator.md b/fern/pages/kubernetes/dynamo-operator.md new file mode 100644 index 00000000000..2c134740182 --- /dev/null +++ b/fern/pages/kubernetes/dynamo-operator.md @@ -0,0 +1,198 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Working with Dynamo Kubernetes Operator" +--- + +## Overview + +Dynamo operator is a Kubernetes operator that simplifies the deployment, configuration, and lifecycle management of DynamoGraphs. It automates the reconciliation of custom resources to ensure your desired state is always achieved. This operator is ideal for users who want to manage complex deployments using declarative YAML definitions and Kubernetes-native tooling. + +## Architecture + +- **Operator Deployment:** + Deployed as a Kubernetes `Deployment` in a specific namespace. + +- **Controllers:** + - `DynamoGraphDeploymentController`: Watches `DynamoGraphDeployment` CRs and orchestrates graph deployments. + - `DynamoComponentDeploymentController`: Watches `DynamoComponentDeployment` CRs and handles individual component deployments. + - `DynamoModelController`: Watches `DynamoModel` CRs and manages model lifecycle (e.g., loading LoRA adapters). + +- **Workflow:** + 1. A custom resource is created by the user or API server. + 2. The corresponding controller detects the change and runs reconciliation. + 3. Kubernetes resources (Deployments, Services, etc.) are created or updated to match the CR spec. + 4. Status fields are updated to reflect the current state. + +## Deployment Modes + +The Dynamo operator supports three deployment modes to accommodate different cluster environments and use cases: + +### 1. Cluster-Wide Mode (Default) + +The operator monitors and manages DynamoGraph resources across **all namespaces** in the cluster. 
+ +**When to Use:** +- You have full cluster admin access +- You want centralized management of all Dynamo workloads +- Standard production deployment on a dedicated cluster + +--- + +### 2. Namespace-Scoped Mode + +The operator monitors and manages DynamoGraph resources **only in a specific namespace**. A lease marker is created to signal the operator's presence to any cluster-wide operators. + +**When to Use:** +- You're on a shared/multi-tenant cluster +- You only have namespace-level permissions +- You want to test a new operator version in isolation +- You need to avoid conflicts with other operators + +**Installation:** +```bash +helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \ + --namespace my-namespace \ + --create-namespace \ + --set dynamo-operator.namespaceRestriction.enabled=true +``` + +--- + +### 3. Hybrid Mode + +A **cluster-wide operator** manages most namespaces, while **one or more namespace-scoped operators** run in specific namespaces (e.g., for testing new versions). The cluster-wide operator automatically detects and excludes namespaces with namespace-scoped operators using lease markers. + +**When to Use:** +- Running production workloads with a stable operator version +- Testing new operator versions in isolated namespaces without affecting production +- Gradual rollout of operator updates +- Development/staging environments on production clusters + +**How It Works:** +1. Namespace-scoped operator creates a lease named `dynamo-operator-namespace-scope` in its namespace +2. Cluster-wide operator watches for these lease markers across all namespaces +3. Cluster-wide operator automatically excludes any namespace with a lease marker +4. If namespace-scoped operator stops, its lease expires (TTL: 30s by default) +5. Cluster-wide operator automatically resumes managing that namespace + +**Setup Example:** + +```bash +# 1. 
Install cluster-wide operator (production, v1.0.0) +helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \ + --namespace dynamo-system \ + --create-namespace + +# 2. Install namespace-scoped operator (testing, v2.0.0-beta) +helm install dynamo-test dynamo-platform-${RELEASE_VERSION}.tgz \ + --namespace test-namespace \ + --create-namespace \ + --set dynamo-operator.namespaceRestriction.enabled=true \ + --set dynamo-operator.controllerManager.manager.image.tag=v2.0.0-beta + +**Observability:** + +```bash +# List all namespaces with local operators +kubectl get lease -A --field-selector metadata.name=dynamo-operator-namespace-scope + +# Check which operator version is running in a namespace +kubectl get lease -n my-namespace dynamo-operator-namespace-scope \ + -o jsonpath='{.spec.holderIdentity}' +``` + + +## Custom Resource Definitions (CRDs) + +Dynamo provides the following Custom Resources: + +- **DynamoGraphDeployment (DGD)**: Deploys complete inference pipelines +- **DynamoComponentDeployment (DCD)**: Deploys individual components +- **DynamoModel**: Manages model lifecycle (e.g., loading LoRA adapters) + +For the complete technical API reference for Dynamo Custom Resource Definitions, see: + +**📖 [Dynamo CRD API Reference](api-reference.md)** + +For a user-focused guide on deploying and managing models with DynamoModel, see: + +**📖 [Managing Models with DynamoModel Guide](deployment/dynamomodel-guide.md)** + +## Webhooks + +The Dynamo Operator uses **Kubernetes admission webhooks** for real-time validation of custom resources before they are persisted to the cluster. Webhooks are **enabled by default** and ensure that invalid configurations are rejected immediately at the API server level. 
+ +**Key Features:** +- ✅ Shared certificate infrastructure across all webhook types +- ✅ Automatic certificate generation (for testing/development) +- ✅ cert-manager integration (for production) +- ✅ Multi-operator support with lease-based coordination +- ✅ Immutability enforcement for critical fields + +For complete documentation on webhooks, certificate management, and troubleshooting, see: + +**📖 [Webhooks Guide](webhooks.md)** + +## Installation + +### Quick Install with Helm + +```bash +# Set environment +export NAMESPACE=dynamo-system +export RELEASE_VERSION=0.x.x # any version of Dynamo 0.3.2+ listed at https://github.com/ai-dynamo/dynamo/releases + +# Install Platform (includes operator) +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz +helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace +``` + +> **Note:** For shared/multi-tenant clusters or testing scenarios, see [Deployment Modes](#deployment-modes) above for namespace-scoped and hybrid configurations. + +### Building from Source + +```bash +# Set environment +export NAMESPACE=dynamo-system +export DOCKER_SERVER=your-registry.com/ # your container registry +export IMAGE_TAG=latest + +# Build operator image +cd deploy/operator +docker build -t $DOCKER_SERVER/dynamo-operator:$IMAGE_TAG . 
+docker push $DOCKER_SERVER/dynamo-operator:$IMAGE_TAG +cd - + +# Install CRDs +cd deploy/helm/charts +helm install dynamo-crds ./crds/ --namespace default + +# Install platform with custom operator image +helm install dynamo-platform ./platform/ \ + --namespace ${NAMESPACE} \ + --create-namespace \ + --set "dynamo-operator.controllerManager.manager.image.repository=${DOCKER_SERVER}/dynamo-operator" \ + --set "dynamo-operator.controllerManager.manager.image.tag=${IMAGE_TAG}" +``` + +For detailed installation options, see the [Installation Guide](installation-guide.md) + + +## Development + +- **Code Structure:** + +The operator is built using Kubebuilder and the operator-sdk, with the following structure: + +- `controllers/`: Reconciliation logic +- `api/v1alpha1/`: CRD types +- `config/`: Manifests and Helm charts + + +## References + +- [Kubernetes Operator Pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) +- [Custom Resource Definitions](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/) +- [Operator SDK](https://sdk.operatorframework.io/) +- [Helm Best Practices for CRDs](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/) diff --git a/fern/pages/kubernetes/fluxcd.md b/fern/pages/kubernetes/fluxcd.md new file mode 100644 index 00000000000..d2e0900a59b --- /dev/null +++ b/fern/pages/kubernetes/fluxcd.md @@ -0,0 +1,80 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "GitOps Deployment with FluxCD" +--- + +This section describes how to use FluxCD for GitOps-based deployment of Dynamo inference graphs. GitOps enables you to manage your Dynamo deployments declaratively using Git as the source of truth. We'll use the [aggregated vLLM example](../backends/vllm/README.md) to demonstrate the workflow. 
+ +## Prerequisites + +- A Kubernetes cluster with [Dynamo Kubernetes Platform](installation-guide.md) installed +- [FluxCD](https://fluxcd.io/flux/installation/) installed in your cluster +- A Git repository to store your deployment configurations + +## Workflow Overview + +The GitOps workflow for Dynamo deployments consists of three main steps: + +1. Build and push the Dynamo Operator +2. Create and commit a DynamoGraphDeployment custom resource for initial deployment +3. Update the graph by building a new version and updating the CR for subsequent updates + +## Step 1: Build and Push Dynamo Operator + +First, follow to [See Install Dynamo Kubernetes Platform](installation-guide.md). + +## Step 2: Create Initial Deployment + +Create a new file in your Git repository (e.g., `deployments/llm-agg.yaml`) with the following content: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: llm-agg +spec: + pvcs: + - name: vllm-model-storage + size: 100Gi + services: + Frontend: + replicas: 1 + envs: + - name: SPECIFIC_ENV_VAR + value: some_specific_value + Processor: + replicas: 1 + envs: + - name: SPECIFIC_ENV_VAR + value: some_specific_value + VllmWorker: + replicas: 1 + envs: + - name: SPECIFIC_ENV_VAR + value: some_specific_value + # Add PVC for model storage + volumeMounts: + - name: vllm-model-storage + mountPoint: /models +``` + +Commit and push this file to your Git repository. FluxCD will detect the new CR and create the initial Dynamo deployment in your cluster. + +## Step 3: Update Existing Deployment + +To update your pipeline, just update the associated DynamoGraphDeployment CRD + +The Dynamo operator will automatically reconcile it. 
+ +## Monitoring the Deployment + +You can monitor the deployment status using: + +```bash + +export NAMESPACE= + +# Check the DynamoGraphDeployment status +kubectl get dynamographdeployment llm-agg -n $NAMESPACE +``` \ No newline at end of file diff --git a/fern/pages/kubernetes/grove.md b/fern/pages/kubernetes/grove.md new file mode 100644 index 00000000000..5a307f9b8c1 --- /dev/null +++ b/fern/pages/kubernetes/grove.md @@ -0,0 +1,104 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Grove Deployment Guide" +--- + +Grove is a Kubernetes API specifically designed to address the orchestration challenges of modern AI workloads, particularly disaggregated inference systems. Grove provides seamless integration with NVIDIA Dynamo for comprehensive AI infrastructure management. + +## Overview + +Grove was originally motivated by the challenges of orchestrating multinode, disaggregated inference systems. It provides a consistent and unified API that allows users to define, configure, and scale prefill, decode, and any other components like routing within a single custom resource. + +### How Grove Works for Disaggregated Serving + +Grove enables disaggregated serving by breaking down large language model inference into separate, specialized components that can be independently scaled and managed. 
This architecture provides several advantages: + +- **Component Specialization**: Separate prefill, decode, and routing components optimized for their specific tasks +- **Independent Scaling**: Each component can scale based on its individual resource requirements and workload patterns +- **Resource Optimization**: Better utilization of hardware resources through specialized workload placement +- **Fault Isolation**: Issues in one component don't necessarily affect others + +## Core Components and API Resources + +Grove implements disaggregated serving through several custom Kubernetes resources that provide declarative composition of role-based pod groups: + +### PodCliqueSet +The top-level Grove object that defines a group of components managed and colocated together. Key features include: +- Support for autoscaling +- Topology-aware spread of replicas for availability +- Unified management of multiple disaggregated components + +### PodClique +Represents a group of pods with a specific role (e.g., leader, worker, frontend). Each clique features: +- Independent configuration options +- Custom scaling logic support +- Role-specific resource allocation + +### PodCliqueScalingGroup +A set of PodCliques that scale and are scheduled together, ideal for tightly coupled roles like prefill leader and worker components that need coordinated scaling behavior. + +## Key Capabilities for Disaggregated Serving + +Grove provides several specialized features that make it particularly well-suited for disaggregated serving: + +### Flexible Gang Scheduling +PodCliques and PodCliqueScalingGroups allow users to specify flexible gang-scheduling requirements at multiple levels within a PodCliqueSet to prevent resource deadlocks and ensure all components of a disaggregated system start together. 
+ +### Multi-level Horizontal Auto-Scaling +Supports pluggable horizontal auto-scaling solutions to scale PodCliqueSet, PodClique, and PodCliqueScalingGroup custom resources independently based on their specific metrics and requirements. + +### Network Topology-Aware Scheduling +Allows specifying network topology pack and spread constraints to optimize for both network performance and service availability, crucial for disaggregated systems where components need efficient inter-node communication. + +### Custom Startup Dependencies +Prescribes the order in which PodCliques must start in a declarative specification, with pod startup decoupled from pod creation or scheduling. This ensures proper initialization order for disaggregated components. + +## Use Cases and Examples + +Grove specifically supports: + +- **Multi-node disaggregated inference** for large models such as DeepSeek-R1 and Llama-4-Maverick +- **Single-node disaggregated inference** for optimized resource utilization +- **Agentic pipelines of models** for complex AI workflows +- **Standard aggregated serving** patterns for single node or single GPU inference + +## Integration with NVIDIA Dynamo + +Grove is strategically aligned with NVIDIA Dynamo for seamless integration within the AI infrastructure stack: + +### Complementary Roles +- **Grove**: Handles the Kubernetes orchestration layer for disaggregated AI workloads +- **Dynamo**: Provides comprehensive AI infrastructure capabilities including serving backends, routing, and resource management + +### Release Coordination +Grove is aligning its release schedule with NVIDIA Dynamo to ensure seamless integration, with the finalized release cadence reflected in the project roadmap. 
+ +### Unified AI Platform +The integration creates a comprehensive platform where: +- Grove manages complex orchestration of disaggregated components +- Dynamo provides the serving infrastructure, routing capabilities, and backend integrations +- Together they enable sophisticated AI serving architectures with simplified management + +## Architecture Benefits + +Grove represents a significant advancement in Kubernetes-based orchestration for AI workloads by: + +1. **Simplifying Complex Deployments**: Provides a unified API that can manage multiple components (prefill, decode, routing) within a single resource definition +2. **Enabling Sophisticated Architectures**: Supports advanced disaggregated inference patterns that were previously difficult to orchestrate +3. **Reducing Operational Complexity**: Abstracts away the complexity of coordinating multiple interdependent AI components +4. **Optimizing Resource Utilization**: Enables fine-grained control over component placement and scaling + +## Getting Started + +Grove relies on KAI Scheduler for resource allocation and scheduling. + +For KAI Scheduler, see the [KAI Scheduler Deployment Guide](https://github.com/NVIDIA/KAI-Scheduler). + +For installation instructions, see the [Grove Installation Guide](https://github.com/NVIDIA/grove/blob/main/docs/installation.md). + +For practical examples of Grove-based multinode deployments in action, see the [Multinode Deployment Guide](deployment/multinode-deployment.md), which demonstrates multi-node disaggregated serving scenarios. + +For the latest updates on Grove, refer to the [official project on GitHub](https://github.com/NVIDIA/grove). + +Dynamo Kubernetes Platform also allows you to install Grove and KAI Scheduler as part of the platform installation. See the [Dynamo Kubernetes Platform Deployment Installation Guide](installation-guide.md) for more details. 
\ No newline at end of file diff --git a/fern/pages/kubernetes/installation-guide.md b/fern/pages/kubernetes/installation-guide.md new file mode 100644 index 00000000000..4f434ac76d5 --- /dev/null +++ b/fern/pages/kubernetes/installation-guide.md @@ -0,0 +1,371 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Installation Guide for Dynamo Kubernetes Platform" +--- + +Deploy and manage Dynamo inference graphs on Kubernetes with automated orchestration and scaling, using the Dynamo Kubernetes Platform. + +## Before You Start + +Determine your cluster environment: + +**Shared/Multi-Tenant Cluster** (K8s cluster with existing Dynamo artifacts): +- CRDs already installed cluster-wide - skip CRD installation step +- A cluster-wide Dynamo operator is likely already running +- **Do NOT install another operator** - use the existing cluster-wide operator +- Only install a namespace-restricted operator if you specifically need to prevent the cluster-wide operator from managing your namespace (e.g., testing operator features you're developing) + +**Dedicated Cluster** (full cluster admin access): +- You install CRDs yourself +- Can use cluster-wide operator (default) + +**Local Development** (Minikube, testing): +- See [Minikube Setup](deployment/minikube-setup.md) first, then follow installation steps below + +To check if CRDs already exist: +```bash +kubectl get crd | grep dynamo +# If you see dynamographdeployments, dynamocomponentdeployments, etc., CRDs are already installed +``` + +To check if a cluster-wide operator already exists: +```bash +# Check for cluster-wide operator and show its namespace +kubectl get clusterrolebinding -o json | \ + jq -r '.items[] | select(.metadata.name | contains("dynamo-operator-manager")) | + "Cluster-wide operator found in namespace: \(.subjects[0].namespace)"' + +# If a cluster-wide operator exists: Do NOT install another 
operator +# Only install namespace-restricted mode if you specifically need namespace isolation +``` + +## Installation Paths + +Platform is installed using Dynamo Kubernetes Platform [helm chart](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/platform/README.md). + +**Path A: Pre-built Artifacts** +- Use case: Production deployment, shared or dedicated clusters +- Source: NGC published Helm charts +- Time: ~10 minutes +- Jump to: [Path A](#path-a-production-install) + +**Path B: Custom Build from Source** +- Use case: Contributing to Dynamo, using latest features from main branch, customization +- Requirements: Docker build environment +- Time: ~30 minutes +- Jump to: [Path B](#path-b-custom-build-from-source) + +All helm install commands could be overridden by either setting the values.yaml file or by passing in your own values.yaml: + +```bash +helm install ... + -f your-values.yaml +``` + +and/or setting values as flags to the helm install command, as follows: + +```bash +helm install ... 
+ --set "your-value=your-value" +``` + +## Prerequisites + +Before installing the Dynamo Kubernetes Platform, ensure you have the following tools and access: + +### Required Tools + +| Tool | Minimum Version | Description | Installation | +|------|-----------------|-------------|--------------| +| **kubectl** | v1.24+ | Kubernetes command-line tool | [Install kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) | +| **Helm** | v3.0+ | Kubernetes package manager | [Install Helm](https://helm.sh/docs/intro/install/) | +| **Docker** | Latest | Container runtime (Path B only) | [Install Docker](https://docs.docker.com/get-docker/) | + +### Cluster and Access Requirements + +- **Kubernetes cluster v1.24+** with admin or namespace-scoped access +- **Cluster type determined** (shared vs dedicated) — see [Before You Start](#before-you-start) +- **CRD status checked** if on a shared cluster +- **NGC credentials** (optional) — required only if pulling NVIDIA images from NGC + +### Verify Installation + +Run the following to confirm your tools are correctly installed: + +```bash +# Verify tools and versions +kubectl version --client # Should show v1.24+ +helm version # Should show v3.0+ +docker version # Required for Path B only + +# Set your release version +export RELEASE_VERSION=0.x.x # any version of Dynamo 0.3.2+ listed at https://github.com/ai-dynamo/dynamo/releases +``` + +### Pre-Deployment Checks + +Before proceeding, run the pre-deployment check script to verify your cluster meets all requirements: + +```bash +./deploy/pre-deployment/pre-deployment-check.sh +``` + +This script validates kubectl connectivity, default StorageClass configuration, and GPU node availability. See [Pre-Deployment Checks](https://github.com/ai-dynamo/dynamo/tree/main/deploy/pre-deployment/README.md) for details. + +> **No cluster?** See [Minikube Setup](deployment/minikube-setup.md) for local development. 
+ +**Estimated installation time:** 5-30 minutes depending on path + +## Path A: Production Install + +Install from [NGC published artifacts](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). + +```bash +# 1. Set environment +export NAMESPACE=dynamo-system +export RELEASE_VERSION=0.x.x # any version of Dynamo 0.3.2+ listed at https://github.com/ai-dynamo/dynamo/releases + +# 2. Install CRDs (skip if on shared cluster where CRDs already exist) +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz +helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default + +# 3. Install Platform +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz +helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace +``` + +**For Shared/Multi-Tenant Clusters:** + +If your cluster has namespace-restricted Dynamo operators, you MUST add namespace restriction to your installation: + +```bash +# Add this flag to the helm install command above +--set dynamo-operator.namespaceRestriction.enabled=true +``` + +Note: Use the full path `dynamo-operator.namespaceRestriction.enabled=true` (not just `namespaceRestriction.enabled=true`). + +If you see this validation error, you need namespace restriction: +``` +VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. +Found existing namespace-restricted Dynamo operators in namespaces: ... +``` + + +For multinode deployments, you need to install multinode orchestration components: +**Option 1 (Recommended): Grove + KAI Scheduler** +- Grove and KAI Scheduler can be installed manually or through the dynamo-platform helm install command. +- When using the dynamo-platform helm install command, Grove and KAI Scheduler are NOT installed by default. 
You can enable their installation by setting the following flags: +```bash +--set "grove.enabled=true" +--set "kai-scheduler.enabled=true" +``` +**Option 2: LeaderWorkerSet (LWS) + Volcano** +- If using LWS for multinode deployments, you must also install Volcano (required dependency): +- [LWS Installation](https://github.com/kubernetes-sigs/lws#installation) +- [Volcano Installation](https://volcano.sh/en/docs/installation/) (required for gang scheduling with LWS) +- These must be installed manually before deploying multinode workloads with LWS. +See the [Multinode Deployment Guide](deployment/multinode-deployment.md) for details on orchestrator selection. + + + +By default, Model Express Server is not used. +If you wish to use an existing Model Express Server, you can set the modelExpressURL to the existing server's URL in the helm install command: + + +```bash +--set "dynamo-operator.modelExpressURL=http://model-express-server.model-express.svc.cluster.local:8080" +``` + + +By default, Dynamo Operator is installed cluster-wide and will monitor all namespaces. +If you wish to restrict the operator to monitor only a specific namespace (the helm release namespace by default), you can set the namespaceRestriction.enabled to true. +You can also change the restricted namespace by setting the targetNamespace property. + + +```bash +--set "dynamo-operator.namespaceRestriction.enabled=true" +--set "dynamo-operator.namespaceRestriction.targetNamespace=dynamo-namespace" # optional +``` + +→ [Verify Installation](#verify-installation) + +## Path B: Custom Build from Source + +Build and deploy from source for customization, contributing to Dynamo, or using the latest features from the main branch. + +Note: This gives you access to the latest unreleased features and fixes on the main branch. + +```bash +# 1. 
Set environment +export NAMESPACE=dynamo-system +export DOCKER_SERVER=nvcr.io/nvidia/ai-dynamo/ # or your registry +export DOCKER_USERNAME='$oauthtoken' +export DOCKER_PASSWORD= +export IMAGE_TAG=${RELEASE_VERSION} + +# 2. Build operator +cd deploy/operator + +# 2.1 Alternative 1 : Build and push the operator image for multiple platforms +docker buildx create --name multiplatform --driver docker-container --bootstrap +docker buildx use multiplatform +docker buildx build --platform linux/amd64,linux/arm64 -t $DOCKER_SERVER/dynamo-operator:$IMAGE_TAG --push . + +# 2.2 Alternative 2 : Build and push the operator image for a single platform +docker build -t $DOCKER_SERVER/dynamo-operator:$IMAGE_TAG . && docker push $DOCKER_SERVER/dynamo-operator:$IMAGE_TAG + +cd - + +# 3. Create namespace and secrets to be able to pull the operator image (only needed if you pushed the operator image to a private registry) +kubectl create namespace ${NAMESPACE} +kubectl create secret docker-registry docker-imagepullsecret \ + --docker-server=${DOCKER_SERVER} \ + --docker-username=${DOCKER_USERNAME} \ + --docker-password=${DOCKER_PASSWORD} \ + --namespace=${NAMESPACE} + +cd deploy/helm/charts + +# 4. Install CRDs +helm upgrade --install dynamo-crds ./crds/ --namespace default + +# 5. Install Platform +helm dep build ./platform/ + +# To install cluster-wide instead, set NS_RESTRICT_FLAGS="" (empty) or omit that line entirely. 
+ +NS_RESTRICT_FLAGS="--set dynamo-operator.namespaceRestriction.enabled=true" +helm install dynamo-platform ./platform/ \ + --namespace "${NAMESPACE}" \ + --set "dynamo-operator.controllerManager.manager.image.repository=${DOCKER_SERVER}/dynamo-operator" \ + --set "dynamo-operator.controllerManager.manager.image.tag=${IMAGE_TAG}" \ + --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \ + ${NS_RESTRICT_FLAGS} + +``` + +→ [Verify Installation](#verify-installation) + +## Verify Installation + +```bash +# Check CRDs +kubectl get crd | grep dynamo + +# Check operator and platform pods +kubectl get pods -n ${NAMESPACE} +# Expected: dynamo-operator-* and etcd-* and nats-* pods Running +``` + +## Next Steps + +1. **Deploy Model/Workflow** + ```bash + # Example: Deploy a vLLM workflow with Qwen3-0.6B using aggregated serving + kubectl apply -f examples/backends/vllm/deploy/agg.yaml -n ${NAMESPACE} + + # Port forward and test + kubectl port-forward svc/agg-vllm-frontend 8000:8000 -n ${NAMESPACE} + curl http://localhost:8000/v1/models + ``` + +2. **Explore Backend Guides** + - [vLLM Deployments](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/README.md) + - [SGLang Deployments](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/deploy/README.md) + - [TensorRT-LLM Deployments](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/deploy/README.md) + +3. **Optional:** + - [Set up Prometheus & Grafana](observability/metrics.md) + - [SLA Planner Quickstart Guide](../planner/sla-planner-quickstart.md) (for SLA-aware scheduling and autoscaling) + +## Troubleshooting + +**"VALIDATION ERROR: Cannot install cluster-wide Dynamo operator"** + +``` +VALIDATION ERROR: Cannot install cluster-wide Dynamo operator. +Found existing namespace-restricted Dynamo operators in namespaces: ... +``` + +Cause: Attempting cluster-wide install on a shared cluster with existing namespace-restricted operators. 
+ +Solution: Add namespace restriction to your installation: +```bash +--set dynamo-operator.namespaceRestriction.enabled=true +``` + +Note: Use the full path `dynamo-operator.namespaceRestriction.enabled=true` (not just `namespaceRestriction.enabled=true`). + +**CRDs already exist** + +Cause: Installing CRDs on a cluster where they're already present (common on shared clusters). + +Solution: Skip step 2 (CRD installation), proceed directly to platform installation. + +To check if CRDs exist: +```bash +kubectl get crd | grep dynamo +``` + +**Pods not starting?** +```bash +kubectl describe pod -n ${NAMESPACE} +kubectl logs -n ${NAMESPACE} +``` + +**HuggingFace model access?** +```bash +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + +**Bitnami etcd "unrecognized" image?** + +```bash +ERROR: Original containers have been substituted for unrecognized ones. Deploying this chart with non-standard containers is likely to cause degraded security and performance, broken chart features, and missing environment variables. +``` +This error that you might encounter during helm install is due to bitnami changing their docker repository to a [secure one](https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog). 
+ +just add the following to the helm install command: +```bash +--set "etcd.image.repository=bitnamilegacy/etcd" --set "etcd.global.security.allowInsecureImages=true" +``` + +**Clean uninstall?** + +To uninstall the platform, you can run the following command: +``` +helm uninstall dynamo-platform --namespace ${NAMESPACE} +``` + +To uninstall the CRDs, follow these steps: + +Get all of the dynamo CRDs installed in your cluster: +```bash +kubectl get crd | grep "dynamo.*nvidia.com" +``` + +You should see something like this: +``` +dynamocomponentdeployments.nvidia.com 2025-10-21T14:49:52Z +dynamocomponents.nvidia.com 2025-10-25T05:16:10Z +dynamographdeploymentrequests.nvidia.com 2025-11-24T05:26:04Z +dynamographdeployments.nvidia.com 2025-09-04T20:56:40Z +dynamographdeploymentscalingadapters.nvidia.com 2025-12-09T21:05:59Z +dynamomodels.nvidia.com 2025-11-07T00:19:43Z +``` + +Delete each CRD one by one: +```bash +kubectl delete crd +``` + +## Advanced Options + +- [Helm Chart Configuration](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/charts/platform/README.md) +- [Create custom deployments](deployment/create-deployment.md) +- [Dynamo Operator details](dynamo-operator.md) +- [Model Express Server details](https://github.com/ai-dynamo/modelexpress) diff --git a/fern/pages/kubernetes/model-caching-with-fluid.md b/fern/pages/kubernetes/model-caching-with-fluid.md new file mode 100644 index 00000000000..17748267cf5 --- /dev/null +++ b/fern/pages/kubernetes/model-caching-with-fluid.md @@ -0,0 +1,332 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Model Caching with Fluid: Cloud-Native Data Orchestration and Acceleration" +--- + +Fluid is an open-source, cloud-native data orchestration and acceleration platform for Kubernetes. 
It virtualizes and accelerates data access from various sources (object storage, distributed file systems, cloud storage), making it ideal for AI, machine learning, and big data workloads. + +## Key Features + +- **Data Caching and Acceleration:** Cache remote data close to compute workloads for faster access. +- **Unified Data Access:** Access data from S3, HDFS, NFS, and more through a single interface. +- **Kubernetes Native:** Integrates with Kubernetes using CRDs for data management. +- **Scalability:** Supports large-scale data and compute clusters. + +## Installation + +You can install Fluid on any Kubernetes cluster using Helm. + +**Prerequisites:** +- Kubernetes >= 1.18 +- `kubectl` >= 1.18 +- `Helm` >= 3.5 + +**Quick Install:** +```sh +kubectl create ns fluid-system +helm repo add fluid https://fluid-cloudnative.github.io/charts +helm repo update +helm install fluid fluid/fluid -n fluid-system +``` +For advanced configuration, see the [Fluid Installation Guide](https://fluid-cloudnative.github.io/docs/get-started/installation). + +## Pre-deployment Steps + +1. Install Fluid (see [Installation](#installation)). +2. Create a Dataset and Runtime (see [the following example](#webufs-example)). +3. Mount the resulting PVC in your workload. + + +## Mounting Data Sources + +### WebUFS Example + +WebUFS allows mounting HTTP/HTTPS sources as filesystems. + +```yaml +# Mount a public HTTP directory as a Fluid Dataset +apiVersion: data.fluid.io/v1alpha1 +kind: Dataset +metadata: + name: webufs-model +spec: + mounts: + - mountPoint: https://myhost.org/path_to_my_model # Replace with your HTTP source + name: webufs-model +--- +apiVersion: data.fluid.io/v1alpha1 +kind: AlluxioRuntime +metadata: + name: webufs-model +spec: + replicas: 2 + tieredstore: + levels: + - mediumtype: MEM + path: /dev/shm + quota: 2Gi + high: "0.95" + low: "0.7" +``` +After applying, Fluid creates a PersistentVolumeClaim (PVC) named `webufs-model` containing the files. 
+ +### S3 Example + +Mount an S3 bucket as a Fluid Dataset. + +```yaml +# Mount an S3 bucket as a Fluid Dataset +apiVersion: data.fluid.io/v1alpha1 +kind: Dataset +metadata: + name: s3-model +spec: + mounts: + - mountPoint: s3:// # Replace with your bucket name + options: + alluxio.underfs.s3.endpoint: http://minio:9000 # S3 endpoint (e.g., MinIO) + alluxio.underfs.s3.disable.dns.buckets: "true" + aws.secretKey: "" + aws.accessKeyId: "" +--- +apiVersion: data.fluid.io/v1alpha1 +kind: AlluxioRuntime +metadata: + name: s3-model +spec: + replicas: 1 + tieredstore: + levels: + - mediumtype: MEM + path: /dev/shm + quota: 1Gi + high: "0.95" + low: "0.7" +--- +apiVersion: data.fluid.io/v1alpha1 +kind: DataLoad +metadata: + name: s3-model-loader +spec: + dataset: + name: s3-model + namespace: # Replace with your namespace + loadMetadata: true + target: + - path: "/" + replicas: 1 +``` + +The resulting PVC is named `s3-model`. + +## Using HuggingFace Models with Fluid + +**Limitations:** +- HuggingFace models are not exposed as simple filesystems or buckets. +- No native integration exists between Fluid and the HuggingFace Hub API. + +**Workaround: Download and Upload to S3/MinIO** + +1. Download the model using the HuggingFace CLI or SDK. +2. Upload the model files to a supported storage backend (S3, GCS, NFS). +3. Mount that backend using Fluid. + +**Example Pod to Download and Upload:** +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: download-hf-to-minio +spec: + restartPolicy: Never + containers: + - name: downloader + image: python:3.10-slim + command: ["sh", "-c"] + args: + - | + set -eux + pip install --no-cache-dir huggingface_hub awscli + BUCKET_NAME=hf-models + ENDPOINT_URL=http://minio:9000 + MODEL_NAME=deepseek-ai/DeepSeek-R1-Distill-Llama-8B + LOCAL_DIR=/tmp/model + if ! 
aws --endpoint-url $ENDPOINT_URL s3 ls "s3://$BUCKET_NAME" > /dev/null 2>&1; then + aws --endpoint-url $ENDPOINT_URL s3 mb "s3://$BUCKET_NAME" + fi + huggingface-cli download $MODEL_NAME --local-dir $LOCAL_DIR --local-dir-use-symlinks False + aws --endpoint-url $ENDPOINT_URL s3 cp $LOCAL_DIR s3://$BUCKET_NAME/$MODEL_NAME --recursive + env: + - name: AWS_ACCESS_KEY_ID + value: "" + - name: AWS_SECRET_ACCESS_KEY + value: "" + volumeMounts: + - name: tmp-volume + mountPath: /tmp/model + volumes: + - name: tmp-volume + emptyDir: {} +``` + +You can then use `s3://hf-models/deepseek-ai/DeepSeek-R1-Distill-Llama-8B` as your Dataset mount. + +## Usage with Dynamo + +Mount the Fluid-generated PVC in your DynamoGraphDeployment: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: model-caching +spec: + pvcs: + - name: s3-model + envs: + - name: HF_HOME + value: /model + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common": {"model": "/model", ...}}' + services: + VllmWorker: + volumeMounts: + - name: s3-model + mountPoint: /model + Processor: + volumeMounts: + - name: s3-model + mountPoint: /model +``` + + +## Full example with llama3.3 70B + +### Performance + +When deploying LLaMA 3.3 70B using Fluid as the caching layer, we observed the best performance by configuring a single-node cache that holds 100% of the model files locally. By ensuring that the vllm worker pod is scheduled on the same node as the Fluid cache, we were able to eliminate network I/O bottlenecks, which resulted in the fastest model startup time and the highest inference efficiency during our tests. 
+ +| Cache Configuration | vLLM Pod Placement | Startup Time | +|----------------------------------------------|----------------------------------|-----------------| +| ❌ No Cache (Download from HuggingFace) | N/A | ~9 minutes | +| 🟡 Multi-Node Cache (100% Model Cached) | Not on Cache Node | ~18 minutes | +| 🟡 Multi-Node Cache (100% Model Cached) | On Cache Node | ~10 minutes | +| ✅ Single-Node Cache (100% Model Cached) | On Cache Node | ~80 seconds | + + +### Resources + +```yaml +# dataset.yaml +apiVersion: data.fluid.io/v1alpha1 +kind: Dataset +metadata: + name: llama-3-3-70b-instruct-model + namespace: my-namespace +spec: + mounts: + - mountPoint: s3://hf-models/meta-llama/Llama-3.3-70B-Instruct + options: + alluxio.underfs.s3.endpoint: http://minio:9000 + alluxio.underfs.s3.disable.dns.buckets: "true" + aws.secretKey: "minioadmin" + aws.accessKeyId: "minioadmin" + alluxio.underfs.s3.streaming.upload.enabled: "true" + alluxio.underfs.s3.multipart.upload.threads: "20" + alluxio.underfs.s3.socket.timeout: "50s" + alluxio.underfs.s3.request.timeout: "60s" +--- +# runtime.yaml +apiVersion: data.fluid.io/v1alpha1 +kind: AlluxioRuntime +metadata: + name: llama-3-3-70b-instruct-model + namespace: my-namespace +spec: + replicas: 1 + properties: + alluxio.user.file.readtype.default: CACHE_PROMOTE + alluxio.user.file.write.type.default: CACHE_THROUGH + alluxio.user.block.size.bytes.default: 128MB + tieredstore: + levels: + - mediumtype: MEM + path: /dev/shm + quota: 300Gi + high: "1.0" + low: "0.7" +--- +# DataLoad - Preloads the model into cache +apiVersion: data.fluid.io/v1alpha1 +kind: DataLoad +metadata: + name: llama-3-3-70b-instruct-model-loader +spec: + dataset: + name: llama-3-3-70b-instruct-model + namespace: my-namespace + loadMetadata: true + target: + - path: "/" + replicas: 1 +``` + +and the associated DynamoGraphDeployment with pod affinity to schedule the vllm worker on the same node than the Alluxio cache worker + +```yaml +apiVersion: nvidia.com/v1alpha1 
+kind: DynamoGraphDeployment +metadata: + name: my-hello-world +spec: + envs: + - name: DYN_LOG + value: "debug" + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common": {"model": "/model", "block-size": 64, "max-model-len": 16384}, + "Frontend": {"served_model_name": "meta-llama/Llama-3.3-70B-Instruct", "endpoint": + "dynamo.Processor.chat/completions", "port": 8000}, "Processor": {"router": + "round-robin", "router-num-threads": 4, "common-configs": ["model", "block-size", + "max-model-len"]}, "VllmWorker": {"tensor-parallel-size": 4, "enforce-eager": true, "max-num-batched-tokens": + 16384, "enable-prefix-caching": true, "ServiceArgs": {"workers": 1, "resources": + {"gpu": "4", "memory": "40Gi"}}, "common-configs": ["model", "block-size", "max-model-len"]}, + "Planner": {"environment": "kubernetes", "no-operation": true}}' + pvcs: + - name: llama-3-3-70b-instruct-model + services: + Processor: + volumeMounts: + - name: llama-3-3-70b-instruct-model + mountPoint: /model + VllmWorker: + volumeMounts: + - name: llama-3-3-70b-instruct-model + mountPoint: /model + extraPodSpec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: fluid.io/s-alluxio-my-namespace-llama-3-3-70b-instruct-model + operator: In + values: + - "true" +``` + + +## Troubleshooting & FAQ + +- **PVC not created?** Check Fluid and AlluxioRuntime pod logs. +- **Model not found?** Ensure the model was uploaded to the correct bucket/path. +- **Permission errors?** Verify S3/MinIO credentials and bucket policies. 
+ +## Resources + +- [Fluid Documentation](https://fluid-cloudnative.github.io/) +- [Alluxio Documentation](https://docs.alluxio.io/) +- [MinIO Documentation](https://docs.min.io/) +- [Hugging Face Hub](https://huggingface.co/docs/hub/index) +- [Dynamo README](https://github.com/ai-dynamo/dynamo/blob/main/.devcontainer/README.md) +- [Dynamo Documentation](https://docs.nvidia.com/dynamo/latest/index.html) diff --git a/fern/pages/kubernetes/observability/logging.md b/fern/pages/kubernetes/observability/logging.md new file mode 100644 index 00000000000..c6d05aa3681 --- /dev/null +++ b/fern/pages/kubernetes/observability/logging.md @@ -0,0 +1,160 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Log Aggregation in Dynamo on Kubernetes" +--- + +This guide demonstrates how to set up logging for Dynamo in Kubernetes using Grafana Loki and Alloy. This setup provides a simple reference logging setup that can be followed in Kubernetes clusters including Minikube and MicroK8s. + + +This setup is intended for development and testing purposes. For production environments, please refer to the official documentation for high-availability configurations. + + +## Components Overview + +- **[Grafana Loki](https://grafana.com/oss/loki/)**: Fast and cost-effective Kubernetes-native log aggregation system. + +- **[Grafana Alloy](https://grafana.com/oss/alloy/)**: OpenTelemetry collector that replaces Promtail, gathering logs, metrics and traces from Kubernetes pods. + +- **[Grafana](https://grafana.com/grafana/)**: Visualization platform for querying and exploring logs. + +## Prerequisites + +### 1. Dynamo Kubernetes Platform + +This guide assumes you have installed Dynamo Kubernetes Platform. For more information, see [Dynamo Kubernetes Platform](../README.md). + +### 2. 
Kube-prometheus + +While this guide does not use Prometheus, it assumes Grafana is pre-installed with the kube-prometheus stack. For more information, see [kube-prometheus](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack). + +### 3. Environment Variables + +#### Kubernetes Setup Variables + +The following env variables are set: +- `MONITORING_NAMESPACE`: The namespace where Loki is installed +- `DYN_NAMESPACE`: The namespace where Dynamo Kubernetes Platform is installed + +```bash +export MONITORING_NAMESPACE=monitoring +export DYN_NAMESPACE=dynamo-system +``` + +#### Dynamo Logging Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for Loki) | `true` | +| `DYN_LOG` | Log levels per target (`<level>,<target>=<level>,<target>=<level>`) | `DYN_LOG=info,dynamo_runtime::system_status_server=trace` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps | `true` | + +## Installation Steps + +### 1. Install Loki + +First, we'll install Loki in single binary mode, which is ideal for testing and development: + +```bash +# Add the Grafana Helm repository +helm repo add grafana https://grafana.github.io/helm-charts +helm repo update + +# Install Loki +helm install --values deploy/observability/k8s/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE +``` + +Our configuration (`loki-values.yaml`) sets up Loki in a simple configuration that is suitable for testing and development. It uses a local MinIO for storage. The installation pods can be viewed with: +```bash +kubectl get pods -n $MONITORING_NAMESPACE -l app=loki +``` + +### 2. Install Grafana Alloy + +Next, install the Grafana Alloy collector to gather logs from your Kubernetes cluster and forward them to Loki.
Here we use the Helm chart `k8s-monitoring` provided by Grafana to install the collector: + +```bash +# Generate a custom values file with the namespace information +envsubst < deploy/observability/k8s/logging/values/alloy-values.yaml > alloy-custom-values.yaml + +# Install the collector +helm install --values alloy-custom-values.yaml alloy grafana/k8s-monitoring -n $MONITORING_NAMESPACE +``` + +The values file (`alloy-values.yaml`) includes the following configurations for the collector: +- Destination to forward logs to Loki +- Namespace to collect logs from +- Pod labels to be mapped to Loki labels +- Collection method (kubernetesApi or tailing `/var/log/containers/`) + +```yaml +destinations: +- name: loki + type: loki + url: http://loki-gateway.$MONITORING_NAMESPACE.svc.cluster.local/loki/api/v1/push +podLogs: + enabled: true + gatherMethod: kubernetesApi # collect logs from the kubernetes api, rather than /var/log/containers/; friendly for testing and development + collector: alloy-logs + labels: + app_kubernetes_io_name: app.kubernetes.io/name + nvidia_com_dynamo_component_type: nvidia.com/dynamo-component-type + nvidia_com_dynamo_graph_deployment_name: nvidia.com/dynamo-graph-deployment-name + labelsToKeep: + - "app_kubernetes_io_name" + - "container" + - "instance" + - "job" + - "level" + - "namespace" + - "service_name" + - "service_namespace" + - "deployment_environment" + - "deployment_environment_name" + - "nvidia_com_dynamo_component_type" # extract this label from the dynamo graph deployment + - "nvidia_com_dynamo_graph_deployment_name" # extract this label from the dynamo graph deployment + namespaces: + - $DYN_NAMESPACE +``` + +### 3. Configure Grafana with the Loki datasource and Dynamo Logs dashboard + +We will be viewing the logs associated with our DynamoGraphDeployment in Grafana. To do this, we need to configure Grafana with the Loki datasource and Dynamo Logs dashboard. 
+ +Since we are using Grafana with the Prometheus Operator, we can simply apply the following ConfigMaps to quickly achieve this configuration. + +```bash +# Configure Grafana with the Loki datasource +envsubst < deploy/observability/k8s/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - + +# Configure Grafana with the Dynamo Logs dashboard +envsubst < deploy/observability/k8s/logging/grafana/logging-dashboard.yaml | kubectl apply -n $MONITORING_NAMESPACE -f - +``` + + +If using Grafana installed without the Prometheus Operator, you can manually import the Loki datasource and Dynamo Logs dashboard using the Grafana UI. + + +### 4. Deploy a DynamoGraphDeployment with JSONL Logging + +At this point, we should have everything in place to collect and view logs in our Grafana instance. All that is left is to deploy a DynamoGraphDeployment to collect logs from. + +To enable structured logs in a DynamoGraphDeployment, we need to set the `DYN_LOGGING_JSONL` environment variable to `1`. This is done for us in the `agg_logging.yaml` setup for the Sglang backend. We can now deploy the DynamoGraphDeployment with: + +```bash +kubectl apply -n $DYN_NAMESPACE -f examples/backends/sglang/deploy/agg_logging.yaml +``` + +Send a few chat completions requests to generate structured logs across the frontend and worker pods across the DynamoGraphDeployment. We are now all set to view the logs in Grafana. + +## Viewing Logs in Grafana + +Port-forward the Grafana service to access the UI: + +```bash +kubectl port-forward svc/prometheus-grafana 3000:80 -n $MONITORING_NAMESPACE +``` + +If everything is working, under Home > Dashboards > Dynamo Logs, you should see a dashboard that can be used to view the logs associated with our DynamoGraphDeployments + +The dashboard enables filtering by DynamoGraphDeployment, namespace, and component type (e.g., frontend, worker, etc.). 
diff --git a/fern/pages/kubernetes/observability/metrics.md b/fern/pages/kubernetes/observability/metrics.md new file mode 100644 index 00000000000..30c8144bd5a --- /dev/null +++ b/fern/pages/kubernetes/observability/metrics.md @@ -0,0 +1,180 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Metrics Collection on Kubernetes" +--- + +## Overview + +This guide provides a walkthrough for collecting and visualizing metrics from Dynamo components using the kube-prometheus-stack. The kube-prometheus-stack provides a powerful and flexible way to configure monitoring for Kubernetes applications through custom resources like PodMonitors, making it easy to automatically discover and scrape metrics from Dynamo components. + +## Prerequisites + +### Install kube-prometheus-stack +If you don't have an existing Prometheus setup, you'll likely want to install the kube-prometheus-stack. This is a collection of Kubernetes manifests that includes the Prometheus Operator, Prometheus, Grafana, and other monitoring components in a pre-configured setup. 
The stack introduces custom resources that make it easy to deploy and manage monitoring in Kubernetes: + +- `PodMonitor`: Automatically discovers and scrapes metrics from pods based on label selectors +- `ServiceMonitor`: Similar to PodMonitor but works with Services +- `PrometheusRule`: Defines alerting and recording rules + +For a basic installation: +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +# Values allow PodMonitors to be picked up that are outside of the kube-prometheus-stack helm release +helm install prometheus -n monitoring --create-namespace prometheus-community/kube-prometheus-stack \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorNamespaceSelector="{}" \ + --set prometheus.prometheusSpec.probeNamespaceSelector="{}" +``` + + +The commands enumerated below assume you have installed the kube-prometheus-stack with the installation method listed above. Depending on your installation configuration of the monitoring stack, you may need to modify the `kubectl` commands that follow in this document accordingly (e.g modifying Namespace or Service names accordingly). + + +### Install Dynamo Operator +Before setting up metrics collection, you'll need to have the Dynamo operator installed in your cluster. Follow our [Installation Guide](../installation-guide.md) for detailed instructions on deploying the Dynamo operator. +Make sure to set the `prometheusEndpoint` to the Prometheus endpoint you installed in the previous step. + +```bash +helm install dynamo-platform ... + --set prometheusEndpoint=http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090 +``` + + +### Node Exporter for CPU/Memory Metrics + +The Dynamo Grafana dashboard includes panels for node-level CPU utilization, system load, and container resource usage. 
These metrics are collected and exported to Prometheus via [node-exporter](https://github.com/prometheus/node_exporter), which exposes hardware and OS metrics from Linux systems. + + +The kube-prometheus-stack installation described above includes node-exporter by default. If you're using a custom Prometheus setup, you'll need to ensure node-exporter is deployed as a DaemonSet on your cluster nodes. + + +To verify node-exporter is running: + +```bash +kubectl get daemonset -A | grep node-exporter +``` + +If node-exporter is not running, you can install it via the kube-prometheus-stack or deploy it separately. For more information, see the [node-exporter documentation](https://github.com/prometheus/node_exporter). + +### DCGM Metrics Collection (Optional) + +GPU utilization metrics are collected and exported to Prometheus via dcgm-exporter. The Dynamo Grafana dashboard includes a panel for GPU utilization related to your Dynamo deployment. For that panel to be populated, you need to ensure that the dcgm-exporter is running in your cluster. To check if the dcgm-exporter is running, please run the following command: + +```bash +kubectl get daemonset -A | grep dcgm-exporter +``` + +If the output is empty, you need to install the dcgm-exporter. For more information, please consult the official [dcgm-exporter documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/latest/dcgm-exporter.html). 
+ + +## Deploy a DynamoGraphDeployment + +Let's start by deploying a simple vLLM aggregated deployment: + +```bash +export NAMESPACE=dynamo-system # namespace where dynamo operator is installed +pushd examples/backends/vllm/deploy +kubectl apply -f agg.yaml -n $NAMESPACE +popd +``` + +This will create two components: +- A Frontend component exposing metrics on its HTTP port +- A Worker component exposing metrics on its system port + +Both components expose a `/metrics` endpoint following the OpenMetrics format, but with different metrics appropriate to their roles. For details about: +- Deployment configuration: See the [vLLM README](../../backends/vllm/README.md) +- Available metrics: See the [metrics guide](../../observability/metrics.md) + +### Validate the Deployment + +Let's send some test requests to populate metrics: + +```bash +curl localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." 
+ } + ], + "stream": true, + "max_tokens": 30 + }' +``` + +For more information about validating the deployment, see the [vLLM README](../../backends/vllm/README.md). + +## Set Up Metrics Collection + +### Create PodMonitors + +The Prometheus Operator uses PodMonitor resources to automatically discover and scrape metrics from pods. To enable this discovery, the Dynamo operator automatically creates a PodMonitor resource and adds these labels to all pods: +- `nvidia.com/metrics-enabled: "true"` - Enables metrics collection +- `nvidia.com/dynamo-component-type: "frontend|worker"` - Identifies the component type + +> **Note**: You can opt out specific deployments from metrics collection by adding this annotation to your DynamoGraphDeployment: +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment + annotations: + nvidia.com/enable-metrics: "false" +spec: + # … +``` + +### Configure Grafana Dashboard + +Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard: +```bash +kubectl apply -n monitoring -f deploy/observability/k8s/grafana-dynamo-dashboard-configmap.yaml +``` + +The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_dashboard: "1"`, Grafana will discover it and add it to its list of available dashboards.
The dashboard includes panels for: +- Frontend request rates +- Time to first token +- Inter-token latency +- Request duration +- Input/Output sequence lengths +- GPU utilization via DCGM +- Node CPU utilization and system load +- Container CPU usage per pod +- Memory usage per pod + +## Viewing the Metrics + +### In Prometheus +```bash +kubectl port-forward svc/prometheus-kube-prometheus-prometheus 9090:9090 -n monitoring +``` + +Visit http://localhost:9090 and try these example queries: +- `dynamo_frontend_requests_total` +- `dynamo_frontend_time_to_first_token_seconds_bucket` + +![Prometheus UI showing Dynamo metrics](../../../assets/img/prometheus-k8s.png) + +### In Grafana +```bash +# Get Grafana credentials +export GRAFANA_USER=$(kubectl get secret -n monitoring prometheus-grafana -o jsonpath="{.data.admin-user}" | base64 --decode) +export GRAFANA_PASSWORD=$(kubectl get secret -n monitoring prometheus-grafana -o jsonpath="{.data.admin-password}" | base64 --decode) +echo "Grafana user: $GRAFANA_USER" +echo "Grafana password: $GRAFANA_PASSWORD" + +# Port forward Grafana service +kubectl port-forward svc/prometheus-grafana 3000:80 -n monitoring +``` + +Visit http://localhost:3000 and log in with the credentials captured above. + +Once logged in, find the Dynamo dashboard under General. + +![Grafana dashboard showing Dynamo metrics](../../../assets/img/grafana-k8s.png) diff --git a/fern/pages/kubernetes/quickstart.md b/fern/pages/kubernetes/quickstart.md new file mode 100644 index 00000000000..20c9620d8c8 --- /dev/null +++ b/fern/pages/kubernetes/quickstart.md @@ -0,0 +1,239 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Deploying Dynamo on Kubernetes" +--- + +High-level guide to Dynamo Kubernetes deployments. Start here, then dive into specific guides. 
+ +## Important Terminology + +**Kubernetes Namespace**: The K8s namespace where your DynamoGraphDeployment resource is created. +- Used for: Resource isolation, RBAC, organizing deployments +- Example: `dynamo-system`, `team-a-namespace` + +**Dynamo Namespace**: The logical namespace used by Dynamo components for [service discovery](service-discovery.md). +- Used for: Runtime component communication, service discovery +- Specified in: `.spec.services.<ServiceName>.dynamoNamespace` field +- Example: `my-llm`, `production-model`, `dynamo-dev` + +These are independent. A single Kubernetes namespace can host multiple Dynamo namespaces, and vice versa. + +## Prerequisites + +Before you begin, ensure you have the following tools installed: + +| Tool | Minimum Version | Installation Guide | +|------|-----------------|-------------------| +| **kubectl** | v1.24+ | [Install kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) | +| **Helm** | v3.0+ | [Install Helm](https://helm.sh/docs/intro/install/) | + +Verify your installation: +```bash +kubectl version --client # Should show v1.24+ +helm version # Should show v3.0+ +``` + +For detailed installation instructions, see the [Prerequisites section](installation-guide.md#prerequisites) in the Installation Guide. + +## Pre-deployment Checks + +Before deploying the platform, run the pre-deployment checks to ensure the cluster is ready: + +```bash +./deploy/pre-deployment/pre-deployment-check.sh +``` + +This validates kubectl connectivity, StorageClass configuration, and GPU availability. See [pre-deployment checks](https://github.com/ai-dynamo/dynamo/tree/main/deploy/pre-deployment/README.md) for more details. + +## 1. Install Platform First + +```bash +# 1. Set environment +export NAMESPACE=dynamo-system +export RELEASE_VERSION=0.x.x # any version of Dynamo 0.3.2+ listed at https://github.com/ai-dynamo/dynamo/releases + +# 2.
Install CRDs (skip if on shared cluster where CRDs already exist) +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz +helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default + +# 3. Install Platform +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz +helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace +``` + +**For Shared/Multi-Tenant Clusters:** + +If your cluster has namespace-restricted Dynamo operators, add this flag to step 3: +```bash +--set dynamo-operator.namespaceRestriction.enabled=true +``` + +For more details or customization options (including multinode deployments), see **[Installation Guide for Dynamo Kubernetes Platform](installation-guide.md)**. + +## 2. Choose Your Backend + +Each backend has deployment examples and configuration options: + +| Backend | Aggregated | Aggregated + Router | Disaggregated | Disaggregated + Router | Disaggregated + Planner | Disaggregated Multi-node | +|--------------|:----------:|:-------------------:|:-------------:|:----------------------:|:-----------------------:|:------------------------:| +| **[SGLang](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/deploy/README.md)** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| **[TensorRT-LLM](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/deploy/README.md)** | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ | +| **[vLLM](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/README.md)** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +## 3. 
Deploy Your First Model + +```bash +export NAMESPACE=dynamo-system +kubectl create namespace ${NAMESPACE} + +# to pull model from HF +export HF_TOKEN=<your-hf-token> +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="$HF_TOKEN" \ + -n ${NAMESPACE}; + +# Deploy any example (this uses vLLM with Qwen model using aggregated serving) +kubectl apply -f examples/backends/vllm/deploy/agg.yaml -n ${NAMESPACE} + +# Check status +kubectl get dynamoGraphDeployment -n ${NAMESPACE} + +# Test it +kubectl port-forward svc/vllm-agg-frontend 8000:8000 -n ${NAMESPACE} +curl http://localhost:8000/v1/models +``` + +For SLA-based autoscaling, see [SLA Planner Quick Start Guide](../planner/sla-planner-quickstart.md). + +## Understanding Dynamo's Custom Resources + +Dynamo provides two main Kubernetes Custom Resources for deploying models: + +### DynamoGraphDeploymentRequest (DGDR) - Simplified SLA-Driven Configuration + +The **recommended approach** for generating optimal configurations. DGDR provides a high-level interface where you specify: +- Model name and backend framework +- SLA targets (latency requirements) +- GPU type (optional) + +Dynamo automatically handles profiling and generates an optimized DGD spec in the status. Perfect for: +- SLA-driven configuration generation +- Automated resource optimization +- Users who want simplicity over control + +**Note**: DGDR generates a DGD spec which you can then use to deploy. + +### DynamoGraphDeployment (DGD) - Direct Configuration + +A lower-level interface that defines your complete inference pipeline: +- Model configuration +- Resource allocation (GPUs, memory) +- Scaling policies +- Frontend/backend connections + +Use this when you need fine-grained control or have already completed profiling. + +Refer to the [API Reference and Documentation](api-reference.md) for more details.
+ +## 📖 API Reference & Documentation + +For detailed technical specifications of Dynamo's Kubernetes resources: + +- **[API Reference](api-reference.md)** - Complete CRD field specifications for all Dynamo resources +- **[Create Deployment](deployment/create-deployment.md)** - Step-by-step deployment creation with DynamoGraphDeployment +- **[Operator Guide](dynamo-operator.md)** - Dynamo operator configuration and management + +### Choosing Your Architecture Pattern + +When creating a deployment, select the architecture pattern that best fits your use case: + +- **Development / Testing** - Use `agg.yaml` as the base configuration +- **Production with Load Balancing** - Use `agg_router.yaml` to enable scalable, load-balanced inference +- **High Performance / Disaggregated** - Use `disagg_router.yaml` for maximum throughput and modular scalability + +### Frontend and Worker Components + +You can run the Frontend on one machine (e.g., a CPU node) and workers on different machines (GPU nodes). 
The Frontend serves as a framework-agnostic HTTP entry point that: + +- Provides OpenAI-compatible `/v1/chat/completions` endpoint +- Auto-discovers backend workers via [service discovery](service-discovery.md) (Kubernetes-native by default) +- Routes requests and handles load balancing +- Validates and preprocesses requests + +### Customizing Your Deployment + +Example structure: +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-llm +spec: + services: + Frontend: + dynamoNamespace: my-llm + componentType: frontend + replicas: 1 + extraPodSpec: + mainContainer: + image: your-image + VllmDecodeWorker: # or SGLangDecodeWorker, TrtllmDecodeWorker + dynamoNamespace: dynamo-dev + componentType: worker + replicas: 1 + envFromSecret: hf-token-secret # for HuggingFace models + resources: + limits: + gpu: "1" + extraPodSpec: + mainContainer: + image: your-image + command: ["/bin/sh", "-c"] + args: + - python3 -m dynamo.vllm --model YOUR_MODEL [--your-flags] +``` + +Worker command examples per backend: +```yaml +# vLLM worker +args: + - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B + +# SGLang worker +args: + - >- + python3 -m dynamo.sglang + --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B + --tp 1 + --trust-remote-code + +# TensorRT-LLM worker +args: + - python3 -m dynamo.trtllm + --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B + --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B + --extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml +``` + +Key customization points include: +- **Model Configuration**: Specify model in the args command +- **Resource Allocation**: Configure GPU requirements under `resources.limits` +- **Scaling**: Set `replicas` for number of worker instances +- **Routing Mode**: Enable KV-cache routing by setting `DYN_ROUTER_MODE=kv` in Frontend envs +- **Worker Specialization**: Add `--is-prefill-worker` flag for disaggregated prefill 
workers + +## Additional Resources + +- **[Examples](../getting-started/examples.md)** - Complete working examples +- **[Create Custom Deployments](deployment/create-deployment.md)** - Build your own CRDs +- **[Managing Models with DynamoModel](deployment/dynamomodel-guide.md)** - Deploy LoRA adapters and manage models +- **[Operator Documentation](dynamo-operator.md)** - How the platform works +- **[Service Discovery](service-discovery.md)** - Discovery backends and configuration +- **[Helm Charts](https://github.com/ai-dynamo/dynamo/tree/main/deploy/helm/README.md)** - For advanced users +- **[GitOps Deployment with FluxCD](fluxcd.md)** - For advanced users +- **[Logging](observability/logging.md)** - For logging setup +- **[Multinode Deployment](deployment/multinode-deployment.md)** - For multinode deployment +- **[Grove](grove.md)** - For grove details and custom installation +- **[Monitoring](observability/metrics.md)** - For monitoring setup +- **[Model Caching with Fluid](model-caching-with-fluid.md)** - For model caching with Fluid diff --git a/fern/pages/kubernetes/service-discovery.md b/fern/pages/kubernetes/service-discovery.md new file mode 100644 index 00000000000..7938ccaa12d --- /dev/null +++ b/fern/pages/kubernetes/service-discovery.md @@ -0,0 +1,101 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Service Discovery" +--- + +Dynamo components (frontends, workers, planner) need to be able to discover each other and their capabilities at runtime. We refer to this as service discovery. There are 2 kinds of service discovery backends supported on Kubernetes. 
+ +## Discovery Backends + +| Backend | Default | Dependencies | Use Case | +|---------|---------|--------------|----------| +| **Kubernetes** | ✅ Yes | None (native K8s) | Recommended for all Kubernetes deployments | +| **KV Store (etcd)** | No | etcd cluster | Legacy deployments | + +## Kubernetes Discovery (Default) + +Kubernetes discovery is the default and recommended backend when running on Kubernetes. It uses native Kubernetes primitives to facilitate discovery of components: + +- **DynamoWorkerMetadata CRD**: Each worker stores its registered endpoints and model cards in a Custom Resource +- **EndpointSlices**: EndpointSlices signal each component's readiness status + +### Implementation Details + +Each pod runs a **discovery daemon** that watches both EndpointSlices and DynamoWorkerMetadata CRs. A pod is only discoverable when it appears as "ready" in an EndpointSlice AND has a corresponding `DynamoWorkerMetadata` CR. This correlation ensures pods aren't discoverable until they're ready, metadata is immediately available, and stale entries are cleaned up when pods terminate. + +#### DynamoWorkerMetadata CRD + +Each worker pod creates a `DynamoWorkerMetadata` CR that stores its discovery metadata: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoWorkerMetadata +metadata: + name: my-worker-pod-abc123 + namespace: dynamo-system + ownerReferences: + - apiVersion: v1 + kind: Pod + name: my-worker-pod-abc123 + uid: + controller: true +spec: + data: + endpoints: + "dynamo/backend/generate": + type: Endpoint + namespace: dynamo + component: backend + endpoint: generate + instance_id: 12345678901234567890 + transport: + nats_tcp: "dynamo_backend.generate-abc123" + model_cards: {} +``` + +The CR is named after the pod and includes an owner reference for automatic garbage collection when the pod is deleted. 
+ +#### EndpointSlices + +While DynamoWorkerMetadata resources provide an up-to-date snapshot of a component's capabilities, EndpointSlices give a snapshot of health of the various Dynamo components. + +The operator creates a Kubernetes Service targeting the Dynamo components. The Kubernetes controller in turn creates and maintains EndpointSlice resources that keep track of the readiness of the pods targeted by the Service. Watching these slices gives us an up-to-date snapshot of which Dynamo components are ready to serve traffic. + +##### Readiness Probes +A pod is marked ready if the readiness probe succeeds. On Dynamo workers, this is when the `generate` endpoint is available and healthy. These probes are configured by the Dynamo operator for each pod/component. + +#### RBAC + +Each Dynamo component pod is automatically given a ServiceAccount that allows it to watch `EndpointSlice` and `DynamoWorkerMetadata` resources within its namespace. + +#### Environment Variables + +The following environment variables are automatically injected into pods by the operator to facilitate service discovery: + +| Variable | Description | +|----------|-------------| +| `DYN_DISCOVERY_BACKEND` | Set to `kubernetes` | +| `POD_NAME` | Pod name (via downward API) | +| `POD_NAMESPACE` | Pod namespace (via downward API) | +| `POD_UID` | Pod UID (via downward API) | + +The pod's instance ID is deterministically generated by hashing the pod name, ensuring consistent identity and correlation between EndpointSlices and CRs. + +## KV Store Discovery (etcd) + +To use etcd-based discovery instead of Kubernetes-native discovery, add the annotation to your DynamoGraphDeployment: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment + annotations: + nvidia.com/dynamo-discovery-backend: etcd +spec: + services: + # ... +``` + +This requires an etcd cluster to be available. The etcd connection is configured via the platform Helm chart. 
diff --git a/fern/pages/kubernetes/webhooks.md b/fern/pages/kubernetes/webhooks.md new file mode 100644 index 00000000000..2742192765f --- /dev/null +++ b/fern/pages/kubernetes/webhooks.md @@ -0,0 +1,702 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Webhooks" +--- + +This document describes the webhook functionality in the Dynamo Operator, including validation webhooks, certificate management, and troubleshooting. + +## Table of Contents + +- [Overview](#overview) +- [Architecture](#architecture) +- [Configuration](#configuration) + - [Enabling/Disabling Webhooks](#enablingdisabling-webhooks) + - [Certificate Management Options](#certificate-management-options) + - [Advanced Configuration](#advanced-configuration) +- [Certificate Management](#certificate-management) + - [Automatic Certificates (Default)](#automatic-certificates-default) + - [cert-manager Integration](#cert-manager-integration) + - [External Certificates](#external-certificates) +- [Multi-Operator Deployments](#multi-operator-deployments) +- [Troubleshooting](#troubleshooting) + +--- + +## Overview + +The Dynamo Operator uses **Kubernetes admission webhooks** to provide real-time validation and mutation of custom resources. Currently, the operator implements **validation webhooks** that ensure invalid configurations are rejected immediately at the API server level, providing faster feedback to users compared to controller-based validation. + +All webhook types (validating, mutating, conversion, etc.) share the same **webhook server** and **TLS certificate infrastructure**, making certificate management consistent across all webhook operations. 
+ +### Key Features + +- ✅ **Enabled by default** - Zero-touch validation out of the box +- ✅ **Shared certificate infrastructure** - All webhook types use the same TLS certificates +- ✅ **Automatic certificate generation** - No manual certificate management required +- ✅ **Defense in depth** - Controllers validate when webhooks are disabled +- ✅ **cert-manager integration** - Optional integration for automated certificate lifecycle +- ✅ **Multi-operator support** - Lease-based coordination for cluster-wide and namespace-restricted deployments +- ✅ **Immutability enforcement** - Critical fields protected via CEL validation rules + +### Current Webhook Types + +- **Validating Webhooks**: Validate custom resource specifications before persistence + - `DynamoComponentDeployment` validation + - `DynamoGraphDeployment` validation + - `DynamoModel` validation + +**Note:** Future releases may add mutating webhooks (for defaults/transformations) and conversion webhooks (for CRD version migrations). All will use the same certificate infrastructure described in this document. + +--- + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ API Server │ +│ 1. User submits CR (kubectl apply) │ +│ 2. API server calls ValidatingWebhookConfiguration │ +└────────────────────────┬────────────────────────────────────────┘ + │ HTTPS (TLS required) + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Webhook Server (in Operator Pod) │ +│ 3. Validates CR against business rules │ +│ 4. Returns admit/deny decision + warnings │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ API Server │ +│ 5. If admitted: Persist CR to etcd │ +│ 6. If denied: Return error to user │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Validation Flow + +1. 
**Webhook validation** (if enabled): Validates at API server level +2. **CEL validation**: Kubernetes-native immutability checks (always active) +3. **Controller validation** (if webhooks disabled): Defense-in-depth validation during reconciliation + +--- + +## Configuration + +### Enabling/Disabling Webhooks + +Webhooks are **enabled by default**. To disable them: + +```yaml +# Platform-level values.yaml +dynamo-operator: + webhook: + enabled: false +``` + +**When to disable webhooks:** +- During development/testing when rapid iteration is needed +- In environments where admission webhooks are not supported +- When troubleshooting validation issues + +**Note:** When webhooks are disabled, controllers perform validation during reconciliation (defense in depth). + +--- + +### Certificate Management Options + +The operator supports three certificate management modes: + +| Mode | Description | Use Case | +|------|-------------|----------| +| **Automatic (Default)** | Helm hooks generate self-signed certificates | Testing and development environments | +| **cert-manager** | Integrate with cert-manager for automated lifecycle | Production deployments with cert-manager | +| **External** | Bring your own certificates | Production deployments with custom PKI | + +--- + +### Advanced Configuration + +#### Complete Configuration Reference + +```yaml +dynamo-operator: + webhook: + # Enable/disable validation webhooks + enabled: true + + # Certificate management + certManager: + enabled: false + issuerRef: + kind: Issuer + name: selfsigned-issuer + + # Certificate secret configuration + certificateSecret: + name: webhook-server-cert + external: false + + # Certificate validity period (automatic generation only) + certificateValidity: 3650 # 10 years + + # Certificate generator image (automatic generation only) + certGenerator: + image: + repository: bitnami/kubectl + tag: latest + + # Webhook behavior configuration + failurePolicy: Fail # Fail (reject on error) or Ignore 
(allow on error) + timeoutSeconds: 10 # Webhook timeout + + # Namespace filtering (advanced) + namespaceSelector: {} # Kubernetes label selector for namespaces +``` + +#### Failure Policy + +```yaml +# Fail: Reject resources if webhook is unavailable (recommended for production) +webhook: + failurePolicy: Fail + +# Ignore: Allow resources if webhook is unavailable (use with caution) +webhook: + failurePolicy: Ignore +``` + +**Recommendation:** Use `Fail` in production to ensure validation is always enforced. Only use `Ignore` if you need high availability and can tolerate occasional invalid resources. + +#### Namespace Filtering + +Control which namespaces are validated (applies to **cluster-wide operator** only): + +```yaml +# Only validate resources in namespaces with specific labels +webhook: + namespaceSelector: + matchLabels: + dynamo-validation: enabled + +# Or exclude specific namespaces +webhook: + namespaceSelector: + matchExpressions: + - key: dynamo-validation + operator: NotIn + values: ["disabled"] +``` + +**Note:** For **namespace-restricted operators**, the namespace selector is automatically set to validate only the operator's namespace. This configuration is ignored in namespace-restricted mode. + +--- + +## Certificate Management + +### Automatic Certificates (Default) + +**Zero configuration required!** Certificates are automatically generated during `helm install` and `helm upgrade`. + +#### How It Works + +1. **Pre-install/pre-upgrade hook**: Generates self-signed TLS certificates + - Root CA (valid 10 years) + - Server certificate (valid 10 years) + - Stores in Secret: `-webhook-server-cert` + +2. **Post-install/post-upgrade hook**: Injects CA bundle into `ValidatingWebhookConfiguration` + - Reads `ca.crt` from Secret + - Patches `ValidatingWebhookConfiguration` with base64-encoded CA bundle + +3. 
**Operator pod**: Mounts certificate secret and serves webhook on port 9443 + +#### Certificate Validity + +- **Root CA**: 10 years +- **Server Certificate**: 10 years (same as Root CA) +- **Automatic rotation**: Certificates are re-generated on every `helm upgrade` + +#### Smart Certificate Generation + +The certificate generation hook is intelligent: +- ✅ **Checks existing certificates** before generating new ones +- ✅ **Skips generation** if valid certificates exist (valid for 30+ days with correct SANs) +- ✅ **Regenerates** only when needed (missing, expiring soon, or incorrect SANs) + +This means: +- Fast `helm upgrade` operations (no unnecessary cert generation) +- Safe to run `helm upgrade` frequently +- Certificates persist across reinstalls (stored in Secret) + +#### Manual Certificate Rotation + +If you need to rotate certificates manually: + +```bash +# Delete the certificate secret +kubectl delete secret -webhook-server-cert -n + +# Upgrade the release to regenerate certificates +helm upgrade dynamo-platform -n +``` + +--- + +### cert-manager Integration + +For clusters with cert-manager installed, you can enable automated certificate lifecycle management. + +#### Prerequisites + +1. **cert-manager installed** (v1.0+) +2. **CA issuer configured** (e.g., `selfsigned-issuer`) + +#### Configuration + +```yaml +dynamo-operator: + webhook: + certManager: + enabled: true + issuerRef: + kind: Issuer # Or ClusterIssuer + name: selfsigned-issuer # Your issuer name +``` + +#### How It Works + +1. **Helm creates Certificate resource**: Requests TLS certificate from cert-manager +2. **cert-manager generates certificate**: Based on configured issuer +3. **cert-manager stores in Secret**: `-webhook-server-cert` +4. **cert-manager ca-injector**: Automatically injects CA bundle into `ValidatingWebhookConfiguration` +5. 
**Operator pod**: Mounts certificate secret and serves webhook + +#### Benefits Over Automatic Mode + +- ✅ **Automated rotation**: cert-manager renews certificates before expiration +- ✅ **Custom validity periods**: Configure certificate lifetime +- ✅ **CA rotation support**: ca-injector handles CA updates automatically +- ✅ **Integration with existing PKI**: Use your organization's certificate infrastructure + +#### Certificate Rotation + +With cert-manager, certificate rotation is **fully automated**: + +1. **Leaf certificate rotation** (default: every year) + - cert-manager auto-renews before expiration + - controller-runtime auto-reloads new certificate + - **No pod restart required** + - **No caBundle update required** (same Root CA) + +2. **Root CA rotation** (every 10 years) + - cert-manager rotates Root CA + - ca-injector auto-updates caBundle in `ValidatingWebhookConfiguration` + - **No manual intervention required** + +#### Example: Self-Signed Issuer + +```yaml +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned-issuer + namespace: dynamo-system +spec: + selfSigned: {} +--- +# Enable in platform values.yaml +dynamo-operator: + webhook: + certManager: + enabled: true + issuerRef: + kind: Issuer + name: selfsigned-issuer +``` + +--- + +### External Certificates + +Bring your own certificates for custom PKI requirements. + +#### Steps + +1. **Create certificate secret manually**: + +```bash +kubectl create secret tls -webhook-server-cert \ + --cert=tls.crt \ + --key=tls.key \ + -n + +# Also add ca.crt to the secret +kubectl patch secret -webhook-server-cert -n \ + --type='json' \ + -p='[{"op": "add", "path": "/data/ca.crt", "value": "'$(base64 -w0 < ca.crt)'"}]' +``` + +2. **Configure operator to use external secret**: + +```yaml +dynamo-operator: + webhook: + certificateSecret: + external: true + caBundle: # Must manually specify +``` + +3. **Deploy operator**: + +```bash +helm install dynamo-platform . 
-n -f values.yaml +``` + +#### Certificate Requirements + +- **Secret name**: Must match `webhook.certificateSecret.name` (default: `webhook-server-cert`) +- **Secret keys**: `tls.crt`, `tls.key`, `ca.crt` +- **Certificate SAN**: Must include `..svc` + - Example: `dynamo-platform-dynamo-operator-webhook-service.dynamo-system.svc` + +--- + +## Multi-Operator Deployments + +The operator supports running both **cluster-wide** and **namespace-restricted** instances simultaneously using a **lease-based coordination mechanism**. + +### Scenario + +``` +Cluster: +├─ Operator A (cluster-wide, namespace: platform-system) +│ └─ Validates all namespaces EXCEPT team-a +└─ Operator B (namespace-restricted, namespace: team-a) + └─ Validates only team-a namespace +``` + +### How It Works + +1. **Namespace-restricted operator** creates a Lease in its namespace +2. **Cluster-wide operator** watches for Leases named `dynamo-operator-ns-lock` +3. **Cluster-wide operator** skips validation for namespaces with active Leases +4. **Namespace-restricted operator** validates resources in its namespace + +### Lease Configuration + +The lease mechanism is **automatically configured** based on deployment mode: + +```yaml +# Cluster-wide operator (default) +namespaceRestriction: + enabled: false +# → Watches for leases in all namespaces +# → Skips validation for namespaces with active leases + +# Namespace-restricted operator +namespaceRestriction: + enabled: true + namespace: team-a +# → Creates lease in team-a namespace +# → Does NOT check for leases (no cluster permissions) +``` + +### Deployment Example + +```bash +# 1. Deploy cluster-wide operator +helm install platform-operator dynamo-platform \ + -n platform-system \ + --set namespaceRestriction.enabled=false + +# 2. 
Deploy namespace-restricted operator for team-a +helm install team-a-operator dynamo-platform \ + -n team-a \ + --set namespaceRestriction.enabled=true \ + --set namespaceRestriction.namespace=team-a +``` + +### ValidatingWebhookConfiguration Naming + +The webhook configuration name reflects the deployment mode: + +- **Cluster-wide**: `-validating` +- **Namespace-restricted**: `-validating-` + +Example: + +```bash +# Cluster-wide +platform-operator-validating + +# Namespace-restricted (team-a) +team-a-operator-validating-team-a +``` + +This allows multiple webhook configurations to coexist without conflicts. + +### Lease Health + +If the namespace-restricted operator is deleted or becomes unhealthy: +- Lease expires after `leaseDuration + gracePeriod` (default: ~30 seconds) +- Cluster-wide operator automatically resumes validation for that namespace + +--- + +## Troubleshooting + +### Webhook Not Called + +**Symptoms:** +- Invalid resources are accepted +- No validation errors in logs + +**Checks:** + +1. **Verify webhook is enabled**: +```bash +kubectl get validatingwebhookconfiguration | grep dynamo +``` + +2. **Check webhook configuration**: +```bash +kubectl get validatingwebhookconfiguration -o yaml +# Verify: +# - caBundle is present and non-empty +# - clientConfig.service points to correct service +# - webhooks[].namespaceSelector matches your namespace +``` + +3. **Verify webhook service exists**: +```bash +kubectl get service -n | grep webhook +``` + +4. 
**Check operator logs for webhook startup**: +```bash +kubectl logs -n deployment/-dynamo-operator | grep webhook +# Should see: "Webhooks are enabled - webhooks will validate, controllers will skip validation" +# Should see: "Starting webhook server" +``` + +--- + +### Connection Refused Errors + +**Symptoms:** +``` +Error from server (InternalError): Internal error occurred: failed calling webhook: +Post "https://...webhook-service...:443/validate-...": dial tcp ...:443: connect: connection refused +``` + +**Checks:** + +1. **Verify operator pod is running**: +```bash +kubectl get pods -n -l app.kubernetes.io/name=dynamo-operator +``` + +2. **Check webhook server is listening**: +```bash +# Port-forward to pod +kubectl port-forward -n pod/ 9443:9443 + +# In another terminal, test connection +curl -k https://localhost:9443/validate-nvidia-com-v1alpha1-dynamocomponentdeployment +# Should NOT get "connection refused" +``` + +3. **Verify webhook port in deployment**: +```bash +kubectl get deployment -n -dynamo-operator -o yaml | grep -A5 "containerPort: 9443" +``` + +4. **Check for webhook initialization errors**: +```bash +kubectl logs -n deployment/-dynamo-operator | grep -i error +``` + +--- + +### Certificate Errors + +**Symptoms:** +``` +Error from server (InternalError): Internal error occurred: failed calling webhook: +x509: certificate signed by unknown authority +``` + +**Checks:** + +1. **Verify caBundle is present**: +```bash +kubectl get validatingwebhookconfiguration -o jsonpath='{.webhooks[0].clientConfig.caBundle}' | base64 -d +# Should output a valid PEM certificate +``` + +2. **Verify certificate secret exists**: +```bash +kubectl get secret -n -webhook-server-cert +``` + +3. **Check certificate validity**: +```bash +kubectl get secret -n -webhook-server-cert -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -noout -text +# Check: +# - Not expired +# - SAN includes: ..svc +``` + +4. 
**Check CA injection job logs**: +```bash +kubectl logs -n job/-webhook-ca-inject- +``` + +--- + +### Helm Hook Job Failures + +**Symptoms:** +- `helm install` or `helm upgrade` hangs or fails +- Certificate generation errors + +**Checks:** + +1. **List hook jobs**: +```bash +kubectl get jobs -n | grep webhook +``` + +2. **Check job logs**: +```bash +# Certificate generation +kubectl logs -n job/-webhook-cert-gen- + +# CA injection +kubectl logs -n job/-webhook-ca-inject- +``` + +3. **Check RBAC permissions**: +```bash +# Verify ServiceAccount exists +kubectl get sa -n -webhook-ca-inject + +# Verify ClusterRole and ClusterRoleBinding exist +kubectl get clusterrole -webhook-ca-inject +kubectl get clusterrolebinding -webhook-ca-inject +``` + +4. **Manual cleanup**: +```bash +# Delete failed jobs +kubectl delete job -n -webhook-cert-gen- +kubectl delete job -n -webhook-ca-inject- + +# Retry helm upgrade +helm upgrade dynamo-platform -n +``` + +--- + +### Validation Errors Not Clear + +**Symptoms:** +- Webhook rejects resource but error message is unclear + +**Solution:** + +Check operator logs for detailed validation errors: + +```bash +kubectl logs -n deployment/-dynamo-operator | grep "validate create\|validate update" +``` + +Webhook logs include: +- Resource name and namespace +- Validation errors with context +- Warnings for immutable field changes + +--- + +### Stuck Deleting Resources + +**Symptoms:** +- Resource stuck in "Terminating" state +- Webhook blocks finalizer removal + +**Solution:** + +The webhook automatically skips validation for resources being deleted. If stuck: + +1. **Check if webhook is blocking**: +```bash +kubectl describe -n +# Look for events mentioning webhook errors +``` + +2. 
**Temporarily disable webhook**: +```bash +# Option 1: Delete ValidatingWebhookConfiguration +kubectl delete validatingwebhookconfiguration + +# Option 2: Set failurePolicy to Ignore +kubectl patch validatingwebhookconfiguration \ + --type='json' \ + -p='[{"op": "replace", "path": "/webhooks/0/failurePolicy", "value": "Ignore"}]' +``` + +3. **Delete resource again**: +```bash +kubectl delete -n +``` + +4. **Re-enable webhook**: +```bash +helm upgrade dynamo-platform -n +``` + +--- + +## Best Practices + +### Production Deployments + +1. ✅ **Keep webhooks enabled** (default) for real-time validation +2. ✅ **Use `failurePolicy: Fail`** (default) to ensure validation is enforced +3. ✅ **Monitor webhook latency** - Validation adds ~10-50ms per resource operation +4. ✅ **Use cert-manager** for automated certificate lifecycle in large deployments +5. ✅ **Test webhook configuration** in staging before production + +### Development Deployments + +1. ✅ **Disable webhooks** for rapid iteration if needed +2. ✅ **Use `failurePolicy: Ignore`** if webhook availability is problematic +3. ✅ **Keep automatic certificates** (simpler than cert-manager for dev) + +### Multi-Tenant Deployments + +1. ✅ **Deploy one cluster-wide operator** for platform-wide validation +2. ✅ **Deploy namespace-restricted operators** for tenant-specific namespaces +3. ✅ **Monitor lease health** to ensure coordination works correctly +4. 
✅ **Use unique release names** per namespace to avoid naming conflicts + +--- + +## Additional Resources + +- [Kubernetes Admission Webhooks](https://kubernetes.io/docs/reference/access-authn-authz/extensible-admission-controllers/) +- [cert-manager Documentation](https://cert-manager.io/docs/) +- [Kubebuilder Webhook Tutorial](https://book.kubebuilder.io/cronjob-tutorial/webhook-implementation.html) +- [CEL Validation Rules](https://kubernetes.io/docs/reference/using-api/cel/) + +--- + +## Support + +For issues or questions: +- Check [Troubleshooting](#troubleshooting) section +- Review operator logs: `kubectl logs -n deployment/-dynamo-operator` +- Open an issue on GitHub + diff --git a/fern/pages/kvbm/kvbm-architecture.md b/fern/pages/kvbm/kvbm-architecture.md new file mode 100644 index 00000000000..6f64da65740 --- /dev/null +++ b/fern/pages/kvbm/kvbm-architecture.md @@ -0,0 +1,26 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KVBM Architecture" +--- + +The KVBM serves as a critical infrastructure component for scaling LLM inference workloads efficiently. By cleanly separating runtime logic from memory management, and by enabling distributed block sharing, KVBM lays the foundation for high-throughput, multi-node, and memory-disaggregated AI systems. + +![A block diagram showing a layered architecture view of Dynamo KV Block manager.](../../assets/img/kvbm-architecture.png) +**High level layered architecture view of Dynamo KV Block manager and how it interfaces with different components of LLM inference ecosystem** + +The KVBM has three primary logical layers. The top layer-the LLM inference runtimes (TRTLLM, vLLM and SGLang)-integrates through a dedicated connector module to the Dynamo KVBM module. These connectors act as translation layers, mapping runtime-specific operations and events into the KVBM’s block-oriented memory interface. 
This decouples memory management from the inference runtime, enabling backend portability and providing memory tiering. + +The middle layer, the KVBM layer, encapsulates the core logic of the KV block manager and serves as the runtime substrate for managing block memory. The KVBM adapter layer normalizes the representations and data layout for the incoming requests across runtimes and forwards them to the core memory manager. The KVBM and the core modules implement required internal functionality, such as table lookups, memory allocation, block layout management, lifecycle and state transitions, and block reuse or eviction based on policies. The KVBM layer also has required abstractions for external components to override or augment its behavior. + +The last layer, the NIXL layer, provides unified support for enabling all data and storage transactions. NIXL enables P2P GPU transfers, enables RDMA and NVLINK remote memory sharing, dynamic block registration and metadata exchange, and provides a plugin interface for storage backends. + +NIXL integrates with several backends: + +- Block memory (for example, GPU HBM, Host DRAM, Remote DRAM, Local SSD when exposed as block device) +- Local file system (for example, POSIX) +- Remote file system (for example, NFS) +- Object stores (for example, S3-compatible) +- Cloud storage (for example, blob storage APIs) + +**[NIXL](https://github.com/ai-dynamo/nixl/blob/main/docs/nixl.md)** abstracts away the registration and integration complexity for each backend via a custom optimizable plugin architecture and enables memory blocks to be published, serialized, and accessed remotely, allowing the disaggregation of compute and memory across nodes. Combined with the Dynamo KV Block Manager (KVBM), storage providers no longer need to retrofit or optimize individual LLM inference engines. Instead, they can focus on tuning their own stack, providing optimized endpoints, knowing that integration is smooth, standardized, and efficient.
And for those who *do* want to go further, Dynamo KVBM offers a clean separation of concerns, making custom optimization not only possible, but simple. \ No newline at end of file diff --git a/fern/pages/kvbm/kvbm-components.md b/fern/pages/kvbm/kvbm-components.md new file mode 100644 index 00000000000..30bbc518178 --- /dev/null +++ b/fern/pages/kvbm/kvbm-components.md @@ -0,0 +1,56 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Understanding KVBM components" +--- + +KVBM design takes inspiration from the KV block managers used in vLLM and SGLang, with an added influence from historical memory tiering strategies common in general GPU programming. For more details, [See KVBM Reading](kvbm-reading.md). The figure below illustrates the internal components of KVBM. + +![Internal Components of Dynamo KVBM. ](../../assets/img/kvbm-components.png) +**Internal Components of Dynamo KVBM** + +## KVBM Components +### Core +- **KvBlockManager**: Public facade. Constructs/owns the internal state and exposes the pools and onboarding APIs. +- **Scheduler**: Gates transfer execution relative to model progress (iteration/layer completion) when integrated with a framework connector (e.g., vLLM V1). +- **Config (config.rs)**: Describes model dims, page size, layout choices, and runtime flags used to build pools and layouts. +- **KvBlockManagerState**: Central object wiring together layouts, storage backends, and pools; owns the OffloadManager, metrics, and events hooks. +- **Events/Metrics**: Observability components emitting counters/gauges and event hooks for integration/testing. + +### Layouts and Blocks +- **LayoutConfig & LayoutType**: Translate tensor shapes into KV cache layouts (layer-separated or fully-contiguous), including block counts and geometry. 
+- **Blocks & Metadata**: Typed block handles (mutable/immutable), metadata (e.g., priority), and views by layer/outer dims; used to allocate, register, and match by `sequence_hash`. + +### Transfer Manager +- **TransferManager**: Asynchronous transfer orchestrator with per-path queues (Device→Host, Host→Disk, Host→Device, Disk→Device). + +### Storage & Pools +- **Device Pool(G1)**: GPU-resident KV block pool. Allocates mutable GPU blocks, registers completed blocks (immutable), serves lookups by sequence hash, and is the target for onboarding (Host→Device, Disk→Device). +- **Host Pool(G2)**: CPU pinned-memory KV block pool. Receives Device offloads (Device→Host), can onboard to Device (Host→Device), and offloads to Disk. Uses pinned (page-locked) memory for efficient CUDA transfers and NIXL I/O. +- **Disk Pool(G3)**: Local SSD NVMe-backed KV block pool. Receives Host offloads (Host→Disk) and provides blocks for onboarding to Device (Disk→Device). NIXL descriptors expose file offsets/regions for zero-copy I/O and optional GDS. + +## KVBM DataFlows +![KVBM Data Flows. ](../../assets/img/kvbm-data-flows.png) +**KVBM Data Flows from device to other memory hierarchies** + +**Device → Host (Offload)** +* Triggered explicitly requested to offload by the connector scheduler. +* Worker allocates a Host block and performs CUDA D2H/Custom Kernel copy. +* Host pool registers the new immutable block (dedup by sequence hash). + +**Host → Disk (Offload)** +* Local Disk: NIXL Write via POSIX; GDS when available. +* Remote Disk (Network FS like NFS/Lustre/GPFS): NIXL Write via POSIX to the mounted FS; batching/concurrency identical. +* Triggered on registered host blocks or explicit offload requests. +* Worker allocates a Disk block and performs NIXL Write (Host→Disk). +* Disk pool registers the new immutable block (dedup by sequence hash). + +**Host → Device (Onboard)** +* Called to bring a host block into GPU memory. 
+* Worker uses provided Device targets and performs CUDA H2D/Custom Kernel copy. +* Device pool registers the new immutable block. + +**Disk → Device (Onboard)** +* Called to bring a disk block directly into GPU memory. +* Worker uses provided Device targets and performs NIXL Read (Disk→Device), possibly via GDS. +* Device pool registers the new immutable block. diff --git a/fern/pages/kvbm/kvbm-design-deepdive.md b/fern/pages/kvbm/kvbm-design-deepdive.md new file mode 100644 index 00000000000..d15a2f0fd67 --- /dev/null +++ b/fern/pages/kvbm/kvbm-design-deepdive.md @@ -0,0 +1,247 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KVBM components" +--- + +The design of the KVBM is inspired from vLLM and SGLang KV block managers but with a twist from historical memory tiering design aspired in general GPU programming. [See KVBM Reading](kvbm-reading.md). The following figure shows the internal architecture of KVBM and how it works across workers using NIXL. + +![Internal architecture and key modules in the Dynamo KVBM. ](../../assets/img/kvbm-internal-arch.png) +**Internal architecture and key modules in the Dynamo KVBM** + +## KvBlockManager as Orchestration Layer + +The `KvBlockManager ` acts as a coordinator across memory tiers—host (CPU), device (GPU), and remote—by managing per-backend block pools and exposing consistent block lifecycle APIs. It tracks KV block locations across device memory (G1), CPU memory within and across nodes (G2), local/pooled SSDs (G3), and remote storage (G4). G1-G4 are key tiers enabled by KVBM. Critical to note that KVBM treats G4 storage as an opaque blob store, unaware of internal layout optimizations. 
+ +`KvBlockManager` owns: + +* A device-side `BlockPool` +* A host-side `BlockPool` +* A remote NIXL agent that supports communication and memory sharing across nodes +* A block set registry for remote lookup and import/export of block metadata + +Implementation-wise, `KvBlockManagerState` holds the logic: it's initialized by `KvBlockManagerConfig`, which merges runtime, model, and layout configurations. `NixlOptions` injects remote awareness. + +## Block Layout and Memory Mapping + +Each block is a 2D array `[num_layers][page_size × inner_dim]`. The `BlockLayout` trait abstracts the memory layout. The default implementation, `FullyContiguous`, stores all layers for all blocks in one region with alignment-aware stride computation: + + +```none
block_stride_in_bytes = align_up(num_layers × layer_stride, alignment);
``` + + +Both CPU and GPU pools share this memory layout, but they use storage-specific backends: + +* `DeviceStorage` → CUDA device buffer +* `PinnedStorage` → page-locked host memory +* `SystemStorage` → CPU heap memory (fallback/test) +* `NixlStorage` → remote memory through NIXL RDMA handles (includes storage) + +Each layout is constructed using a `LayoutConfig`, and storage is either passed directly or allocated using a `StorageAllocator`. + +## BlockPool and Memory Pools (Active and Inactive) + +Each `BlockPool<T>` (where `T` is `DeviceStorage`, `PinnedStorage`, and so forth) tracks two sub-pools: + +* `ActivePool`: Contains blocks currently in use by sequences +* `InactivePool`: Recycled blocks ready for allocation; think free list + +When a token block is requested (for example, `get_mutable_block()`), the allocator pops from `InactivePool`, transitions its state, and returns a writable handle. On sequence commit or eviction, the system resets blocks and returns them to the inactive pool. 
+ +The state machine (`BlockState`) that tracks the block lifecycle transitions includes: + +| State | Description | Ownership | Valid Actions/Transitions | +| ----- | ----- | ----- | ----- | +| Reset | Block hasn't been initialized or was reset. No associated sequence. | Held in InactivePool, reusable | init_sequence(salt_hash) → Partial | +| Partial | Block is being filled with tokens for a new sequence. In-progress. | Owned by the sequence creator | add_token() / add_tokens() (accumulate)- commit() → Complete- reset() → Reset | +| Complete | Block is fully filled with token data but not yet visible to others. | Still owned by creator thread | register() → Registered- reset() → Reset | +| Registered | Block is finalized and visible for reuse. Available in the deduplication cache. Can use block for lookups | Shared ownership (global registry) | Auto drop() → triggers Remove event and transitions to Reset | + +This table lists the valid KVBM transitions: + +| From → To | Trigger | Validation | +| ----- | ----- | ----- | +| Reset → Partial | initsequence(salt_hash) | Must not be in use | +| Partial → Complete | commit() | Must be full | +| Complete → Registered | register() | Must be finalized | +| Registered → Reset | Drop of RegistrationHandle | Automatic | +| Partial → Reset | Aborted sequence | Explicit or drop | +| Complete → Reset | Invalidated | Explicit or drop | + +Consider this example lifecycle of a block in the KVBM; in it, a sequence requests a new KV block: + +1. Allocator pops from InactivePool → Block is in Reset +2. `init_sequence()` → Transitions to Partial +3. Tokens are appended → State remains Partial +4. On full → `commit()` → State becomes Complete +5. `register()` → Block is hashed and moved to Registered. Blocks can now be used to lookup. +6. On eviction or end-of-life → `drop()` of RAII handle returns block to Reset + +## Lifecycle Management using RAII and Event Plane + +The system uses RAII for memory lifecycle management. 
Every block holds metadata and registration state, and registration is coupled with an `EventManager`. On registration and drop: + +* `PublishHandle` triggers Register events +* Dropping it triggers Remove events + +This pattern ensures consistency for shared memory tracking across workers without requiring explicit deallocation logic. The events are propagated in the Dynamo Events plane. Any Dynamo component subscribed to the events plane can listen to these changes. Note that even the storage provider can subscribe to the events plane and create an internal prefix tree representation that is tailored and optimized for the specific platform. + +## Remote Memory Integration using NIXL + +The NIXL agent exposes remote memory buffers using `NixlBlockSet`, `RemoteBlocks`, and layout descriptors. Key operations include: + +* `nixl_register()`: Registers memory region with NIXL runtime +* `serialize() / deserialize()`: Converts layout and memory into transferable descriptors +* `import_remote_blockset()`: Loads remote node's block layouts into the manager +* `get_remote_blocks_mutable()`: Fetches transferable memory views from another node + +`RemoteBlocks` is a lightweight abstraction over shared memory for cross-node block usage (through UCX or other backends). + +The left side of the figure in [KVBM Components](kvbm-components.md) illustrates a bidirectional remote memory registration and layout synchronization protocol between workers (for example, Worker 1 and Worker 2) using NIXL. The following steps break down the process: + +1. *Agent Creation & Memory Registration:* + + Each worker independently sets up a NixlAgent: + * Registers its memory regions (that is, device memory) through `nixl_register()`. + * These regions correspond to blocks managed in the local BlockPool. + Once the worker registers the memory, NIXL creates remote-accessible descriptors, which it binds to the memory layout. + +2. 
*Metadata exchange:* + + After memory registration, workers exchange serialized layout metadata, encapsulated in a `SerializedNixlBlockLayout`. + + Why is this step critical? + * LLM inference workloads often differ in *tensor parallel (TP)* configurations: + * Worker 1 might have TP=4, while Worker 2 has TP=8. + * Thus, even if both systems use similar `FullyContiguous` layouts, their internal slicing and alignment assumptions differ. + * The metadata exchange bridges this semantic mismatch by sharing: + * LayoutConfig (num_layers, page_size, inner_dim, dtype) + * BlockSetID + * Base address + stride information (including alignment) + * Device ID + memory type (host/device) + * Once the workers share metadata, each can reconstruct the layout on its side using deserialize(). + This enables NIXL to: + * Understand where each layer/block resides + * Perform correct gather-scatter operations during RDMA-like transfers + + Without this step, remote fetches would result in data corruption or misaligned tokens. + +3. *Serialization & Deserialization: Making Layouts Portable* + + In the serialization stage, KVBM exports and `FullyContiguous::serialize()` encodes: + * FullyContiguousConfig + * base_offset + * Physical memory descriptors (NixlStorage), including: + * Memory type (VRAM, DRAM) + * Address & size + * Device ID + + The system sends this using NIXL transfer and then injects it into a KVBM scheduler state. In the deserialization stage, `SerializedNixlBlockLayout::deserialize()` rehydrates this into: + * A fully reconstructed memory layout view + * Local representation of a remote memory slice with correct offsets and size semantics + It also enables direct access to remote memory with consistent logical semantics + This guarantees that even across different system configurations (hardware or LLM shape), both parties agree on the memory view for each KV block. + +4. 
*Ownership handles and lifetime tracking* + + Memory ownership in NIXL is tightly coupled with RAII-based handles: + * When a block is registered, it returns a `PublishHandle` which wraps a `RegistrationHandle` + * On drop of this handle, an automatic Remove event is published, which: + * Deregisters the block from the NIXL layer + * Removes it from the remote block registry + * This ensures that, once the block is evicted from the cache or no longer used in inference, all references are invalidated cleanly across nodes + This mechanism avoids: + * Stale memory access + * Dangling pointers on GPU or host + * Manual deregistration bugs + The system can batch and publish registration events using a Publisher, optimizing performance under high concurrency + + +## Storage backends and pluggability + +You can integrate KVBM with a storage backend by extending or wrapping `NixlEnabledStorage` to support cross-node RDMA registration. All layouts and block pools are generic over these backends, allowing for fine-grained control over memory tiers. We defer detailed integration guidance, since we collaborate with storage partners to simplify and standardize these integration paths. + +```mermaid +--- +title: Example KVBM System Architecture +--- +flowchart TD + A["Distributed Inference Engine"] --> B["Dynamo KV Block Manager"] + + B --> C["NIXL Storage Agent
- Volume registration
- get()/put() abstraction"] + B --> D["Event Plane
- NATS-based Pub/Sub
- StoreEvent / RemoveEvent"] + + C --> E["G4 Storage Infrastructure
(SSD, Object store, etc.)
- Store KV blocks"] + D --> F["Storage Provider Subscriber
- Parse Events
- Build fast tree/index
- Optimize G4 tiering"] +``` + +For now, the following breakdown provides a high-level understanding of how KVBM interacts with external storage using the NIXL storage interface and the Dynamo Event Plane: + +### NIXL Storage Interface (for Backend Integration) + +The NIXL interface abstracts volume interaction and decouples it from mounting, metadata tracking, or direct system I/O. It provides: + +* registerVolume(descriptor): Register a logical volume for KV cache data. +* unregisterVolume(): Cleanly deregister and release volume mappings. +* get() / put(): Block-level APIs used by KVBM to fetch and store token blocks. + +These abstractions allow backends to be integrated without tying into the host's file system stack, enabling safe interaction with block devices, local filesystems, and RDMA-capable volumes. Please note that these APIs are still being finalized. + +### Dynamo Event Plane (Pub/Sub Coordination Layer) + +To support external storage optimizations without modifying KVBM logic, we provide an **event plane** built on NATS.io that emits lifecycle events for all block operations. Particularly there are two events emitted. + +* StoreEvent: Emitted when a KV block is registered. +* RemoveEvent: Emitted when a KV block is released or evicted. + +Each KVEvent (\~100 bytes) contains: + +* sequence_hash: Unique identifier of the KV block +* prefix_hash: Prefix grouping for query-level aggregation +* block_size: Size in bytes +* storage_location: Logical volume identifier +* event_type: Store or Remove +* extra_metadata: Reserved fields for partner-specific optimization + +For scalability, the system batches and publishes these events periodically (for example, every \~10s, or dynamically based on system load). + +### A conceptual design of a storage advisor + +This section provides an overview for the storage provider who is interested in integrating as a custom backend to KVBM and providing optimized performance. 
**Please note, this is optional for KVBM integration with a backend.** + +External storage systems are not tightly coupled with Dynamo's execution pipeline. Instead, they passively observe KV block lifecycle events through a subscription model: + +* Storage volumes are pre-provisioned and mounted by the storage provider. +* These volumes are then registered with Dynamo through the NIXL Storage Agent using registerVolume() APIs. Dynamo itself does not manage mounts or provisioning. +* The Dynamo KV Block Manager interacts only with logical block-level APIs (that is, get() and put()). +* In parallel, the Event Plane asynchronously broadcasts KV lifecycle events using a NATS-based pub/sub channel. +* Storage vendors implement a lightweight subscriber process that listens to these events without interfering with the KV Manager's runtime behavior. +* This decoupling ensures that external storage systems can optimize block placement and lifecycle tracking without modifying or instrumenting the core Dynamo codebase. + +Now, to enable fast lookup and dynamic tiering, storage vendors may build internal data structures using the received event stream. Here is a high level conceptual design: + +* On receiving a StoreEvent, the storage system: + * Inserts a record into an internal prefix tree, hash map, or LRU index. + * This record includes the prefix_hash and sequence_hash, which logically identify the token block and its grouping. + * Associated metadata (for example, block_size, storage_location) is also captured. +* On receiving a RemoveEvent, the system: + * Deletes or prunes the corresponding record from its index. + * Optionally triggers cleanup or tier migration workflows. + +This event-driven indexing allows the storage system to track which KV blocks are live and where they belong—enabling low-latency lookup, efficient space reclamation, and multi-tier coordination. 
With real-time visibility into KV block usage patterns, the storage system can implement smart tiering policies, such as: + +* Hot block promotion: Frequently accessed KV blocks can be migrated to fast SSD volumes. +* Cold block demotion: Infrequently used blocks can be demoted to slower storage (for example, HDDs, cloud object storage). +* Proactive compaction: If block sizes or prefix patterns indicate fragmentation, the storage backend can coalesce or rewrite blocks. + +These optimizations are performed entirely outside of Dynamo, with the assumption that storage providers adhere to SLA guarantees and volume availability. + +Critically, this entire system is designed to be non-intrusive: + +* The Dynamo KV Block Manager remains agnostic to how data is stored or optimized. +* The Event Plane doesn't block or intercept any critical path of inference. +* Storage vendors are given the freedom to innovate and optimize without requiring changes to the inference runtime. + +This design ensures that performance, resilience, and extensibility scale independently across the KV layer and the storage backend layer. diff --git a/fern/pages/kvbm/kvbm-integrations.md b/fern/pages/kvbm/kvbm-integrations.md new file mode 100644 index 00000000000..5518b683e00 --- /dev/null +++ b/fern/pages/kvbm/kvbm-integrations.md @@ -0,0 +1,30 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KVBM Integrations" +--- + +KVBM Integrates with Inference frameworks (vLLM, TRTLLM, SGLang) via Connector APIs to influence KV caching behaviour, scheduling, and forward pass execution. +There are two components of the interface, Scheduler and Worker. Scheduler(leader) is responsible for the orchestration of KV block offload/onboard, builds metadata specifying transfer data to the workers. It also maintains hooks for handling asynchronous transfer completion. 
Worker is responsible for reading metadata built by the scheduler(leader), does async onboarding/ offloading at the end of the forward pass. + +## Typical KVBM Integrations + +The following figure shows the typical integration of KVBM with inference frameworks (vLLM used as an example) + +![vLLM KVBM Integration ](../../assets/img/kvbm-integrations.png) +**vLLM KVBM Integration** + + +## How to run KVBM with Frameworks +* Instructions to [run KVBM in vLLM](vllm-setup.md) +* Instructions to [run KVBM with TRTLLM](trtllm-setup.md) + +## Onboarding +![Onboarding blocks from Host to Device](../../assets/img/kvbm-onboard-host2device.png) +**Onboarding blocks from Host to Device** +![Onboarding blocks from Disk to Device](../../assets/img/kvbm-onboard-disk2device.png) +**Onboarding blocks from Disk to Device** + +## Offloading +![Offloading blocks from Device to Host&Disk](../../assets/img/kvbm-offload.png) +**Offloading blocks from Device to Host&Disk** diff --git a/fern/pages/kvbm/kvbm-intro.md b/fern/pages/kvbm/kvbm-intro.md new file mode 100644 index 00000000000..d34d2dace1f --- /dev/null +++ b/fern/pages/kvbm/kvbm-intro.md @@ -0,0 +1,30 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KV Block Manager" +--- + +The Dynamo KV Block Manager (KVBM) is a scalable runtime component +designed to handle memory allocation, management, and remote sharing of +Key-Value (KV) blocks for inference tasks across heterogeneous and +distributed environments. It acts as a unified memory layer for +frameworks like vLLM, SGLang, and TRT-LLM. + +It offers: + +- A **unified memory API** that spans GPU memory(in future) , pinned + host memory, remote RDMA-accessible memory, local or distributed pool + of SSDs and remote file/object/cloud storage systems. 
+- Support for evolving **block lifecycles** (allocate → register → + match) with event-based state transitions that storage can subscribe + to. +- Integration with **NIXL**, a dynamic memory exchange layer used for + remote registration, sharing, and access of memory blocks over + RDMA/NVLink. + +The Dynamo KV Block Manager serves as a reference implementation that +emphasizes modularity and extensibility. Its pluggable design enables +developers to customize components and optimize for specific +performance, memory, and deployment needs. + + diff --git a/fern/pages/kvbm/kvbm-motivation.md b/fern/pages/kvbm/kvbm-motivation.md new file mode 100644 index 00000000000..19fd46c8149 --- /dev/null +++ b/fern/pages/kvbm/kvbm-motivation.md @@ -0,0 +1,29 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Motivation behind KVBM" +--- + +Large language models (LLMs) and other AI workloads increasingly rely on KV caches that extend beyond GPU and local CPU memory into remote storage tiers. However, efficiently managing the lifecycle of KV blocks in remote storage presents challenges: + +* Need for a solution tailored to GenAI use cases. +* Lack of visibility into real-time block usage patterns. +* Need for lightweight, ownership-driven memory management over complex object stores with unneeded overheads. +* Need for a modular, memory-safe design with a simplified user experience. +* Inability to differentiate between hot (frequently accessed) and cold (infrequently accessed) blocks across the stack without intrusive application-level changes. +* Difficulty in optimizing storage placement across heterogeneous storage tiers (for example, SSDs, object storage, and cloud storage). + +Conventional systems either lack dynamic feedback mechanisms or require deep integration into core storage paths, which both increases complexity and reduces portability. 
+ +## Benefits of KV Cache offloading + +KV Cache offloading avoids expensive KV Cache recomputation, resulting in faster response times and a better user experience. In the end, providers benefit from higher throughput and lower cost per token, making their inference services more scalable and efficient. + +## When to offload KV Cache for reuse + +Offloading KV cache to CPU or storage is most effective when KV Cache exceeds GPU memory and cache reuse outweighs the overhead of transferring data. It is especially valuable in long-context, high-concurrency, or resource-constrained inference environments such as: + +* **Long sessions and multi-turn conversations:** Offloading preserves large prompt prefixes, avoids recomputation, and improves first-token latency and throughput. +* **High concurrency (future):** Idle or partial conversations can be moved out of GPU memory, allowing active requests to proceed without hitting memory limits. +* **Shared or repeated content (future):** Reuse across users or sessions (for example, system prompts and templates) increases cache hits, especially with remote or cross-instance sharing. +* **Memory- or cost-constrained deployments:** Offloading to RAM or SSD reduces GPU demand, allowing longer prompts or more users without adding hardware. diff --git a/fern/pages/kvbm/kvbm-reading.md b/fern/pages/kvbm/kvbm-reading.md new file mode 100644 index 00000000000..fa3e9a427b2 --- /dev/null +++ b/fern/pages/kvbm/kvbm-reading.md @@ -0,0 +1,9 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "KVBM Further Reading" +--- + +- [vLLM](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching.html) +- [SGLang](https://github.com/sgl-project/sglang/tree/main/benchmark/hicache) +- [EMOGI](https://arxiv.org/abs/2006.06890) \ No newline at end of file diff --git a/fern/pages/kvbm/trtllm-setup.md b/fern/pages/kvbm/trtllm-setup.md new file mode 100644 index 00000000000..3a3850e98ca --- /dev/null +++ b/fern/pages/kvbm/trtllm-setup.md @@ -0,0 +1,200 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Running KVBM in TensorRT-LLM" +--- + +This guide explains how to leverage KVBM (KV Block Manager) to manage KV cache and do KV offloading in TensorRT-LLM (trtllm). + +To learn what KVBM is, please check [here](kvbm-architecture.md) + + +- Ensure that `etcd` and `nats` are running before starting. +- KVBM only supports TensorRT-LLM’s PyTorch backend. +- Disable partial reuse `enable_partial_reuse: false` in the LLM API config’s `kv_connector_config` to increase offloading cache hits. +- KVBM requires TensorRT-LLM v1.1.0rc5 or newer. +- Enabling KVBM metrics with TensorRT-LLM is still a work in progress. 
+ + +## Quick Start + +To use KVBM in TensorRT-LLM, you can follow the steps below: + +```bash +# Start up etcd for KVBM leader/worker registration and discovery +docker compose -f deploy/docker-compose.yml up -d + +# Build a dynamo TRTLLM container (KVBM is built in by default) +./container/build.sh --framework trtllm + +# Launch the container +./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds + +# Configure KVBM cache tiers (choose one of the following options): + +# Option 1: CPU cache only (GPU -> CPU offloading) +# 4 means 4GB of pinned CPU memory would be used +export DYN_KVBM_CPU_CACHE_GB=4 + +# Option 2: Both CPU and Disk cache (GPU -> CPU -> Disk tiered offloading) +export DYN_KVBM_CPU_CACHE_GB=4 +# 8 means 8GB of disk would be used +export DYN_KVBM_DISK_CACHE_GB=8 + +# [Experimental] Option 3: Disk cache only (GPU -> Disk direct offloading, bypassing CPU) +# NOTE: this option is only experimental and it might not give out the best performance. +# NOTE: disk offload filtering is not supported when using this option. +export DYN_KVBM_DISK_CACHE_GB=8 + +# Note: You can also use DYN_KVBM_CPU_CACHE_OVERRIDE_NUM_BLOCKS or +# DYN_KVBM_DISK_CACHE_OVERRIDE_NUM_BLOCKS to specify exact block counts instead of GB +``` + + +When disk offloading is enabled, to extend SSD lifespan, disk offload filtering would be enabled by default. The current policy is only offloading KV blocks from CPU to disk if the blocks have frequency equal or more than `2`. Frequency is determined via doubling on cache hit (init with 1) and decrement by 1 on each time decay step. +To disable disk offload filtering, set `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER` to true or 1. + + +```bash +# write an example LLM API config +# Note: Disable partial reuse "enable_partial_reuse: false" in the LLM API config’s "kv_connector_config" to increase offloading cache hits. 
+cat > "/tmp/kvbm_llm_api_config.yaml" < "/tmp/llm_api_config.yaml" < +Configure or tune KVBM cache tiers (choose one of the following options): +```bash +# Option 1: CPU cache only (GPU -> CPU offloading) +# 4 means 4GB of pinned CPU memory would be used +export DYN_KVBM_CPU_CACHE_GB=4 +# Option 2: Both CPU and Disk cache (GPU -> CPU -> Disk tiered offloading) +export DYN_KVBM_CPU_CACHE_GB=4 +# 8 means 8GB of disk would be used +export DYN_KVBM_DISK_CACHE_GB=8 +# [Experimental] Option 3: Disk cache only (GPU -> Disk direct offloading, bypassing CPU) +# NOTE: this option is only experimental and it might not give out the best performance. +# NOTE: disk offload filtering is not supported when using this option. +export DYN_KVBM_DISK_CACHE_GB=8 +``` +You can also use "DYN_KVBM_CPU_CACHE_OVERRIDE_NUM_BLOCKS" or +"DYN_KVBM_DISK_CACHE_OVERRIDE_NUM_BLOCKS" to specify exact block counts instead of GB + + + +When disk offloading is enabled, to extend SSD lifespan, disk offload filtering would be enabled by default. The current policy is only offloading KV blocks from CPU to disk if the blocks have frequency equal or more than `2`. Frequency is determined via doubling on cache hit (init with 1) and decrement by 1 on each time decay step. +To disable disk offload filtering, set `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER` to true or 1. + + +### Sample Request +```bash +# Make a request to verify vLLM with KVBM is started up correctly +# NOTE: change the model name if served with a different one +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. 
You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." + } + ], + "stream":false, + "max_tokens": 10 + }' +``` + +Alternatively, can use `vllm serve` directly to use KVBM for aggregated serving: +```bash +vllm serve --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "kvbm.vllm_integration.connector"}' Qwen/Qwen3-0.6B +``` + +## Enable and View KVBM Metrics + +Follow below steps to enable metrics collection and view via Grafana dashboard: +```bash +# Start the basic services (etcd & natsd), along with Prometheus and Grafana +docker compose -f deploy/docker-observability.yml up -d + +# Set env var DYN_KVBM_METRICS to true, when launch via dynamo +# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880). 
+# NOTE: update launch/disagg_kvbm.sh or launch/disagg_kvbm_2p2d.sh as needed +DYN_KVBM_METRICS=true \ +DYN_KVBM_CPU_CACHE_GB=20 \ +python -m dynamo.vllm \ + --model Qwen/Qwen3-0.6B \ + --enforce-eager \ + --connector kvbm + +# Optional, if firewall blocks KVBM metrics ports to send prometheus metrics +sudo ufw allow 6880/tcp +``` + +View grafana metrics via http://localhost:3000 (default login: dynamo/dynamo) and look for KVBM Dashboard + +KVBM currently provides following types of metrics out of the box: +- `kvbm_matched_tokens`: The number of matched tokens +- `kvbm_offload_blocks_d2h`: The number of offload blocks from device to host +- `kvbm_offload_blocks_h2d`: The number of offload blocks from host to disk +- `kvbm_offload_blocks_d2d`: The number of offload blocks from device to disk (bypassing host memory) +- `kvbm_onboard_blocks_d2d`: The number of onboard blocks from disk to device +- `kvbm_onboard_blocks_h2d`: The number of onboard blocks from host to device +- `kvbm_host_cache_hit_rate`: Host cache hit rate (0.0-1.0) from sliding window +- `kvbm_disk_cache_hit_rate`: Disk cache hit rate (0.0-1.0) from sliding window + +## Troubleshooting + +1. If enabling KVBM does not show any TTFT perf gain or even perf degradation, one potential reason is not enough prefix cache hit on KVBM to reuse offloaded KV blocks. +To confirm, please enable KVBM metrics as mentioned above and check the grafana dashboard `Onboard Blocks - Host to Device` and `Onboard Blocks - Disk to Device`. +If observed large number of onboarded KV blocks as the example below, we can rule out this cause: +![Grafana Example](../../assets/img/kvbm-metrics-grafana.png) + +2. Allocating large memory and disk storage can take some time and lead to KVBM worker initialization timeout. +To avoid it, please set a longer timeout (default 1800 seconds) for leader–worker initialization. + +```bash +# 3600 means 3600 seconds timeout +export DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS=3600 +``` + +3. 
When offloading to disk is enabled, KVBM could fail to start up if fallocate is not supported to create the files. +To bypass the issue, please use disk zerofill fallback. + +```bash +# Set to true to enable fallback behavior when disk operations fail (e.g. fallocate not available) +export DYN_KVBM_DISK_ZEROFILL_FALLBACK=true +``` + +## Benchmark KVBM + +Once the model is loaded ready, follow below steps to use LMBenchmark to benchmark KVBM performance: +```bash +git clone https://github.com/LMCache/LMBenchmark.git + +# Show case of running the synthetic multi-turn chat dataset. +# We are passing model, endpoint, output file prefix and qps to the sh script. +cd LMBenchmark/synthetic-multi-round-qa +./long_input_short_output_run.sh \ + "Qwen/Qwen3-0.6B" \ + "http://localhost:8000" \ + "benchmark_kvbm" \ + 1 + +# Average TTFT and other perf numbers would be in the output from above cmd +``` +More details about how to use LMBenchmark could be found [here](https://github.com/LMCache/LMBenchmark). + +`NOTE`: if metrics are enabled as mentioned in the above section, you can observe KV offloading, and KV onboarding in the grafana dashboard. + +To compare, you can run `vllm serve Qwen/Qwen3-0.6B` to turn KVBM off as the baseline. + +## Developing Locally + +Inside the Dynamo container, after changing KVBM related code (Rust and/or Python), to test or use it: +```bash +cd /workspace/lib/bindings/kvbm +uv pip install maturin[patchelf] +maturin build --release --out /workspace/dist +uv pip install --upgrade --force-reinstall --no-deps /workspace/dist/kvbm*.whl +``` diff --git a/fern/pages/multimodal/index.md b/fern/pages/multimodal/index.md new file mode 100644 index 00000000000..123b3ed0779 --- /dev/null +++ b/fern/pages/multimodal/index.md @@ -0,0 +1,194 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Multimodal Inference in Dynamo" +--- + +Dynamo supports multimodal inference across multiple LLM backends, enabling models to process images, video, and audio alongside text. This section provides comprehensive documentation for deploying multimodal models. + + +**Security Requirement**: Multimodal processing must be explicitly enabled at startup. +See the relevant documentation for each backend for the necessary flags. +This prevents unintended processing of multimodal data from untrusted sources. + + +## Backend Documentation + + + +## Support Matrix + +### Backend Capabilities + +| Stack | E/PD | E/P/D | EP/D | EPD | Image | Video | Audio | +|-------|------|-------|------|-----|-------|-------|-------| +| **[vLLM](vllm.md)** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🧪 | +| **[TRT-LLM](trtllm.md)** | ❌ | 🚧* | ✅ | ✅ | ✅ | ❌ | ❌ | +| **[SGLang](sglang.md)** | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | + +\* E/P/D supported in TRT-LLM with pre-computed embeddings only; image URL support is WIP ([PR #4668](https://github.com/ai-dynamo/dynamo/pull/4668)) + +**Pattern Key:** + +- **EPD** - All-in-one worker (Simple Aggregated) +- **E/PD** - Separate encode, combined prefill+decode +- **E/P/D** - All stages separate +- **EP/D** - Combined encode+prefill, separate decode + +**Status:** ✅ Supported | 🚧 WIP | 🧪 Experimental | ❌ Not supported + +### Input Format Support + +| Format | vLLM | TRT-LLM | SGLang | +|--------|------|---------|--------| +| HTTP/HTTPS URL | ✅ | ✅ | ✅ | +| Data URL (Base64) | ✅ | ❌ | ❌ | +| Pre-computed Embeddings (.pt) | ❌ | ✅ | ❌ | + +## Architecture Patterns + +Dynamo supports several deployment patterns for multimodal inference based on two dimensions: + +1. **Encoding**: Is media encoding handled inline (within prefill) or by a separate **Encode Worker**? 
+ - *Inline*: Simpler setup, encoding happens in the prefill worker + - *Separate (EPD)*: Dedicated encode worker transfers embeddings via **NIXL (RDMA)**, enabling independent scaling + +2. **Prefill/Decode**: Are prefill and decode in the same worker or separate? + - *Aggregated*: Single worker handles both prefill and decode + - *Disaggregated*: Separate workers for prefill and decode, with KV cache transfer between them + +These combine into four deployment patterns: + +### EPD - Simple Aggregated + +All processing happens within a single worker - the simplest setup. + +```text +HTTP Frontend (Rust) + ↓ +Worker (Python) + ↓ image load + encode + prefill + decode +Response +``` + +| Component | Purpose | +|-----------|---------| +| Frontend (Rust) | HTTP entry point, tokenization, image URL preprocessing | +| Worker | Complete inference pipeline (encode + prefill + decode) | + +**When to use:** Quick setup, smaller models, development/testing. + +### E/PD - Encode Separate + +Encoding happens in a separate worker; prefill and decode share the same engine. + +```text +HTTP Frontend (Rust) + ↓ +Processor (Python) + ↓ tokenizes, extracts media URL +Encode Worker (Python) + ↓ downloads media, generates embeddings, NIXL transfer +PD Worker (Python) + ↓ receives embeddings via NIXL, prefill + decode +Response +``` + +| Component | Purpose | +|-----------|---------| +| Frontend (Rust) | HTTP entry point | +| Processor (Python) | Tokenization, extracts media URLs | +| Encode Worker | Media encoding, embeddings generation | +| PD Worker | Prefill + Decode with embeddings | + +**When to use:** Offload vision encoding to separate GPU, scale encode workers independently. + +### E/P/D - Full Disaggregation + +Full disaggregation with separate workers for encoding, prefill, and decode. 
+There are two variants of this workflow: +- Prefill-first, used by vLLM +- Decode-first, used by SGLang + +Prefill-first: + +```text +HTTP Frontend (Rust) + ↓ +Processor (Python) + ↓ tokenizes, extracts media URL +Encode Worker (Python) + ↓ downloads media, generates embeddings, NIXL transfer +Prefill Worker (Python) + ↓ receives embeddings via NIXL, prefill only, KV cache transfer +Decode Worker (Python) + ↓ decode only, token generation +Response +``` + +OR + +Decode-first: + +```text +HTTP Frontend (Rust) + ↓ +Processor (Python) + ↓ tokenizes, extracts media URL +Encode Worker (Python) + ↓ downloads media, generates embeddings, NIXL transfer +Decode Worker (Python) + ↓ Bootstraps prefill worker +Prefill Worker (Python) + ↓ receives embeddings via NIXL, prefill only, KV cache transfer +Decode Worker (Python) + ↓ decode only, token generation +Response +``` + +| Component | Purpose | +|-----------|---------| +| Frontend (Rust) | HTTP entry point | +| Processor (Python) | Tokenization, extracts media URLs | +| Encode Worker | Media encoding, embeddings generation | +| Prefill Worker | Prefill only, transfers KV cache | +| Decode Worker | Decode only, token generation | + +**When to use:** Maximum optimization, multi-node deployment, independent scaling of each phase. + +### EP/D - Traditional Disaggregated + +Encoding is combined with prefill, with decode separate.
+ +```text +HTTP Frontend (Rust) + ↓ +Processor (Python) + ↓ tokenizes, extracts media URL +Encode+Prefill Worker (Python) + ↓ downloads media, encodes inline, prefill, KV cache transfer +Decode Worker (Python) + ↓ decode only, token generation +Response +``` + +| Component | Purpose | +|-----------|---------| +| Frontend (Rust) | HTTP entry point | +| Processor (Python) | Tokenization, extracts media URLs (vLLM only) | +| Encode+Prefill Worker | Combined encoding and prefill | +| Decode Worker | Decode only, token generation | + +> **Note:** TRT-LLM's EP/D mode skips the Python Processor - the Rust frontend handles tokenization and routes directly to the Prefill worker. +> For multimodal requests, the Python prefill worker still re-tokenizes/builds inputs; Rust token_ids are ignored. + +**When to use:** Models without pre-computed embedding support (Llama 4), or TRT-LLM disaggregated deployment. + +## Example Workflows + +You can find example workflows and reference implementations for deploying multimodal models in: + +- [vLLM multimodal examples](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/launch) +- [TRT-LLM multimodal examples](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/launch) +- [SGLang multimodal examples](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/launch) +- [Advanced multimodal examples](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/launch) (video, audio) diff --git a/fern/pages/multimodal/sglang.md b/fern/pages/multimodal/sglang.md new file mode 100644 index 00000000000..e86136eff42 --- /dev/null +++ b/fern/pages/multimodal/sglang.md @@ -0,0 +1,367 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "SGLang Multimodal" +--- + +This document provides a comprehensive guide for multimodal inference using SGLang backend in Dynamo. 
SGLang multimodal uses specialized **E/PD or E/P/D** flows with **NIXL (RDMA)** for zero-copy tensor transfer. + +## Support Matrix + +| Modality | Input Format | Aggregated | Disaggregated | Notes | +|----------|--------------|------------|---------------|-------| +| **Image** | HTTP/HTTPS URL | Yes | Yes | Vision encoder generates embeddings | +| **Image** | Data URL (Base64) | No | No | | +| **Video** | HTTP/HTTPS URL | No | No | | +| **Audio** | HTTP/HTTPS URL | No | No | | + +### Supported URL Formats + +| Format | Example | Description | +|--------|---------|-------------| +| **HTTP/HTTPS** | `http://example.com/image.jpg` | Remote media files | + +## Deployment Patterns + +SGLang supports E/PD and E/P/D patterns only (always has a separate encode worker). See [Multimodal Architecture Patterns](index.md#architecture-patterns) for detailed explanations. + +| Pattern | Supported | Launch Script | Notes | +|---------|-----------|---------------|-------| +| EPD (Simple Aggregated) | ❌ | N/A | Not supported | +| E/PD (Encode Separate) | ✅ | `multimodal_agg.sh` | Vision encoder separate | +| E/P/D (Full Disaggregation) | ✅ | `multimodal_disagg.sh` | KV cache via bootstrap | +| EP/D (Traditional Disaggregated) | ❌ | N/A | Not supported | + +### Component Flags + +| Component | Flag | Purpose | +|-----------|------|---------| +| Processor | `--multimodal-processor` | HTTP entry, OpenAI→SGLang conversion | +| Encode Worker | `--multimodal-encode-worker` | Vision encoder, embeddings generation | +| PD Worker | `--multimodal-worker` | Prefill + Decode with embeddings | +| Decode Worker | `--multimodal-worker --serving-mode=decode` | Entry point for disaggregation | +| Prefill Worker | `--multimodal-worker --serving-mode=prefill` | Called by Decode, bootstrap coordination | + +### SGLang-Specific Characteristics + +- **Vision Encoder in Python**: Encode worker loads vision model (AutoModel) and image processor (AutoImageProcessor) +- **Token Expansion**: Single 
`<|image_pad|>` token replaced with N tokens based on embedding shape +- **NIXL Transfer**: Embeddings transferred from Encoder → PD Worker using NIXL +- **No Rust Processing**: All tokenization and image handling happens in Python + +## Use the Latest Release + +We recommend using the latest stable release of dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the [latest release](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +## E/PD Serving (Encode Separate) + +### Components + +- workers: + - [MultimodalEncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py) for encoding + - [MultimodalWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py) for prefilling and decoding. +- processor: [MultimodalProcessorHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py) + - tokenizes the prompt using the chat template + - passes the text and image url to the MultimodalEncodeWorker. + +### Workflow + +The `MultimodalEncodeWorker` downloads and encodes the image and passes the embeddings to the MultimodalWorker. The work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface. The `MultimodalWorker` then prefills and decodes the prompt in the same engine, as in the [LLM aggregated serving](../backends/sglang/README.md) example. Only the processor is registered to the Dynamo frontend as an available endpoint. Workers do NOT register - they are internal components and communicate via NATS. 
+ +```mermaid +flowchart LR + HTTP --> processor + processor --tokenized request + image_url--> encode_worker + encode_worker --request + embeddings--> worker + + worker -.-> encode_worker + encode_worker -.-> processor + processor -.-> HTTP +``` + + +### Launch + +```bash +cd $DYNAMO_HOME/examples/backends/sglang +./launch/multimodal_agg.sh +``` + +**Client:** + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Describe the image." + }, + { + "type": "image_url", + "image_url": { + "url": "http://images.cocodataset.org/test2017/000000155781.jpg" + } + } + ] + } + ], + "max_tokens": 50, + "stream": false + }' | jq +``` + +## E/P/D Serving (Full Disaggregation) + +### Components + +- workers: + - [MultimodalEncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py) for encoding + - [MultimodalWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py) for decoding + - [MultimodalPrefillWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py) for prefilling +- processor: [MultimodalProcessorHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py) tokenizes the prompt and passes it to the MultimodalEncodeWorker. + +### Workflow + +In models like Qwen2.5-VL, embeddings are only required during the prefill stage. The image embeddings are transferred via NIXL from the Encode Worker to the Decode Worker (the entry point for disaggregation), which then coordinates with the Prefill Worker. 
The Prefill Worker processes the embeddings and forwards the KV cache back to the Decode Worker for token generation. + +```mermaid +flowchart LR + HTTP --> processor + processor --tokenized request + image_url--> encode_worker + encode_worker --request + embeddings--> worker + worker --request + embeddings--> prefill_worker + + prefill_worker --KV Cache--> worker + encode_worker -.-> processor + worker -.-> encode_worker + processor -.-> HTTP +``` + +### Launch + +```bash +cd $DYNAMO_HOME/examples/backends/sglang +./launch/multimodal_disagg.sh +``` + +**Client:** + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Describe the image." + }, + { + "type": "image_url", + "image_url": { + "url": "http://images.cocodataset.org/test2017/000000155781.jpg" + } + } + ] + } + ], + "max_tokens": 50, + "stream": false + }' | jq +``` + +## Bootstrap Coordination + +SGLang disaggregation uses a bootstrap mechanism for P->D coordination: + +### Request Flow (Important) + +```text +Client → Frontend → Processor → Encode → DECODE Worker → Prefill Worker + ↑ + Entry point for disaggregation! +``` + +### Bootstrap Process + +1. **Decode Worker** receives request from Encode Worker +2. **Decode Worker** calls Prefill Worker via NATS to request bootstrap info +3. **Prefill Worker** generates `{host, port, room}` and returns immediately +4. **Both workers** connect to same "room" using bootstrap coordinates +5. 
**SGLang internally** transfers KV cache state via bootstrap connection (not NIXL) + +### Key Difference from vLLM + +- vLLM: Frontend → Prefill → Decode (Prefill is entry point) +- SGLang: Frontend → Processor → Encode → **Decode → Prefill** (Decode is entry point) + +## Inter-Component Communication + +### Control Flow (NATS) + +All component-to-component communication happens via NATS: + +#### E/PD Mode (Encode Separate) + +```text +Processor → Encode Worker → PD Worker + (NATS) (NATS + NIXL embeddings) +``` + +#### E/P/D Mode (Full Disaggregation) + +```text +Processor → Encode Worker → DECODE Worker → Prefill Worker + (NATS) (NATS) (NATS) + ↓ + Decode requests bootstrap + ↓ + Prefill returns {host, port, room} + ↓ + Both connect via bootstrap + ↓ + SGLang internal KV cache transfer +``` + +### Detailed Message Flow + +```text +Processor → Encode Worker: + - NATS round_robin with SglangMultimodalRequest + - Contains: tokenized input_ids, image URL, sampling params + +Encode Worker → Decode/PD Worker: + - NATS round_robin to "backend" component + - Contains: expanded token_ids, NIXL metadata, embeddings shape + - NIXL transfer: embeddings tensor + +Decode Worker → Prefill Worker (disagg only): + - NATS call to "prefill" component + - Decode requests bootstrap coordinates + - Prefill returns: {bootstrap_host, bootstrap_port, bootstrap_room} + +Prefill ↔ Decode (via bootstrap): + - SGLang internal connection (not NATS) + - KV cache state shared via bootstrap mechanism +``` + +### Data Transfer (NIXL) + +NIXL is used only for embedding transfer: + +```python +# Encode Worker +descriptor = connect.Descriptor(precomputed_embeddings) +with connector.create_readable(descriptor) as readable: + request.serialized_request = readable.metadata() + await pd_worker_client.round_robin(request) + await readable.wait_for_completion() + +# PD Worker +embeddings = torch.empty(request.embeddings_shape, dtype=torch.float16) +descriptor = connect.Descriptor(embeddings) +read_op = 
await connector.begin_read(request.serialized_request, descriptor) +await read_op.wait_for_completion() +``` + +## Vision Encoding Details + +### Encode Worker Components + +The encode worker loads and runs the vision model in Python: + +```python +self.image_processor = AutoImageProcessor.from_pretrained( + model_path, trust_remote_code=True +) +self.vision_model = AutoModel.from_pretrained( + model_path, + device_map="auto", + torch_dtype=torch.float16, + trust_remote_code=True +) +``` + +### Token Expansion Process + +1. Processor inserts single image token (e.g., `<|image_pad|>`) +2. Encode worker generates embeddings: `shape = (batch, num_patches, hidden_dim)` +3. Encode worker replaces single token with `num_patches` tokens +4. Downstream worker receives expanded token sequence + +Example: + +```python +# Before: ["Hello", "<|image_pad|>", "world"] +# After: ["Hello", "<|image_pad|>", "<|image_pad|>", ...(576 tokens), "world"] +``` + +## Chat Template Processing + +SGLang uses its own chat template system: + +```python +from sglang.srt.parser.conversation import chat_templates + +conv = chat_templates["qwen2-vl"].copy() +conv.append_message(conv.roles[0], f"{conv.image_token} Describe this image") +processed = tokenizer(text=conv.get_prompt(), return_tensors="pt") +``` + +Supported templates: `qwen2-vl`, `llama-3`, `vicuna`, etc. + +## NIXL Usage + +| Use Case | NIXL Used? | Data Transfer | Notes | +|----------|------------|---------------|-------| +| E/PD (Encode Separate) | Yes | Encoder → PD (embeddings) | Vision encoder separate | +| E/P/D (Full Disaggregation) | Yes | Encoder → Prefill (embeddings) | KV cache via SGLang bootstrap | + +**Key Difference:** SGLang P/D uses bootstrap mechanism, not NIXL for KV cache like vLLM. 
+ +## Known Limitations + +- **No Data URL support** - Only HTTP/HTTPS URLs supported; `data:image/...` base64 URLs not supported +- **No pre-computed embeddings** - Cannot use `.pt`, `.pth`, `.bin` embedding files; vision encoder runs for every request +- **No video support** - No video encoder implementation +- **No audio support** - No audio encoder implementation +- **Only Processor registers with Dynamo** - Workers are internal components, frontend routes to Processor only +- **Disaggregated routing** - Decode Worker is the entry point (calls Prefill), cannot route directly to Prefill workers +- **Limited model generalization** - Token expansion logic is model-specific; adding new models may require implementation updates + +## Supported Models + +SGLang multimodal **only supports image-based vision-language models**: + +- **Qwen2-VL** / **Qwen2.5-VL** (primary support) +- Models with `AutoImageProcessor` and vision tower +- Models compatible with SGLang's image embedding format + +## Key Files + +| File | Description | +|------|-------------| +| `components/src/dynamo/sglang/main.py` | Component initialization, only Processor registers | +| `components/src/dynamo/sglang/request_handlers/multimodal/processor_handler.py` | Processor implementation, OpenAI→SGLang | +| `components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py` | Vision encoder, embeddings generation | +| `components/src/dynamo/sglang/request_handlers/multimodal/worker_handler.py` | PD/Prefill/Decode workers, NIXL read | +| `components/src/dynamo/sglang/multimodal_utils/multimodal_chat_processor.py` | Chat template processing | +| `components/src/dynamo/sglang/protocol.py` | Request/response data structures | +| `components/src/dynamo/sglang/register.py` | Registration logic (only called for Processor) | diff --git a/fern/pages/multimodal/trtllm.md b/fern/pages/multimodal/trtllm.md new file mode 100644 index 00000000000..674afa73a23 --- /dev/null +++ 
b/fern/pages/multimodal/trtllm.md @@ -0,0 +1,370 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "TensorRT-LLM Multimodal" +--- + +This document provides a comprehensive guide for multimodal inference using TensorRT-LLM backend in Dynamo. + +You can provide multimodal inputs in the following ways: +- By sending image URLs +- By providing paths to pre-computed embedding files + +> **Note:** You should provide **either image URLs or embedding file paths** in a single request. + +## Support Matrix + +| Modality | Input Format | Aggregated | Disaggregated | Notes | +|----------|--------------|------------|---------------|-------| +| **Image** | HTTP/HTTPS URL | Yes | Yes | Full support for all image models | +| **Image** | Pre-computed Embeddings (.pt, .pth, .bin) | Yes | Yes | Direct embedding files | +| **Video** | HTTP/HTTPS URL | No | No | Not implemented | +| **Audio** | HTTP/HTTPS URL | No | No | Not implemented | + +### Supported URL Formats + +| Format | Example | Description | +|--------|---------|-------------| +| **HTTP/HTTPS** | `http://example.com/image.jpg` | Remote media files | +| **Pre-computed Embeddings** | `/path/to/embedding.pt` | Local embedding files (.pt, .pth, .bin) | + +## Deployment Patterns + +TRT-LLM supports aggregated and traditional disaggregated patterns. See [Architecture Patterns](index.md#architecture-patterns) for detailed explanations. 
+ +| Pattern | Supported | Launch Script | Notes | +|---------|-----------|---------------|-------| +| EPD (Simple Aggregated) | ✅ | `agg.sh` | Easiest setup | +| E/PD (Encode Separate) | ❌ | N/A | Not supported | +| E/P/D (Full Disaggregation) | 🚧 WIP | N/A | PR #4668 in progress | +| EP/D (Traditional Disaggregated) | ✅ | `disagg_multimodal.sh` | Prefill handles encoding | + +### Component Flags + +| Component | Flag | Purpose | +|-----------|------|---------| +| Worker | `--modality multimodal` | Complete pipeline (aggregated) | +| Prefill Worker | `--disaggregation-mode prefill` | Image processing + Prefill (multimodal tokenization happens here) | +| Decode Worker | `--disaggregation-mode decode` | Decode only | +| Encode Worker (WIP) | `--disaggregation-mode encode` | Image encoding (E/P/D flow) | + +## Aggregated Serving + +Quick steps to launch Llama-4 Maverick BF16 in aggregated mode: + +```bash +cd $DYNAMO_HOME + +export AGG_ENGINE_ARGS=./examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml +export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct" +export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct" +./examples/backends/trtllm/launch/agg.sh +``` + +**Client:** +```bash +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Describe the image" + }, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png" + } + } + ] + } + ], + "stream": false, + "max_tokens": 160 +}' +``` + +## Disaggregated Serving + +Example using `Qwen/Qwen2-VL-7B-Instruct`: + +```bash +cd $DYNAMO_HOME + +export MODEL_PATH="Qwen/Qwen2-VL-7B-Instruct" +export SERVED_MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct" +export 
PREFILL_ENGINE_ARGS="examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml" +export DECODE_ENGINE_ARGS="examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml" +export MODALITY="multimodal" + +./examples/backends/trtllm/launch/disagg.sh +``` + +```bash +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen2-VL-7B-Instruct", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Describe the image" + }, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png" + } + } + ] + } + ], + "stream": false, + "max_tokens": 160 +}' +``` + +For a large model like `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, a multi-node setup is required for disaggregated serving (see [Multi-node Deployment](#multi-node-deployment-slurm) below), while aggregated serving can run on a single node. This is because the model with a disaggregated configuration is too large to fit on a single node's GPUs. For instance, running this model in disaggregated mode requires 2 nodes with 8xH200 GPUs or 4 nodes with 4xGB200 GPUs. + +## Pre-computed Embeddings with E/P/D Flow + +For high-performance multimodal inference, Dynamo supports pre-computed embeddings with an **Encode-Prefill-Decode (E/P/D)** flow using **NIXL (RDMA)** for zero-copy tensor transfer. + +### Supported File Types + +- `.pt` - PyTorch tensor files +- `.pth` - PyTorch checkpoint files +- `.bin` - Binary tensor files + +### Embedding File Formats + +TRT-LLM supports two formats for embedding files: + +**1. Simple Tensor Format** + +Direct tensor saved as `.pt` file containing only the embedding tensor: + +```python +embedding_tensor = torch.rand(1, 576, 4096) # [batch, seq_len, hidden_dim] +torch.save(embedding_tensor, "embedding.pt") +``` + +**2. 
Dictionary Format with Auxiliary Data** + +Dictionary containing multiple keys, used by models like Llama-4 that require additional metadata: + +```python +embedding_dict = { + "mm_embeddings": torch.rand(1, 576, 4096), + "special_tokens": [128256, 128257], + "image_token_offsets": [[0, 576]], + # ... other model-specific metadata +} +torch.save(embedding_dict, "llama4_embedding.pt") +``` + +- **Simple tensors**: Loaded directly and passed to `mm_embeddings` parameter +- **Dictionary format**: `mm_embeddings` key extracted as main tensor, other keys preserved as auxiliary data + +### How to Launch + +```bash +cd $DYNAMO_HOME/examples/backends/trtllm + +# Launch 3-worker E/P/D flow with NIXL +./launch/epd_disagg.sh +``` + +> **Note:** This script is designed for 8-node H200 with `Llama-4-Scout-17B-16E-Instruct` model and assumes you have a model-specific embedding file ready. + +### Configuration + +```bash +# Encode endpoint for Prefill → Encode communication +export ENCODE_ENDPOINT="dyn://dynamo.tensorrt_llm_encode.generate" + +# Security: Allowed directory for embedding files (default: /tmp) +export ALLOWED_LOCAL_MEDIA_PATH="/tmp" + +# Security: Max file size to prevent DoS attacks (default: 50MB) +export MAX_FILE_SIZE_MB=50 +``` + +### Example Request with Pre-computed Embeddings + +```bash +curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image"}, + {"type": "image_url", "image_url": {"url": "/path/to/embedding.pt"}} + ] + } + ], + "max_tokens": 160 +}' +``` + +### E/P/D Architecture + +The E/P/D flow implements a **3-worker architecture**: + +- **Encode Worker**: Loads pre-computed embeddings, transfers via NIXL +- **Prefill Worker**: Receives embeddings, handles context processing and KV-cache generation +- **Decode Worker**: Performs streaming token generation + +```mermaid 
+sequenceDiagram + participant Client + participant Frontend + participant PrefillWorker as "Prefill Worker" + participant EncodeWorker as "Encode Worker" + participant DecodeWorker as "Decode Worker" + participant NIXL as "NIXL (RDMA)" + + Client->>Frontend: POST /v1/chat/completions + Frontend->>PrefillWorker: Route to prefill worker + PrefillWorker->>EncodeWorker: Send request (embedding paths) + EncodeWorker->>NIXL: Create readable operation + EncodeWorker->>PrefillWorker: Send metadata + NIXL info + PrefillWorker->>NIXL: Begin read operation + NIXL-->>PrefillWorker: Zero-copy transfer complete + PrefillWorker->>Frontend: Return prefill response + Frontend->>DecodeWorker: Route to decode worker + DecodeWorker->>Frontend: Stream response chunks + Frontend->>Client: Stream response +``` + +## Multi-node Deployment (Slurm) + +This section demonstrates how to deploy large multimodal models that require a multi-node setup using Slurm. + +> **Note:** The scripts referenced in this section can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/). 
+ +### Environment Setup + +Assuming you have allocated your nodes via `salloc` and are inside an interactive shell: + +```bash +# Container image (build using docs/backends/trtllm/README.md#build-container) +export IMAGE="" + +# Host:container path pairs for mounting +export MOUNTS="${PWD}/../../../../:/mnt" + +# Model configuration +export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct" +export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct" +export MODALITY=${MODALITY:-"multimodal"} +``` + +### Multi-node Disaggregated Launch + +For 4 4xGB200 nodes (2 for prefill, 2 for decode): + +```bash +# Customize parallelism to match your engine configs +# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml" +# export NUM_PREFILL_NODES=2 +# export NUM_DECODE_NODES=2 +# export NUM_GPUS_PER_NODE=4 + +# Launches frontend + etcd/nats on head node, plus prefill and decode workers +./srun_disaggregated.sh +``` + +### Understanding the Output + +1. `srun_disaggregated.sh` launches three srun jobs: frontend, prefill worker, and decode worker +2. The OpenAI frontend will dynamically discover workers as they register: + ``` + INFO dynamo_run::input::http: Watching for remote model at models + INFO dynamo_llm::http::service::service_v2: Starting HTTP service on: 0.0.0.0:8000 + ``` +3. TRT-LLM workers output progress from each MPI rank while loading +4. When ready, the frontend logs: + ``` + INFO dynamo_llm::discovery::watcher: added model model_name="meta-llama/Llama-4-Maverick-17B-128E-Instruct" + ``` + +### Cleanup + +```bash +pkill srun +``` + +## NIXL Usage + +| Use Case | Script | NIXL Used? 
| Data Transfer | +|----------|--------|------------|---------------| +| EPD (Simple Aggregated) | `agg.sh` | No | All in one worker | +| EP/D (Traditional Disaggregated) | `disagg_multimodal.sh` | Optional | Prefill → Decode (KV cache via UCX or NIXL) | +| E/P/D (pre-computed embeddings) | `epd_disagg.sh` | Yes | Encoder → Prefill (embeddings via NIXL) | +| E/P/D (WIP) | N/A | No | Encoder → Prefill (handles via params), Prefill → Decode (KV cache) | + +> **Note:** NIXL for KV cache transfer is currently beta and only supported on AMD64 (x86_64) architecture. + +## ModelInput Types and Registration + +TRT-LLM workers register with Dynamo using: + +| ModelInput Type | Preprocessing | Use Case | +|-----------------|---------------|----------| +| `ModelInput.Tokens` | Rust frontend may tokenize, but multimodal flows re-tokenize and build inputs in the Python worker; Rust token_ids are ignored | All TRT-LLM workers | + +```python +# TRT-LLM Worker - Register with Tokens +await register_llm( + ModelInput.Tokens, # Rust does minimal preprocessing + model_type, # ModelType.Chat or ModelType.Prefill + generate_endpoint, + model_name, + ... 
+) +``` + +## Inter-Component Communication + +| Transfer Stage | Message | NIXL Transfer | +|----------------|---------|---------------| +| **Frontend → Prefill** | Request with image URL or embedding path | No | +| **Encode → Prefill (pre-computed)** | NIXL metadata | Yes (Embeddings tensor) | +| **Encode → Prefill (Image URL) (WIP)** | Disaggregated params with multimodal handles | No | +| **Prefill → Decode** | Disaggregated params | Configurable (KV cache: NIXL default, UCX optional) | + +## Known Limitations + +- **No Data URL support** - Only HTTP/HTTPS URLs supported; `data:image/...` base64 URLs not supported +- **No video support** - No video encoder implementation +- **No audio support** - No audio encoder implementation +- **Multimodal preprocessing/tokenization happens in Python** - Rust may forward token_ids, but multimodal requests are parsed and re-tokenized in the Python worker +- **E/P/D mode is WIP** - Full E/P/D with image URLs under development +- **Multi-node H100 limitation** - Loading `meta-llama/Llama-4-Maverick-17B-128E-Instruct` with 8 nodes of H100 with TP=16 is not possible due to head count divisibility (`num_attention_heads: 40` not divisible by `tp_size: 16`) + +## Supported Models + +Multimodal models listed in [TensorRT-LLM supported models](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/models/supported-models.md) are supported by Dynamo. 
+ +Common examples: +- Llama 4 Vision models (Maverick, Scout) +- Qwen2-VL models +- Other vision-language models with TRT-LLM support + +## Key Files + +| File | Description | +|------|-------------| +| `components/src/dynamo/trtllm/main.py` | Worker initialization and setup | +| `components/src/dynamo/trtllm/utils/trtllm_utils.py` | Command-line argument parsing | +| `components/src/dynamo/trtllm/multimodal_processor.py` | Multimodal request processing | +| `components/src/dynamo/trtllm/request_handlers/handlers.py` | Request handler factory | +| `components/src/dynamo/trtllm/request_handlers/handler_base.py` | Base handler and disaggregation modes | + diff --git a/fern/pages/multimodal/vllm.md b/fern/pages/multimodal/vllm.md new file mode 100644 index 00000000000..8c0d9c0e4ed --- /dev/null +++ b/fern/pages/multimodal/vllm.md @@ -0,0 +1,512 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "vLLM Multimodal" +--- + +This document provides a comprehensive guide for multimodal inference using vLLM backend in Dynamo. + + +**Security Requirement**: All multimodal workers require the `--enable-multimodal` flag to be explicitly set at startup. This is a security feature to prevent unintended processing of multimodal data from untrusted sources. Workers will fail at startup if multimodal flags (e.g., `--multimodal-worker`, `--multimodal-processor`) are used without `--enable-multimodal`. +This flag is analogous to `--enable-mm-embeds` in vllm serve but also extends it to all multimodal content (url, embeddings, b64). 
+ + +## Support Matrix + +| Modality | Input Format | Aggregated | Disaggregated | Notes | +|----------|--------------|------------|---------------|-------| +| **Image** | HTTP/HTTPS URL | Yes | Yes | Full support for all image models | +| **Image** | Data URL (Base64) | Yes | Yes | Inline base64-encoded images | +| **Video** | HTTP/HTTPS URL | Yes | Yes | Frame extraction and processing | +| **Audio** | HTTP/HTTPS URL | Yes | Yes | Experimental - requires audio dependencies | + +### Supported URL Formats + +| Format | Example | Description | +|--------|---------|-------------| +| **HTTP/HTTPS** | `http://example.com/image.jpg` | Remote media files | +| **Data URL** | `data:image/jpeg;base64,/9j/4AAQ...` | Base64-encoded inline data | + +## Deployment Patterns + +vLLM supports all multimodal deployment patterns. See [Architecture Patterns](index.md#architecture-patterns) for detailed explanations. + +| Pattern | Supported | Launch Script | Notes | +|---------|-----------|---------------|-------| +| EPD (Simple Aggregated) | ✅ | `agg_multimodal.sh` | Easiest setup | +| E/PD (Encode Separate) | ✅ | `agg_multimodal_epd.sh` | Separate encode worker | +| E/P/D (Full Disaggregation) | ✅ | `disagg_multimodal_epd.sh` | All stages separate | +| EP/D (Traditional Disaggregated) | ✅ | `disagg_multimodal_llama.sh` | For Llama 4 models | +| E/PD (EC Connector) | ✅ | `agg_multimodal_ec_connector.sh` | vLLM-native encoder with ECConnector | + +### Component Flags + +| Component | Flag | Purpose | +|-----------|------|---------| +| Processor | `--multimodal-processor` | HTTP entry, tokenization | +| Encode Worker | `--multimodal-encode-worker` | Media encoding | +| PD Worker | `--multimodal-worker` | Prefill + Decode | +| Prefill Worker | `--multimodal-worker --is-prefill-worker` | Prefill only | +| Decode Worker | `--multimodal-decode-worker` | Decode only | +| Encode+Prefill Worker | `--multimodal-encode-prefill-worker --is-prefill-worker` | Combined (Llama 4) | +| vLLM Native 
Encoder | `--vllm-native-encoder-worker` | vLLM-native encoding with ECConnector | + +## Use the Latest Release + +We recommend using the latest stable release of dynamo to avoid breaking changes: + +[![GitHub Release](https://img.shields.io/github/v/release/ai-dynamo/dynamo)](https://github.com/ai-dynamo/dynamo/releases/latest) + +You can find the [latest release](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with: + +```bash +git checkout $(git describe --tags $(git rev-list --tags --max-count=1)) +``` + +## Image Serving + +### E/PD Serving (Encode Separate) + +**Components:** + +- workers: [EncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) for encoding and [MultimodalPDWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) for prefilling and decoding. +- processor: Tokenizes the prompt and passes it to the EncodeWorkerHandler. +- frontend: HTTP endpoint to handle incoming requests. + +**Workflow:** + +The EncodeWorkerHandler encodes the image and passes the embeddings to the MultimodalPDWorkerHandler via NATS and RDMA. The work complete event is sent via NATS, while the embeddings tensor is transferred via RDMA through the NIXL interface. + +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --image_url--> encode_worker + encode_worker --> processor + encode_worker --embeddings--> pd_worker + pd_worker --> encode_worker +``` + +> **Note:** Aggregated serving supports LLaVA 1.5 7B and Qwen2.5-VL-7B-Instruct. Disaggregated serving is currently only confirmed for LLaVA. 
+ +**Launch:** + +```bash +cd $DYNAMO_HOME/examples/backends/vllm +# Serve a LLaVA 1.5 7B model: +bash launch/agg_multimodal_epd.sh --model llava-hf/llava-1.5-7b-hf +# Serve a Qwen2.5-VL model: +bash launch/agg_multimodal_epd.sh --model Qwen/Qwen2.5-VL-7B-Instruct +``` + +**Client:** + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llava-hf/llava-1.5-7b-hf", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "http://images.cocodataset.org/test2017/000000155781.jpg" + } + } + ] + } + ], + "max_tokens": 300, + "temperature": 0.0, + "stream": false + }' +``` + +### E/P/D Serving (Full Disaggregation) + +**Components:** + +- workers: [EncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) for encoding, [MultimodalDecodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) for decoding, and [MultimodalPDWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) for prefilling. +- processor: Tokenizes the prompt and passes it to the EncodeWorkerHandler. +- frontend: HTTP endpoint to handle incoming requests. + +**Workflow:** + +For the LLaVA model, embeddings are only required during the prefill stage. The EncodeWorkerHandler is connected directly to the prefill worker, encoding the image and passing embeddings via NATS and RDMA. The prefill worker performs the prefilling step and forwards the KV cache to the decode worker. 
+ +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --image_url--> encode_worker + encode_worker --> processor + encode_worker --embeddings--> prefill_worker + prefill_worker --> encode_worker + prefill_worker --> decode_worker + decode_worker --> prefill_worker +``` + +**Launch:** + +```bash +cd $DYNAMO_HOME/examples/backends/vllm +bash launch/disagg_multimodal_epd.sh --model llava-hf/llava-1.5-7b-hf +``` + + +Disaggregation is currently only confirmed to work with LLaVA. Qwen2.5-VL is not confirmed to be supported. + + +## ECConnector Serving + +ECConnector is vLLM's native connector for transferring multimodal embeddings via an Embedding Cache. The encoder worker acts as a **producer** (writes embeddings), while the PD worker acts as a **consumer** (reads embeddings). + +**Workflow:** + +```mermaid +flowchart LR + HTTP --> processor[EC Processor] + processor --image_url--> encoder[vLLM Native Encoder
Producer] + encoder --writes--> cache[(Embedding Cache)] + cache --reads--> pd[PD Worker
Consumer] + pd --> processor + processor --> HTTP +``` + +**Launch:** + +```bash +cd $DYNAMO_HOME/examples/backends/vllm +bash launch/agg_multimodal_ec_connector.sh --model llava-hf/llava-1.5-7b-hf + +# Custom storage path for Embedding Cache +bash launch/agg_multimodal_ec_connector.sh --ec-storage-path /shared/encoder-cache +``` + +**Client:** Same as [E/PD Serving](#epd-serving-encode-separate) + +## Llama 4 Serving + +The Llama 4 model family is natively multimodal. Unlike LLaVA, they do not directly consume image embeddings as input (see the [vLLM support matrix](https://docs.vllm.ai/en/latest/models/supported_models.html#text-generation_1)). Therefore, the encoder worker is not used and encoding is done alongside prefill. + +Example model: `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` on H100x8. + +### Llama 4 Aggregated Serving + +**Workflow:** + +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --image_url--> pd_worker + pd_worker --> processor +``` + +**Launch:** + +```bash +cd $DYNAMO_HOME/examples/backends/vllm +bash launch/agg_multimodal_llama.sh +``` + +**Client:** + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "http://images.cocodataset.org/test2017/000000155781.jpg" + } + } + ] + } + ], + "max_tokens": 300, + "temperature": 0.0, + "stream": false + }' +``` + +### Llama 4 Disaggregated Serving + +**Workflow:** + +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --image_url--> prefill_worker + prefill_worker --> processor + prefill_worker --> decode_worker + decode_worker --> prefill_worker +``` + +**Launch:** + +```bash +cd $DYNAMO_HOME/examples/backends/vllm +bash launch/disagg_multimodal_llama.sh --head-node + +# On a separate node with NATS_SERVER and ETCD_ENDPOINTS pointing to head node: +cd $DYNAMO_HOME/examples/backends/vllm +bash launch/disagg_multimodal_llama.sh +``` + +## Video Serving + +### Video Aggregated Serving + +**Components:** + +- workers: [VideoEncodeWorker](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/components/video_encode_worker.py) for decoding video into frames, and [VllmPDWorker](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/components/worker.py) for prefilling and decoding. +- processor: Tokenizes the prompt and passes it to the VideoEncodeWorker. +- frontend: HTTP endpoint to handle incoming requests. + +**Workflow:** + +The VideoEncodeWorker decodes the video into frames. Unlike the image pipeline which generates embeddings, this pipeline passes raw frames directly to the VllmPDWorker via NATS and RDMA. 
+ +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --video_url--> video_encode_worker + video_encode_worker --> processor + video_encode_worker --frames--> pd_worker + pd_worker --> video_encode_worker +``` + +**Launch:** + +```bash +cd $DYNAMO_HOME/examples/multimodal +bash launch/video_agg.sh +``` + +**Client:** + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llava-hf/LLaVA-NeXT-Video-7B-hf", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Describe the video in detail" + }, + { + "type": "video_url", + "video_url": { + "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4" + } + } + ] + } + ], + "max_tokens": 300, + "stream": false + }' | jq +``` + +### Video Disaggregated Serving + +**Workflow:** + +For the LLaVA-NeXT-Video-7B model, frames are only required during the prefill stage. The VideoEncodeWorker is connected directly to the prefill worker, decoding the video into frames and passing them via RDMA. + +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --video_url--> video_encode_worker + video_encode_worker --> processor + video_encode_worker --frames--> prefill_worker + prefill_worker --> video_encode_worker + prefill_worker --> decode_worker + decode_worker --> prefill_worker +``` + +**Launch:** + +```bash +cd $DYNAMO_HOME/examples/multimodal +bash launch/video_disagg.sh +``` + +## Audio Serving + +### Audio Aggregated Serving + +**Components:** + +- workers: [AudioEncodeWorker](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/components/audio_encode_worker.py) for decoding audio into embeddings, and [VllmPDWorker](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal/components/worker.py) for prefilling and decoding. +- processor: Tokenizes the prompt and passes it to the AudioEncodeWorker. 
+- frontend: HTTP endpoint to handle incoming requests. + +**Workflow:** + +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --audio_url--> audio_encode_worker + audio_encode_worker --> processor + audio_encode_worker --embeddings--> pd_worker + pd_worker --> audio_encode_worker +``` + +**Launch:** + +```bash +pip install vllm["audio"] accelerate # multimodal audio models dependency +cd $DYNAMO_HOME/examples/multimodal +bash launch/audio_agg.sh +``` + +**Client:** + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2-Audio-7B-Instruct", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is recited in the audio?" + }, + { + "type": "audio_url", + "audio_url": { + "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav" + } + } + ] + } + ], + "max_tokens": 6000, + "temperature": 0.8, + "stream": false + }' | jq +``` + +### Audio Disaggregated Serving + +**Workflow:** + +For the Qwen2-Audio model, audio embeddings are only required during the prefill stage. The AudioEncodeWorker is connected directly to the prefill worker. + +```mermaid +flowchart LR + HTTP --> processor + processor --> HTTP + processor --audio_url--> audio_encode_worker + audio_encode_worker --> processor + audio_encode_worker --embeddings--> prefill_worker + prefill_worker --> audio_encode_worker + prefill_worker --> decode_worker + decode_worker --> prefill_worker +``` + +**Launch:** + +```bash +pip install vllm["audio"] accelerate # multimodal audio models dependency +cd $DYNAMO_HOME/examples/multimodal +bash launch/audio_disagg.sh +``` + +## NIXL Usage + +| Use Case | Script | NIXL Used? 
| Data Transfer | +|----------|--------|------------|---------------| +| EPD (Simple Aggregated) | `agg_multimodal.sh` | No | All in one worker | +| E/PD (Encode Separate) | `agg_multimodal_epd.sh` | Yes | Encoder → PD (embeddings) | +| E/P/D (Full Disaggregation) | `disagg_multimodal_epd.sh` | Yes | Encoder → Prefill (embeddings), Prefill → Decode (KV cache) | +| EP/D (Llama 4) | `disagg_multimodal_llama.sh` | Yes | Prefill → Decode (KV cache) | +| E/PD (EC Connector) | `agg_multimodal_ec_connector.sh` | No | ECConnector via Embedding Cache | + +## ModelInput Types and Registration + +Dynamo's Rust SDK supports two input types that determine how the HTTP frontend preprocesses requests: + +| ModelInput Type | Preprocessing | Use Case | +|-----------------|---------------|----------| +| `ModelInput.Text` | None (raw text passed through) | Components that tokenize themselves | +| `ModelInput.Tokens` | Rust SDK would tokenize (but bypassed in multimodal) | Components expecting pre-tokenized input | + +**Registration Pattern:** + +```python +# Processor - Entry point from HTTP frontend +await register_llm( + ModelInput.Text, # Frontend sends raw text + ModelType.Chat, + generate_endpoint, + model_name, + ... +) + +# Workers - Internal components +await register_llm( + ModelInput.Tokens, # Expect pre-tokenized input + ModelType.Chat, # or ModelType.Prefill for prefill workers + generate_endpoint, + model_name, + ... +) +``` + +## Known Limitations + +- **Disaggregated flows require Python Processor** - All multimodal disaggregation requires the Python Processor component (`ModelInput.Text`). 
+ +## Supported Models + +The following models have been tested with Dynamo's vLLM multimodal backend: + +- **Qwen2.5-VL** - `Qwen/Qwen2.5-VL-7B-Instruct` +- **Qwen3-VL** - `Qwen/Qwen3-VL-30B-A3B-Instruct-FP8` +- **LLaVA 1.5** - `llava-hf/llava-1.5-7b-hf` +- **Llama 4 Maverick** - `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` +- **LLaVA Next Video** - `llava-hf/LLaVA-NeXT-Video-7B-hf` +- **Qwen2-Audio** - `Qwen/Qwen2-Audio-7B-Instruct` + +For a complete list of multimodal models supported by vLLM, see [vLLM Supported Multimodal Models](https://docs.vllm.ai/en/latest/models/supported_models/#list-of-multimodal-language-models). Models listed there should work with Simple Aggregated Mode but may not be explicitly tested. + +## Key Files + +| File | Description | +|------|-------------| +| `components/src/dynamo/vllm/main.py` | Worker initialization and setup | +| `components/src/dynamo/vllm/args.py` | Command-line argument parsing | +| `components/src/dynamo/vllm/multimodal_handlers/processor_handler.py` | Processor implementation | +| `components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py` | Encode worker implementations (custom and vLLM-native) | +| `components/src/dynamo/vllm/multimodal_handlers/worker_handler.py` | PD/Prefill/Decode worker implementation | diff --git a/fern/pages/observability/README.md b/fern/pages/observability/README.md new file mode 100644 index 00000000000..3e6e2e29944 --- /dev/null +++ b/fern/pages/observability/README.md @@ -0,0 +1,95 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Observability" +--- + +## Getting Started Quickly + +This is an example to get started quickly on a single machine. 
+ +### Prerequisites + +Install these on your machine: + +- [Docker](https://docs.docker.com/get-docker/) +- [Docker Compose](https://docs.docker.com/compose/install/) + +### Starting the Observability Stack + +Dynamo provides a Docker Compose-based observability stack that includes Prometheus, Grafana, Tempo, and various exporters for metrics, tracing, and visualization. + +From the Dynamo root directory: + +```bash +# Start infrastructure (NATS, etcd) +docker compose -f deploy/docker-compose.yml up -d + +# Start observability stack (Prometheus, Grafana, Tempo, DCGM GPU exporter, NATS exporter) +docker compose -f deploy/docker-observability.yml up -d +``` + +For detailed setup instructions and configuration, see [Prometheus + Grafana Setup](prometheus-grafana.md). + +## Observability Documentation + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics](metrics.md) | Available metrics reference | `DYN_SYSTEM_PORT`† | +| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | +| [Tracing](tracing.md) | Distributed tracing with OpenTelemetry and Tempo | `DYN_LOGGING_JSONL`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_SERVICE_NAME`† | +| [Logging](logging.md) | Structured logging configuration | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`† | + +**Variables marked with † are shared across multiple observability systems.** + +## Developer Guides + +| Guide | Description | Environment Variables to Control | +|-------|-------------|----------------------------------| +| [Metrics Developer Guide](metrics-developer-guide.md) | Creating custom metrics in Rust and Python | 
`DYN_SYSTEM_PORT`† | + +## Kubernetes + +For Kubernetes-specific setup and configuration, see [Kubernetes Metrics](../kubernetes/observability/metrics.md). + +--- + +## Topology + +This provides: +- **Prometheus** on `http://localhost:9090` - metrics collection and querying +- **Grafana** on `http://localhost:3000` - visualization dashboards (username: `dynamo`, password: `dynamo`) +- **Tempo** on `http://localhost:3200` - distributed tracing backend +- **DCGM Exporter** on `http://localhost:9401/metrics` - GPU metrics +- **NATS Exporter** on `http://localhost:7777/metrics` - NATS messaging metrics + +### Service Relationship Diagram +```mermaid +graph TD + BROWSER[Browser] -->|:3000| GRAFANA[Grafana :3000] + subgraph DockerComposeNetwork [Network inside Docker Compose] + NATS_PROM_EXP[nats-prom-exp :7777 /metrics] -->|:8222/varz| NATS_SERVER[nats-server :4222, :6222, :8222] + PROMETHEUS[Prometheus server :9090] -->|:2379/metrics| ETCD_SERVER[etcd-server :2379, :2380] + PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401] + PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP + PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000] + PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081] + DYNAMOFE --> DYNAMOBACKEND + GRAFANA -->|:9090/query API| PROMETHEUS + end +``` + +The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM. 
+ +### Configuration Files + +The following configuration files are located in the `deploy/` and `deploy/observability/` directories: +- [docker-compose.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml): Defines NATS and etcd services +- [docker-observability.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, and exporters +- [prometheus.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/prometheus.yml): Contains Prometheus scraping configuration +- [grafana-datasources.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration +- [grafana_dashboards/dashboard-providers.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana_dashboards/dashboard-providers.yml): Contains Grafana dashboard provider configuration +- [grafana_dashboards/dynamo.json](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana_dashboards/dynamo.json): A general Dynamo Dashboard for both SW and HW metrics +- [grafana_dashboards/dcgm-metrics.json](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana_dashboards/dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics +- [grafana_dashboards/kvbm.json](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana_dashboards/kvbm.json): Contains Grafana dashboard configuration for KVBM metrics + diff --git a/fern/pages/observability/health-checks.md b/fern/pages/observability/health-checks.md new file mode 100644 index 00000000000..fd57984cf06 --- /dev/null +++ b/fern/pages/observability/health-checks.md @@ -0,0 +1,343 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Health Checks" +--- + +## Overview + +Dynamo provides health check and liveness HTTP endpoints for each component, which +can be used to configure startup, liveness and readiness probes in +orchestration frameworks such as Kubernetes. + +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System status server port | `8081` | `9090` | +| `DYN_SYSTEM_STARTING_HEALTH_STATUS` | Initial health status | `notready` | `ready`, `notready` | +| `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` | +| `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` | +| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` | +| `DYN_HEALTH_CHECK_ENABLED` | Enable canary health checks | `false` (K8s: `true`) | `true`, `false` | +| `DYN_CANARY_WAIT_TIME` | Seconds before sending canary health check | `10` | `5`, `30` | +| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Health check request timeout in seconds | `3` | `5`, `10` | + +## Getting Started Quickly + +Enable health checks and query endpoints: + +```bash +# Start your Dynamo components (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +python -m dynamo.frontend & + +# Enable system status server on port 8081 +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Check health status: + +```bash +# Frontend health (port 8000) +curl -s localhost:8000/health | jq + +# Worker health (port 8081) +curl -s localhost:8081/health | jq +``` + +## Frontend Liveness Check + +The frontend liveness endpoint reports a status of `live` as long as +the service is running. + +> **Note**: Frontend liveness doesn't depend on worker health or liveness — it depends only on the Frontend service itself. 
+ +### Example Request + +``` +curl -s localhost:8000/live -q | jq +``` + +### Example Response + +``` +{ + "message": "Service is live", + "status": "live" +} +``` + +## Frontend Health Check + +The frontend health endpoint reports a status of `healthy` as long as +the service is running. Once workers have been registered, the +`health` endpoint will also list registered endpoints and instances. + +> **Note**: Frontend liveness doesn't depend on worker health or liveness — it depends only on the Frontend service itself. + +### Example Request + +``` +curl -v localhost:8000/health -q | jq +``` + +### Example Response + +Before workers are registered: + +``` +HTTP/1.1 200 OK +content-type: application/json +content-length: 72 +date: Wed, 03 Sep 2025 13:31:44 GMT + +{ + "instances": [], + "message": "No endpoints available", + "status": "unhealthy" +} +``` + +After workers are registered: + +``` +HTTP/1.1 200 OK +content-type: application/json +content-length: 609 +date: Wed, 03 Sep 2025 13:32:03 GMT + +{ + "endpoints": [ + "dyn://dynamo.backend.generate" + ], + "instances": [ + { + "component": "backend", + "endpoint": "clear_kv_blocks", + "instance_id": 7587888160958628000, + "namespace": "dynamo", + "transport": { + "nats_tcp": "dynamo_backend.clear_kv_blocks-694d98147d54be25" + } + }, + { + "component": "backend", + "endpoint": "generate", + "instance_id": 7587888160958628000, + "namespace": "dynamo", + "transport": { + "nats_tcp": "dynamo_backend.generate-694d98147d54be25" + } + }, + { + "component": "backend", + "endpoint": "load_metrics", + "instance_id": 7587888160958628000, + "namespace": "dynamo", + "transport": { + "nats_tcp": "dynamo_backend.load_metrics-694d98147d54be25" + } + } + ], + "status": "healthy" +} +``` + +## Worker Liveness and Health Check + +Health checks for components other than the frontend are enabled +selectively based on environment variables. 
If a health check for a +component is enabled, the starting status can be set along with the set +of endpoints that are required to be served before the component is +declared `ready`. + +Once all endpoints declared in `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` +are served, the component transitions to a `ready` state until the +component is shut down. The endpoints return an HTTP status code of `HTTP/1.1 503 Service Unavailable` +when initializing and an HTTP status code of `HTTP/1.1 200 OK` once ready. + +> **Note**: Both /live and /ready return the same information. + +### Example Environment Setting + +``` +export DYN_SYSTEM_PORT=9090 +export DYN_SYSTEM_STARTING_HEALTH_STATUS="notready" +export DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS="[\"generate\"]" +``` + +#### Example Request + +``` +curl -v localhost:9090/health | jq +``` + +#### Example Response +Before endpoints are being served: + +``` +HTTP/1.1 503 Service Unavailable +content-type: text/plain; charset=utf-8 +content-length: 96 +date: Wed, 03 Sep 2025 13:42:39 GMT + +{ + "endpoints": { + "generate": "notready" + }, + "status": "notready", + "uptime": { + "nanos": 313803539, + "secs": 12 + } +} +``` + +After endpoints are being served: + +``` +HTTP/1.1 200 OK +content-type: text/plain; charset=utf-8 +content-length: 139 +date: Wed, 03 Sep 2025 13:42:45 GMT + +{ + "endpoints": { + "clear_kv_blocks": "ready", + "generate": "ready", + "load_metrics": "ready" + }, + "status": "ready", + "uptime": { + "nanos": 356504530, + "secs": 18 + } +} +``` + +## Canary Health Checks (Active Monitoring) + +In addition to the HTTP endpoints described above, Dynamo includes a **canary health check** system that actively monitors worker endpoints. 
+ +### Overview + +The canary health check system: +- **Monitors endpoint health** by sending periodic test requests to worker endpoints +- **Only activates during idle periods** - if there's ongoing traffic, health checks are skipped to avoid overhead +- **Automatically enabled in Kubernetes** deployments via the operator +- **Disabled by default** in local/development environments + +### How It Works + +1. **Idle Detection**: After no activity on an endpoint for a configurable wait time (default: 10 seconds), a canary health check is triggered +2. **Health Check Request**: A lightweight test request is sent to the endpoint with a minimal payload (generates 1 token) +3. **Activity Resets Timer**: If normal requests arrive, the canary timer resets and no health check is sent +4. **Timeout Handling**: If a health check doesn't respond within the timeout (default: 3 seconds), the endpoint is marked as unhealthy + +### Configuration + +#### In Kubernetes (Enabled by Default) + +Health checks are automatically enabled by the Dynamo operator. No additional configuration is required. 
+ +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment +spec: + services: + VllmWorker: + componentType: worker + replicas: 2 + # Health checks automatically enabled by operator +``` + +#### In Local/Development Environments (Disabled by Default) + +To enable health checks locally: + +```bash +# Enable health checks +export DYN_HEALTH_CHECK_ENABLED=true + +# Optional: Customize timing +export DYN_CANARY_WAIT_TIME=5 # Wait 5 seconds before sending health check +export DYN_HEALTH_CHECK_REQUEST_TIMEOUT=5 # 5 second timeout + +# Start worker +python -m dynamo.vllm --model Qwen/Qwen3-0.6B +``` + +#### Configuration Options + +| Environment Variable | Description | Default | Notes | +|---------------------|-------------|---------|-------| +| `DYN_HEALTH_CHECK_ENABLED` | Enable/disable canary health checks | `false` (K8s: `true`) | Automatically set to `true` in K8s | +| `DYN_CANARY_WAIT_TIME` | Seconds to wait (during idle) before sending health check | `10` | Lower values = more frequent checks | +| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Max seconds to wait for health check response | `3` | Higher values = more tolerance for slow responses | + +### Health Check Payloads + +Each backend defines its own minimal health check payload: + +- **vLLM**: Single token generation with minimal sampling options +- **TensorRT-LLM**: Single token with BOS token ID +- **SGLang**: Single token generation request + +These payloads are designed to: +- Complete quickly (< 100ms typically) +- Minimize GPU overhead +- Verify the full inference stack is working + +### Observing Health Checks + +When health checks are enabled, you'll see logs like: + +``` +INFO Health check manager started (canary_wait_time: 10s, request_timeout: 3s) +INFO Spawned health check task for endpoint: generate +INFO Canary timer expired for generate, sending health check +INFO Health check successful for generate +``` + +If an endpoint fails: + +``` +WARN Health check 
timeout for generate +ERROR Health check request failed for generate: connection refused +``` + +### When to Use Canary Health Checks + +**Enable in production (Kubernetes):** +- ✅ Detect unhealthy workers before they affect user traffic +- ✅ Enable faster failure detection and recovery +- ✅ Monitor worker availability continuously + +**Disable in development:** +- ✅ Reduce log noise during debugging +- ✅ Avoid overhead when not needed +- ✅ Simplify local testing + +### Troubleshooting + +**Health checks timing out:** +- Increase `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` +- Check worker logs for errors +- Verify network connectivity + +**Too many health check logs:** +- Increase `DYN_CANARY_WAIT_TIME` to reduce frequency +- Or disable with `DYN_HEALTH_CHECK_ENABLED=false` in dev + +**Health checks not running:** +- Verify `DYN_HEALTH_CHECK_ENABLED=true` is set +- Check that `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` includes the endpoint +- Ensure the worker is serving the endpoint + +## Related Documentation + +- [Distributed Runtime Architecture](../design-docs/distributed-runtime.md) +- [Dynamo Architecture Overview](../design-docs/architecture.md) +- [Backend Guide](../development/backend-guide.md) diff --git a/fern/pages/observability/logging.md b/fern/pages/observability/logging.md new file mode 100644 index 00000000000..123b9b59ff3 --- /dev/null +++ b/fern/pages/observability/logging.md @@ -0,0 +1,260 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Logging" +--- + +## Overview + +Dynamo provides structured logging in both text as well as JSONL. When +JSONL is enabled logs additionally contain `span` creation and exit +events as well as support for `trace_id` and `span_id` fields for +distributed tracing. 
+ +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format | `false` | `true` | +| `DYN_LOG` | Log levels per target `<level>,<target>=<level>,<target>=<level>` | `info` | `DYN_LOG=info,dynamo_runtime::system_status_server:trace` | +| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for timestamps (default is UTC) | `false` | `true` | +| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration | none | `/path/to/config.toml` | +| `OTEL_SERVICE_NAME` | Service name for trace and span information | `dynamo` | `dynamo-frontend` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint | `http://localhost:4317` | `http://tempo:4317` | + +## Getting Started Quickly + +### Start Observability Stack + +For collecting and visualizing logs with Grafana Loki (Kubernetes), or viewing trace context in logs alongside Grafana Tempo, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions. + +### Enable Structured Logging + +Enable structured JSONL logging: + +```bash +export DYN_LOGGING_JSONL=true +export DYN_LOG=debug + +# Start your Dynamo components (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +python -m dynamo.frontend & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +Logs will be written to stderr in JSONL format with trace context. 
+ +## Available Logging Levels + +| **Logging Levels (Least to Most Verbose)** | **Description** | +|-------------------------------------------|---------------------------------------------------------------------------------| +| **ERROR** | Critical errors (e.g., unrecoverable failures, resource exhaustion) | +| **WARN** | Unexpected or degraded situations (e.g., retries, recoverable errors) | +| **INFO** | Operational information (e.g., startup/shutdown, major events) | +| **DEBUG** | General debugging information (e.g., variable values, flow control) | +| **TRACE** | Very low-level, detailed information (e.g., internal algorithm steps) | + +## Example Readable Format + +Environment Setting: + +``` +export DYN_LOG="info,dynamo_runtime::system_status_server:trace" +export DYN_LOGGING_JSONL="false" +``` + +Resulting Log format: + +``` +2025-09-02T15:50:01.770028Z INFO main.init: VllmWorker for Qwen/Qwen3-0.6B has been initialized +2025-09-02T15:50:01.770195Z INFO main.init: Reading Events from tcp://127.0.0.1:21555 +2025-09-02T15:50:01.770265Z INFO main.init: Getting engine runtime configuration metadata from vLLM engine... 
+2025-09-02T15:50:01.770316Z INFO main.get_engine_cache_info: Cache config values: {'num_gpu_blocks': 24064} +2025-09-02T15:50:01.770358Z INFO main.get_engine_cache_info: Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048} +``` + +## Example JSONL Format + +Environment Setting: + +``` +export DYN_LOG="info,dynamo_runtime::system_status_server:trace" +export DYN_LOGGING_JSONL="true" +``` + +Resulting Log format: + +``` +{"time":"2025-09-02T15:53:31.943377Z","level":"INFO","target":"log","message":"VllmWorker for Qwen/Qwen3-0.6B has been initialized","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":191,"log.target":"main.init"} +{"time":"2025-09-02T15:53:31.943550Z","level":"INFO","target":"log","message":"Reading Events from tcp://127.0.0.1:26771","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":212,"log.target":"main.init"} +{"time":"2025-09-02T15:53:31.943636Z","level":"INFO","target":"log","message":"Getting engine runtime configuration metadata from vLLM engine...","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":220,"log.target":"main.init"} +{"time":"2025-09-02T15:53:31.943701Z","level":"INFO","target":"log","message":"Cache config values: {'num_gpu_blocks': 24064}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":267,"log.target":"main.get_engine_cache_info"} +{"time":"2025-09-02T15:53:31.943747Z","level":"INFO","target":"log","message":"Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":268,"log.target":"main.get_engine_cache_info"} +``` + +## Logging of Trace and Span IDs + +When `DYN_LOGGING_JSONL` is enabled, all logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. 
This is useful for short debugging sessions where you want to examine trace context in logs without setting up a full tracing backend and for correlating log messages with traces. + +The trace and span information uses the OpenTelemetry format and libraries, which means the IDs are compatible with OpenTelemetry-based tracing backends like Tempo or Jaeger if you later choose to enable trace export. + +**Note:** This section has overlap with [Distributed Tracing with Tempo](tracing.md). For trace visualization in Grafana Tempo and persistent trace analysis, see [Distributed Tracing with Tempo](tracing.md). + +### Configuration for Logging + +To see trace information in logs: + +```bash +export DYN_LOGGING_JSONL=true +export DYN_LOG=debug # Set to debug to see detailed trace logs + +# Start your Dynamo components (e.g., frontend and worker) (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +python -m dynamo.frontend & +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & +``` + +This enables JSONL logging with `trace_id` and `span_id` fields. Traces appear in logs but are not exported to any backend. + +### Example Request + +Send a request to generate logs with trace context: + +```bash +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] +}' \ +http://localhost:8000/v1/chat/completions +``` + +Check the logs (stderr) for JSONL output containing `trace_id`, `span_id`, and `x_request_id` fields. + +## Trace and Span Information in Logs + +This section shows how trace and span information appears in JSONL logs. These logs can be used to understand request flows even without a trace visualization backend. 
+ +### Example Disaggregated Trace in Grafana + +When viewing the corresponding trace in Grafana, you should be able to see something like the following: + +![Disaggregated Trace Example](../../assets/img/grafana-disagg-trace.png) + +### Trace Overview + +| Attribute | Value | +|-----------|-------| +| **Trace ID** | b672ccf48683b392891c5cb4163d4b51 | +| **Start Time** | 2025-10-31 13:52:10.706 | +| **Duration** | 4.04s | +| **Request** | `POST /v1/chat/completions` | + +### Root Span (Frontend): `http-request` + +| Attribute | Value | +|-----------|-------| +| **Service** | frontend | +| **Span ID** | 5c20cc08e6afb2b7 | +| **Duration** | 4.04s | +| **Start Time** | 13:52:10.706 | +| **Status** | unset | +| **Method** | POST | +| **URI** | `/v1/chat/completions` | +| **HTTP Version** | HTTP/1.1 | +| **Parent ID** | (none) | +| **Child Count** | 2 | +| **Busy Time** | 18,101,350 ns (18.10ms) | +| **Idle Time** | 4,022,100,356 ns (4.02s) | + +### Child Span (Prefill): `handle_payload` + +| Attribute | Value | +|-----------|-------| +| **Service** | prefill | +| **Duration** | 39.65ms | +| **Start Time** | 13:52:10.707 | +| **Status** | unset | +| **Component** | prefill | +| **Endpoint** | generate | +| **Namespace** | vllm-disagg | +| **Instance ID** | 3866790875219207267 | +| **Trace ID** | b672ccf48683b392891c5cb4163d4b51 | +| **Parent ID** | 5c20cc08e6afb2b7 | +| **Busy Time** | 613,633 ns (0.61ms) | +| **Idle Time** | 36,340,242 ns (36.34ms) | + +### Child Span (Decode): `handle_payload` + +| Attribute | Value | +|-----------|-------| +| **Service** | decode | +| **Duration** | 4s | +| **Start Time** | 13:52:10.745 | +| **Status** | unset | +| **Component** | backend | +| **Endpoint** | generate | +| **Namespace** | vllm-disagg | +| **Instance ID** | 3866790875219207263 | +| **Trace ID** | b672ccf48683b392891c5cb4163d4b51 | +| **Parent ID** | 5c20cc08e6afb2b7 | +| **Busy Time** | 3,795,258 ns (3.79ms) | +| **Idle Time** | 3,996,532,471 ns (3.99s) | + +### 
Frontend Logs with Trace Context + +The following shows the JSONL logs from the frontend service for the same request. Note the `trace_id` field (`b672ccf48683b392891c5cb4163d4b51`) that correlates all logs for this request, and the `span_id` field that identifies individual operations: + +``` +{"time":"2025-10-31T20:52:07.707164Z","level":"INFO","file":"/opt/dynamo/lib/runtime/src/logging.rs","line":806,"target":"dynamo_runtime::logging","message":"OTLP export enabled","endpoint":"http://tempo.tm.svc.cluster.local:4317","service":"frontend"} +{"time":"2025-10-31T20:52:10.707164Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} +{"time":"2025-10-31T20:52:10.745264Z","level":"DEBUG","file":"/opt/dynamo/lib/llm/src/kv_router/prefill_router.rs","line":232,"target":"dynamo_llm::kv_router::prefill_router","message":"Prefill succeeded, using disaggregated params for decode","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} +{"time":"2025-10-31T20:52:10.745545Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"5c20cc08e6afb2b7","span_name":"http-request","trace_id":"b672ccf48683b392891c5cb4163d4b51","uri":"/v1/chat/completions","version":"HTTP/1.1"} +``` + +## Custom Request IDs in Logs + +You can provide a custom request ID using the `x-request-id` header. 
This ID will be attached to all spans and logs for that request, making it easier to correlate traces with application-level request tracking. + +### Example Request with Custom Request ID + +```sh +curl -X POST http://localhost:8000/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -H 'x-request-id: 8372eac7-5f43-4d76-beca-0a94cfb311d0' \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time" + } + ], + "stream": false, + "max_tokens": 1000 + }' +``` + +All spans and logs for this request will include the `x_request_id` attribute with value `8372eac7-5f43-4d76-beca-0a94cfb311d0`. + +### Frontend Logs with Custom Request ID + +Notice how the `x_request_id` field appears in all log entries, alongside the `trace_id` (`80196f3e3a6fdf06d23bb9ada3788518`) and `span_id`: + +``` +{"time":"2025-10-31T21:06:45.397194Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"f7e487a9d2a6bf38","span_name":"http-request","trace_id":"80196f3e3a6fdf06d23bb9ada3788518","uri":"/v1/chat/completions","version":"HTTP/1.1","x_request_id":"8372eac7-5f43-4d76-beca-0a94cfb311d0"} +{"time":"2025-10-31T21:06:45.418584Z","level":"DEBUG","file":"/opt/dynamo/lib/llm/src/kv_router/prefill_router.rs","line":232,"target":"dynamo_llm::kv_router::prefill_router","message":"Prefill succeeded, using disaggregated params for decode","method":"POST","span_id":"f7e487a9d2a6bf38","span_name":"http-request","trace_id":"80196f3e3a6fdf06d23bb9ada3788518","uri":"/v1/chat/completions","version":"HTTP/1.1","x_request_id":"8372eac7-5f43-4d76-beca-0a94cfb311d0"} 
+{"time":"2025-10-31T21:06:45.418854Z","level":"DEBUG","file":"/opt/dynamo/lib/runtime/src/pipeline/network/tcp/server.rs","line":230,"target":"dynamo_runtime::pipeline::network::tcp::server","message":"Registering new TcpStream on 10.0.4.65:41959","method":"POST","span_id":"f7e487a9d2a6bf38","span_name":"http-request","trace_id":"80196f3e3a6fdf06d23bb9ada3788518","uri":"/v1/chat/completions","version":"HTTP/1.1","x_request_id":"8372eac7-5f43-4d76-beca-0a94cfb311d0"} +``` + + + +## Related Documentation + +- [Distributed Runtime Architecture](../design-docs/distributed-runtime.md) +- [Dynamo Architecture Overview](../design-docs/architecture.md) +- [Backend Guide](../development/backend-guide.md) +- [Log Aggregation in Kubernetes](../kubernetes/observability/logging.md) diff --git a/fern/pages/observability/metrics-developer-guide.md b/fern/pages/observability/metrics-developer-guide.md new file mode 100644 index 00000000000..c51af202da6 --- /dev/null +++ b/fern/pages/observability/metrics-developer-guide.md @@ -0,0 +1,268 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Metrics Developer Guide" +--- + +This guide explains how to create and use custom metrics in Dynamo components using the Dynamo metrics API. 
+ +## Metrics Exposure + +All metrics created via the Dynamo metrics API are automatically exposed on the `/metrics` HTTP endpoint in Prometheus Exposition Format text when the following environment variable is set: + +- `DYN_SYSTEM_PORT=<port>` - Port for the metrics endpoint (set to positive value to enable, default: `-1` disabled) + +Example: +```bash +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model <model_name> +``` + +Prometheus Exposition Format text metrics will be available at: `http://localhost:8081/metrics` + +## Metric Name Constants + +The [prometheus_names.rs](https://github.com/ai-dynamo/dynamo/tree/main/lib/runtime/src/metrics/prometheus_names.rs) module provides centralized metric name constants and sanitization functions to ensure consistency across all Dynamo components. + +--- + +## Metrics API in Rust + +The metrics API is accessible through the `.metrics()` method on runtime, namespace, component, and endpoint objects. See [Runtime Hierarchy](metrics.md#runtime-hierarchy) for details on the hierarchical structure. 
+ +### Available Methods + +- `.metrics().create_counter()`: Create a counter metric +- `.metrics().create_gauge()`: Create a gauge metric +- `.metrics().create_histogram()`: Create a histogram metric +- `.metrics().create_countervec()`: Create a counter with labels +- `.metrics().create_gaugevec()`: Create a gauge with labels +- `.metrics().create_histogramvec()`: Create a histogram with labels + +### Creating Metrics + +```rust +use dynamo_runtime::DistributedRuntime; + +let runtime = DistributedRuntime::new()?; +let endpoint = runtime.namespace("my_namespace").component("my_component").endpoint("my_endpoint"); + +// Simple metrics +let requests_total = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[] +)?; + +let active_connections = endpoint.metrics().create_gauge( + "active_connections", + "Active connections", + &[] +)?; + +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 0.1, 1.0, 10.0]) +)?; +``` + +### Using Metrics + +```rust +// Counters +requests_total.inc(); + +// Gauges +active_connections.set(42.0); +active_connections.inc(); +active_connections.dec(); + +// Histograms +latency.observe(0.023); // 23ms +``` + +### Vector Metrics with Labels + +```rust +// Create vector metrics with label names +let requests_by_model = endpoint.metrics().create_countervec( + "requests_by_model", + "Requests by model", + &["model_type", "model_size"], + &[] +)?; + +let memory_by_gpu = endpoint.metrics().create_gaugevec( + "gpu_memory_bytes", + "GPU memory by device", + &["gpu_id", "memory_type"], + &[] +)?; + +// Use with specific label values +requests_by_model.with_label_values(&["llama", "7b"]).inc(); +memory_by_gpu.with_label_values(&["0", "allocated"]).set(8192.0); +``` + +### Advanced Features + +**Custom histogram buckets:** +```rust +let latency = endpoint.metrics().create_histogram( + "latency_seconds", + "Request latency", + &[], + Some(vec![0.001, 0.01, 
0.1, 1.0, 10.0]) +)?; +``` + +**Constant labels:** +```rust +let counter = endpoint.metrics().create_counter( + "requests_total", + "Total requests", + &[("region", "us-west"), ("env", "prod")] +)?; +``` + +--- + +## Metrics API in Python + +Python components can create and manage Prometheus metrics using the same metrics API through Python bindings. + +### Available Methods + +- `endpoint.metrics.create_counter()` / `create_intcounter()`: Create a counter metric +- `endpoint.metrics.create_gauge()` / `create_intgauge()`: Create a gauge metric +- `endpoint.metrics.create_histogram()`: Create a histogram metric +- `endpoint.metrics.create_countervec()` / `create_intcountervec()`: Create a counter with labels +- `endpoint.metrics.create_gaugevec()` / `create_intgaugevec()`: Create a gauge with labels +- `endpoint.metrics.create_histogramvec()`: Create a histogram with labels + +All metrics are imported from `dynamo.prometheus_metrics`. + +### Creating Metrics + +```python +from dynamo.runtime import DistributedRuntime + +drt = DistributedRuntime() +endpoint = drt.namespace("my_namespace").component("my_component").endpoint("my_endpoint") + +# Simple metrics +requests_total = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests" +) + +active_connections = endpoint.metrics.create_intgauge( + "active_connections", + "Active connections" +) + +latency = endpoint.metrics.create_histogram( + "latency_seconds", + "Request latency", + buckets=[0.001, 0.01, 0.1, 1.0, 10.0] +) +``` + +### Using Metrics + +```python +# Counters +requests_total.inc() +requests_total.inc_by(5) + +# Gauges +active_connections.set(42) +active_connections.inc() +active_connections.dec() + +# Histograms +latency.observe(0.023) # 23ms +``` + +### Vector Metrics with Labels + +```python +# Create vector metrics with label names +requests_by_model = endpoint.metrics.create_intcountervec( + "requests_by_model", + "Requests by model", + ["model_type", "model_size"] +) + +memory_by_gpu 
= endpoint.metrics.create_intgaugevec( + "gpu_memory_bytes", + "GPU memory by device", + ["gpu_id", "memory_type"] +) + +# Use with specific label values +requests_by_model.inc({"model_type": "llama", "model_size": "7b"}) +memory_by_gpu.set(8192, {"gpu_id": "0", "memory_type": "allocated"}) +``` + +### Advanced Features + +**Constant labels:** +```python +counter = endpoint.metrics.create_intcounter( + "requests_total", + "Total requests", + [("region", "us-west"), ("env", "prod")] +) +``` + +**Metric introspection:** +```python +print(counter.name()) # "my_namespace_my_component_my_endpoint_requests_total" +print(counter.const_labels()) # {"dynamo_namespace": "my_namespace", ...} +print(gauge_vec.variable_labels()) # ["model_type", "model_size"] +``` + +**Update patterns:** + +Background thread updates: +```python +import threading +import time + +def update_loop(): + while True: + active_connections.set(compute_current_connections()) + time.sleep(2) + +threading.Thread(target=update_loop, daemon=True).start() +``` + +Callback-based updates (called before each `/metrics` scrape): +```python +def update_metrics(): + active_connections.set(compute_current_connections()) + +endpoint.metrics.register_callback(update_metrics) +``` + +### Examples + +Example scripts: [lib/bindings/python/examples/metrics/](https://github.com/ai-dynamo/dynamo/tree/main/lib/bindings/python/examples/metrics/) + +```bash +cd ~/dynamo/lib/bindings/python/examples/metrics +DYN_SYSTEM_PORT=8081 ./server_with_loop.py +DYN_SYSTEM_PORT=8081 ./server_with_callback.py +``` + +--- + +## Related Documentation + +- [Metrics Overview](metrics.md) +- [Prometheus and Grafana Setup](prometheus-grafana.md) +- [Distributed Runtime Architecture](../design-docs/distributed-runtime.md) +- [Python Metrics Examples](https://github.com/ai-dynamo/dynamo/tree/main/lib/bindings/python/examples/metrics/) + diff --git a/fern/pages/observability/metrics.md b/fern/pages/observability/metrics.md new file mode 100644 
index 00000000000..bcc03ac17c2 --- /dev/null +++ b/fern/pages/observability/metrics.md @@ -0,0 +1,235 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Metrics" +--- + +## Overview + +Dynamo provides built-in metrics capabilities through the Dynamo metrics API, which is automatically available whenever you use the `DistributedRuntime` framework. This document serves as a reference for all available metrics in Dynamo. + +**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](prometheus-grafana.md). + +**For creating custom metrics**, see the [Metrics Developer Guide](metrics-developer-guide.md). + +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | Backend component metrics/health port | `-1` (disabled) | `8081` | +| `DYN_HTTP_PORT` | Frontend HTTP port (also configurable via `--http-port` flag) | `8000` | `8000` | + +## Getting Started Quickly + +This is a single machine example. + +### Start Observability Stack + +For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions. 
+ + +### Launch Dynamo Components + +Launch a frontend and vLLM backend to test metrics: + +```bash +# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +$ python -m dynamo.frontend + +# Enable backend worker's system metrics on port 8081 +$ DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B \ + --enforce-eager --no-enable-prefix-caching --max-num-seqs 3 +``` + +Wait for the vLLM worker to start, then send requests and check metrics: + +```bash +# Send a request +curl -H 'Content-Type: application/json' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [{"role": "user", "content": "Hello"}] +}' \ +http://localhost:8000/v1/chat/completions + +# Check metrics from the backend worker +curl -s localhost:8081/metrics | grep dynamo_component +``` + +## Exposed Metrics + +Dynamo exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All Dynamo-generated metrics use the `dynamo_*` prefix and include labels (`dynamo_namespace`, `dynamo_component`, `dynamo_endpoint`) to identify the source component. 
+ +**Example Prometheus Exposition Format text:** + +``` +# HELP dynamo_component_requests_total Total requests processed +# TYPE dynamo_component_requests_total counter +dynamo_component_requests_total{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 + +# HELP dynamo_component_request_duration_seconds Request processing time +# TYPE dynamo_component_request_duration_seconds histogram +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.005"} 10 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="0.01"} 15 +dynamo_component_request_duration_seconds_bucket{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate",le="+Inf"} 42 +dynamo_component_request_duration_seconds_sum{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 2.5 +dynamo_component_request_duration_seconds_count{dynamo_namespace="default",dynamo_component="worker",dynamo_endpoint="generate"} 42 +``` + +### Metric Categories + +Dynamo exposes several categories of metrics: + +- **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements +- **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime +- **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics +- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm_*`) + +## Runtime Hierarchy + +The Dynamo metrics API is available on `DistributedRuntime`, `Namespace`, `Component`, and `Endpoint`, providing a hierarchical approach to metric collection that matches Dynamo's distributed 
architecture: + +- `DistributedRuntime`: Global metrics across the entire runtime +- `Namespace`: Metrics scoped to a specific dynamo_namespace +- `Component`: Metrics for a specific dynamo_component within a namespace +- `Endpoint`: Metrics for individual dynamo_endpoint within a component + +This hierarchical structure allows you to create metrics at the appropriate level of granularity for your monitoring needs. + +## Available Metrics + +### Backend Component Metrics + +**Backend workers** (`python -m dynamo.vllm`, `python -m dynamo.sglang`, etc.) expose `dynamo_component_*` metrics on port 8081 by default (configurable via `DYN_SYSTEM_PORT`). + +The core Dynamo backend system automatically exposes metrics on the system status port (default: 8081, configurable via `DYN_SYSTEM_PORT`) at the `/metrics` endpoint with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework: + +- `dynamo_component_inflight_requests`: Requests currently being processed (gauge) +- `dynamo_component_request_bytes_total`: Total bytes received in requests (counter) +- `dynamo_component_request_duration_seconds`: Request processing time (histogram) +- `dynamo_component_requests_total`: Total requests processed (counter) +- `dynamo_component_response_bytes_total`: Total bytes sent in responses (counter) +- `dynamo_component_uptime_seconds`: DistributedRuntime uptime (gauge) + +**Access backend component metrics:** +```bash +# Default port 8081 +curl http://localhost:8081/metrics + +# Or with custom port +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model +curl http://localhost:8081/metrics +``` + +### KV Router Statistics (kvstats) + +KV router statistics are automatically exposed by LLM workers and KV router components on the backend system status port (port 8081) with the `dynamo_component_kvstats_*` prefix. 
These metrics provide insights into GPU memory usage and cache efficiency: + +- `dynamo_component_kvstats_active_blocks`: Number of active KV cache blocks currently in use (gauge) +- `dynamo_component_kvstats_total_blocks`: Total number of KV cache blocks available (gauge) +- `dynamo_component_kvstats_gpu_cache_usage_percent`: GPU cache usage as a percentage (0.0-1.0) (gauge) +- `dynamo_component_kvstats_gpu_prefix_cache_hit_rate`: GPU prefix cache hit rate as a percentage (0.0-1.0) (gauge) + +These metrics are published by: +- **LLM Workers**: vLLM and TRT-LLM backends publish these metrics through their respective publishers +- **KV Router**: The KV router component aggregates and exposes these metrics for load balancing decisions + +### Specialized Component Metrics + +Some components expose additional metrics specific to their functionality: + +- `dynamo_preprocessor_*`: Metrics specific to preprocessor components + +### Frontend Metrics + +**Important:** The frontend and backend workers are separate components that expose metrics on different ports. See [Backend Component Metrics](#backend-component-metrics) for backend metrics. + +The Dynamo HTTP Frontend (`python -m dynamo.frontend`) exposes `dynamo_frontend_*` metrics on port 8000 by default (configurable via `--http-port` or `DYN_HTTP_PORT`) at the `/metrics` endpoint. 
Most metrics include `model` labels containing the model name: + +- `dynamo_frontend_inflight_requests`: Inflight requests (gauge) +- `dynamo_frontend_queued_requests`: Number of requests in HTTP processing queue (gauge) +- `dynamo_frontend_disconnected_clients`: Number of disconnected clients (gauge) +- `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram) +- `dynamo_frontend_cached_tokens`: Number of cached tokens (prefix cache hits) per request (histogram) +- `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram) +- `dynamo_frontend_output_sequence_tokens`: Output sequence length (histogram) +- `dynamo_frontend_output_tokens_total`: Total number of output tokens generated (counter) +- `dynamo_frontend_request_duration_seconds`: LLM request duration (histogram) +- `dynamo_frontend_requests_total`: Total LLM requests (counter) +- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram) +- `dynamo_frontend_model_migration_total`: Total number of request migrations due to worker unavailability (counter, labels: `model`, `migration_type`) + +**Access frontend metrics:** +```bash +curl http://localhost:8000/metrics +``` + +**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time. + +#### Model Configuration Metrics + +The frontend also exposes model configuration metrics (on port 8000 `/metrics` endpoint) with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system. All model configuration metrics include a `model` label. 
+ +**Runtime Config Metrics (from ModelRuntimeConfig):** +These metrics come from the runtime configuration provided by worker backends during registration. + +- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge) +- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge) + +**MDC Metrics (from ModelDeploymentCard):** +These metrics come from the Model Deployment Card information provided by worker backends during registration. Note that when multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates. + +- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge) +- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge) +- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge) + +### Request Processing Flow + +This section explains the distinction between two key metrics used to track request processing: + +1. **Inflight**: Tracks requests from HTTP handler start until the complete response is finished +2. **HTTP Queue**: Tracks requests from HTTP handler start until first token generation begins (including prefill time) + +**Example Request Flow:** +``` +curl -s localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "Hello let's talk about LLMs", + "stream": false, + "max_tokens": 1000 +}' +``` + +**Timeline:** +``` +Timeline: 0, 1, ... 
+Client ────> Frontend:8000 ────────────────────> Dynamo component/backend (vLLM, SGLang, TRT) + │request start │received │ + | | | + │ ├──> start prefill ──> first token ──> |last token + │ │ (not impl) | | + ├─────actual HTTP queue¹ ──────────┘ │ | + │ │ │ + ├─────implemented HTTP queue ─────────────────────────────┘ | + │ │ + └─────────────────────────────────── Inflight ────────────────────────────┘ +``` + +**Concurrency Example:** +Suppose the backend allows 3 concurrent requests and there are 10 clients continuously hitting the frontend: +- All 10 requests will be counted as inflight (from start until complete response) +- 7 requests will be in HTTP queue most of the time +- 3 requests will be actively processed (between first token and last token) + +**Key Differences:** +- **Inflight**: Measures total request lifetime including processing time +- **HTTP Queue**: Measures queuing time before processing begins (including prefill time) +- **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time) + +## Related Documentation + +- [Distributed Runtime Architecture](../design-docs/distributed-runtime.md) +- [Dynamo Architecture Overview](../design-docs/architecture.md) +- [Backend Guide](../development/backend-guide.md) diff --git a/fern/pages/observability/prometheus-grafana.md b/fern/pages/observability/prometheus-grafana.md new file mode 100644 index 00000000000..3fd4a40b47d --- /dev/null +++ b/fern/pages/observability/prometheus-grafana.md @@ -0,0 +1,111 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Metrics Visualization with Prometheus and Grafana" +--- + +## Overview + +This guide shows how to set up Prometheus and Grafana for visualizing Dynamo metrics on a single machine for demo purposes. 
+ +![Grafana Dynamo Dashboard](../../assets/img/grafana-dynamo-composite.png) + +**Components:** +- **Prometheus Server** - Collects and stores metrics from Dynamo services +- **Grafana** - Provides dashboards by querying the Prometheus Server + +**For metrics reference**, see [Metrics Documentation](metrics.md). + +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` | + +## Getting Started Quickly + +This is a single machine example. + +### Start the Observability Stack + +Start the observability stack (Prometheus, Grafana, Tempo, exporters). See [Observability Getting Started](README.md#getting-started-quickly) for instructions and prerequisites. + +### Start Dynamo Components + +Start frontend and worker (a simple single GPU example): + +```bash +# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +python -m dynamo.frontend & + +# Start vLLM worker with metrics enabled on port 8081 +DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager +``` + +After the workers are running, send a few test requests to populate metrics in the system: + +```bash +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello"}], + "max_completion_tokens": 100 + }' +``` + +After sending a few requests, the Prometheus Exposition Format text metrics are available at: +- Frontend: `http://localhost:8000/metrics` +- Backend worker: `http://localhost:8081/metrics` + +### Access Web Interfaces + +Once Dynamo components are running: + +1. Open **Grafana** at `http://localhost:3000` (username: `dynamo`, password: `dynamo`) +2. Click on **Dashboards** in the left sidebar +3. 
Select **Dynamo Dashboard** to view metrics and traces + +Other interfaces: +- **Prometheus**: `http://localhost:9090` +- **Tempo** (tracing): Accessible through Grafana's Explore view. See [Tracing Guide](tracing.md) for details. + +**Note:** If accessing from another machine, replace `localhost` with the machine's hostname or IP address, and ensure firewall rules allow access to these ports (3000, 9090). + +--- + +## Configuration + +### Prometheus + +The Prometheus configuration is specified in [prometheus.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint. + +Please be aware that you might need to modify the target settings to align with your specific host configuration and network environment. + +After making changes to prometheus.yml, restart the Prometheus service. See [Observability Getting Started](README.md#getting-started-quickly) for Docker Compose commands. + +### Grafana + +Grafana is pre-configured with: +- Prometheus datasource +- Sample dashboard for visualizing service metrics + +### Troubleshooting + +1. Verify services are running using `docker compose ps` + +2. Check logs using `docker compose logs` + +3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. + +4. If you encounter issues with stale data or configuration, stop services and wipe volumes using `docker compose down -v` then restart. + + **Note:** The `-v` flag removes named volumes (grafana-data, tempo-data), which will reset dashboards and stored metrics. + +For specific Docker Compose commands, see [Observability Getting Started](README.md#getting-started-quickly). 
+ +## Developer Guide + +For detailed information on creating custom metrics in Dynamo components, see: + +- [Metrics Developer Guide](metrics-developer-guide.md) diff --git a/fern/pages/observability/tracing.md b/fern/pages/observability/tracing.md new file mode 100644 index 00000000000..94832efd3f9 --- /dev/null +++ b/fern/pages/observability/tracing.md @@ -0,0 +1,213 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Distributed Tracing with Tempo" +--- + +## Overview + +Dynamo supports OpenTelemetry-based distributed tracing for visualizing request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana. + +**Requirements:** Set `DYN_LOGGING_JSONL=true` and `OTEL_EXPORT_ENABLED=true` to export traces to Tempo. + +This guide covers single GPU demo setup using Docker Compose. For Kubernetes deployments, see [Kubernetes Deployment](#kubernetes-deployment). + +**Note:** This section has overlap with [Logging of OpenTelemetry Tracing](logging.md) since OpenTelemetry has aspects of both logging and tracing. The tracing approach documented here is for persistent trace visualization and analysis. For short debugging sessions examining trace context directly in logs, see the [Logging](logging.md) guide. + +## Environment Variables + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `false` | `true` | +| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `false` | `true` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` | +| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend` | + +## Getting Started Quickly + +### 1. 
Start Observability Stack + +Start the observability stack (Prometheus, Grafana, Tempo, exporters). See [Observability Getting Started](README.md#getting-started-quickly) for instructions. + +### 2. Set Environment Variables + +Configure Dynamo components to export traces: + +```bash +# Enable JSONL logging and tracing +export DYN_LOGGING_JSONL=true +export OTEL_EXPORT_ENABLED=true +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 +``` + +### 3. Start Dynamo Components (Single GPU) + +For a simple single-GPU deployment, start the frontend and a single vLLM worker: + +```bash +# Start the frontend with tracing enabled (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +export OTEL_SERVICE_NAME=dynamo-frontend +python -m dynamo.frontend --router-mode kv & + +# Start a single vLLM worker (aggregated prefill and decode) +export OTEL_SERVICE_NAME=dynamo-worker-vllm +python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager \ +--otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" & + +wait +``` + +This runs both prefill and decode on the same GPU, providing a simpler setup for testing tracing. + +### Alternative: Disaggregated Deployment (2 GPUs) + +Run the vLLM disaggregated script with tracing enabled: + +```bash +# Navigate to vLLM launch directory +cd examples/backends/vllm/launch + +# Export tracing env vars, then run the disaggregated deployment script. +./disagg.sh +``` + +**Note:** the example vLLM `disagg.sh` sets additional per-worker port environment variables (e.g., `DYN_VLLM_KV_EVENT_PORT`, +`VLLM_NIXL_SIDE_CHANNEL_PORT`) to avoid ZMQ "Address already in use" conflicts when multiple workers run on the same host. If you run the components manually, make sure you mirror those port settings. 
+ +```bash +#!/bin/bash +set -e +trap 'echo Cleaning up...; kill 0' EXIT + +# Enable tracing +export DYN_LOGGING_JSONL=true +export OTEL_EXPORT_ENABLED=true +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317 + +# Run frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var) +export OTEL_SERVICE_NAME=dynamo-frontend +python -m dynamo.frontend --router-mode kv & + +# Run decode worker, make sure to wait for start up +export OTEL_SERVICE_NAME=dynamo-worker-decode +DYN_SYSTEM_PORT=8081 CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ + --model Qwen/Qwen3-0.6B \ + --enforce-eager \ + --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" & + +# Run prefill worker, make sure to wait for start up +export OTEL_SERVICE_NAME=dynamo-worker-prefill +DYN_SYSTEM_PORT=8082 \ +DYN_VLLM_KV_EVENT_PORT=20081 \ +VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ +CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ + --model Qwen/Qwen3-0.6B \ + --enforce-eager \ + --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" \ + --is-prefill-worker & +``` + +For disaggregated deployments, this separates prefill and decode onto different GPUs for better resource utilization. + +### 4. Generate Traces + +Send requests to the frontend to generate traces (works for both aggregated and disaggregated deployments). **Note the `x-request-id` header**, which allows you to easily search for and correlate this specific trace in Grafana: + +```bash +curl -H 'Content-Type: application/json' \ +-H 'x-request-id: test-trace-001' \ +-d '{ + "model": "Qwen/Qwen3-0.6B", + "max_completion_tokens": 100, + "messages": [ + {"role": "user", "content": "What is the capital of France?"} + ] +}' \ +http://localhost:8000/v1/chat/completions +``` + +### 5. View Traces in Grafana Tempo + +1. Open Grafana at `http://localhost:3000` +2. Login with username `dynamo` and password `dynamo` +3. Navigate to **Explore** (compass icon in the left sidebar) +4. 
Select **Tempo** as the data source (should be selected by default) +5. In the query type, select **"Search"** (not TraceQL, not Service Graph) +6. Use the **Search** tab to find traces: + - Search by **Service Name** (e.g., `dynamo-frontend`) + - Search by **Span Name** (e.g., `http-request`, `handle_payload`) + - Search by **Tags** (e.g., `x_request_id=test-trace-001`) +7. Click on a trace to view the detailed flame graph + +#### Example Trace View + +Below is an example of what a trace looks like in Grafana Tempo: + +![Trace Example](../../assets/img/trace.png) + +### 6. Stop Services + +When done, stop the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for Docker Compose commands. + +--- + +## Kubernetes Deployment + +For Kubernetes deployments, ensure you have a Tempo instance deployed and accessible (e.g., `http://tempo.observability.svc.cluster.local:4317`). + +### Modify DynamoGraphDeployment for Tracing + +Add common tracing environment variables at the top level and service-specific names in each component in your `DynamoGraphDeployment` (e.g., `examples/backends/vllm/deploy/disagg.yaml`): + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: vllm-disagg +spec: + # Common environment variables for all services + env: + - name: DYN_LOGGING_JSONL + value: "true" + - name: OTEL_EXPORT_ENABLED + value: "true" + - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + value: "http://tempo.observability.svc.cluster.local:4317" + + services: + Frontend: + # ... existing configuration ... + extraPodSpec: + mainContainer: + # ... existing configuration ... + env: + - name: OTEL_SERVICE_NAME + value: "dynamo-frontend" + + VllmDecodeWorker: + # ... existing configuration ... + extraPodSpec: + mainContainer: + # ... existing configuration ... + env: + - name: OTEL_SERVICE_NAME + value: "dynamo-worker-decode" + + VllmPrefillWorker: + # ... existing configuration ... 
+ extraPodSpec: + mainContainer: + # ... existing configuration ... + env: + - name: OTEL_SERVICE_NAME + value: "dynamo-worker-prefill" +``` + +Apply the updated DynamoGraphDeployment: + +```bash +kubectl apply -f examples/backends/vllm/deploy/disagg.yaml +``` + +Traces will now be exported to Tempo and can be viewed in Grafana. + diff --git a/fern/pages/performance/aiconfigurator.md b/fern/pages/performance/aiconfigurator.md new file mode 100644 index 00000000000..3efcb7907db --- /dev/null +++ b/fern/pages/performance/aiconfigurator.md @@ -0,0 +1,155 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Finding Best Initial Configs using AIConfigurator" +--- + +[AIConfigurator](https://github.com/ai-dynamo/aiconfigurator/tree/main) is a performance optimization tool that helps you find the optimal configuration for deploying LLMs with Dynamo. It automatically determines the best number of prefill and decode workers, parallelism settings, and deployment parameters to meet your SLA targets while maximizing throughput. + +## Why Use AIConfigurator? + +When deploying LLMs with Dynamo, you need to make several critical decisions: +- **Aggregated vs Disaggregated**: Which architecture gives better performance for your workload? +- **Worker Configuration**: How many prefill and decode workers to deploy? +- **Parallelism Settings**: What tensor/pipeline parallel configuration to use? +- **SLA Compliance**: How to meet your TTFT and TPOT targets? 
+ +AIConfigurator answers these questions in seconds, providing: +- Optimal configurations that meet your SLA requirements +- Ready-to-deploy Dynamo configuration files +- Performance comparisons between different deployment strategies +- Up to 1.7x better throughput compared to manual configuration + +## Quick Start + +```bash +# Install +pip3 install aiconfigurator + +# Find optimal configuration +aiconfigurator cli default \ + --model QWEN3_32B \ # Model name (QWEN3_32B, LLAMA3.1_70B, etc.) + --total_gpus 32 \ # Number of available GPUs + --system h200_sxm \ # GPU type (h100_sxm, h200_sxm, a100_sxm) + --isl 4000 \ # Input sequence length (tokens) + --osl 500 \ # Output sequence length (tokens) + --ttft 300 \ # Target Time To First Token (ms) + --tpot 10 \ # Target Time Per Output Token (ms) + --save_dir ./dynamo-configs + +# Deploy +kubectl apply -f ./dynamo-configs/disagg/top1/disagg/k8s_deploy.yaml +``` + +## Example Output + +```text +******************************************************************************** +* Dynamo aiconfigurator Final Results * +******************************************************************************** + ---------------------------------------------------------------------------- + Input Configuration & SLA Target: + Model: QWEN3_32B (is_moe: False) + Total GPUs: 32 + Best Experiment Chosen: disagg at 812.92 tokens/s/gpu (1.70x better) + ---------------------------------------------------------------------------- + Overall Best Configuration: + - Best Throughput: 812.92 tokens/s/gpu + - User Throughput: 120.23 tokens/s/user + - TTFT: 276.76ms + - TPOT: 8.32ms + ---------------------------------------------------------------------------- + Pareto Frontier: + QWEN3_32B Pareto Frontier: tokens/s/gpu vs tokens/s/user + ┌────────────────────────────────────────────────────────────────────────┐ +1600.0┤ •• disagg │ + │ ff agg │ + │ xx disagg best │ + │ │ +1333.3┤ f │ + │ ff │ + │ ff • │ + │ f •••••••• │ +1066.7┤ f •• │ + │ fff 
•••••••• │ + │ f •• │ + │ f •••• │ + 800.0┤ fffff •••x │ + │ fff •• │ + │ fff • │ + │ fffff •• │ + 533.3┤ ffff •• │ + │ ffff •• │ + │ fffffff ••••• │ + │ ffffff •• │ + 266.7┤ fffff ••••••••• │ + │ ffffffffff │ + │ f │ + │ │ + 0.0┤ │ + └┬─────────────────┬─────────────────┬────────────────┬─────────────────┬┘ + 0 60 120 180 240 +tokens/s/gpu tokens/s/user + +1. **Performance Comparison**: Shows disaggregated vs aggregated serving performance +2. **Optimal Configuration**: The best configuration that meets your SLA targets +3. **Deployment Files**: Ready-to-use Dynamo configuration files + +## Key Features + +### Fast Profiling Integration +```bash +# Use with Dynamo's SLA planner (20-30 seconds vs hours) +python3 -m benchmarks.profiler.profile_sla \ + --config ./examples/backends/trtllm/deploy/disagg.yaml \ + --backend trtllm \ + --use-ai-configurator \ + --aic-system h200_sxm \ + --aic-model-name QWEN3_32B +``` +``` + +### Custom Configuration +```bash +# For advanced users: define custom search space +aiconfigurator cli exp --yaml_path custom_config.yaml +``` + +## Common Use Cases + +```bash +# Strict SLAs (low latency) +aiconfigurator cli default --model QWEN2.5_7B --total_gpus 8 --system h200_sxm --ttft 100 --tpot 5 + +# High throughput (relaxed latency) +aiconfigurator cli default --model QWEN3_32B --total_gpus 32 --system h200_sxm --ttft 1000 --tpot 50 +``` + +## Supported Configurations + +**Models**: GPT, LLAMA2/3, QWEN2.5/3, Mixtral, DEEPSEEK_V3 +**GPUs**: H100, H200, A100, B200 (preview), GB200 (preview) +**Backend**: TensorRT-LLM (vLLM and SGLang coming soon) + +## Additional Options + +```bash +# Web interface +aiconfigurator webapp # Visit http://127.0.0.1:7860 + +# Docker +docker run -it --rm nvcr.io/nvidia/aiconfigurator:latest \ + aiconfigurator cli default --model LLAMA3.1_70B --total_gpus 16 --system h100_sxm +``` + +## Troubleshooting + +**Model name mismatch**: Use exact model name that matches your deployment +**GPU allocation**: Verify 
available GPUs match `--total_gpus` +**Performance variance**: Results are estimates - benchmark actual deployment + +## Learn More + +- [Dynamo Installation Guide](../kubernetes/installation-guide.md) +- [SLA Planner Quick Start Guide](../planner/sla-planner-quickstart.md) +- [Benchmarking Guide](../benchmarks/benchmarking.md) \ No newline at end of file diff --git a/fern/pages/performance/tuning.md b/fern/pages/performance/tuning.md new file mode 100644 index 00000000000..ff04104cc39 --- /dev/null +++ b/fern/pages/performance/tuning.md @@ -0,0 +1,136 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Disaggregation and Performance Tuning" +--- + +Disaggregation gains performance by separating the prefill and decode into different engines to reduce interferences between the two. +However, performant disaggregation requires careful tuning of the inference parameters. +Specifically, there are three sets of parameters that needs to be tuned: + + 1. Engine configuration and options (e.g. parallelization mapping, maximum number of tokens, etc.). + 2. Disaggregated router configuration and options. + 3. Number of prefill and decode engines. + +This guide describes the process of tuning these parameters. + + +## Engine Configuration and Tuning + +The most important engine configuration to tune is the parallelization mapping. +For most dense models, the best setting is to use TP within node and PP across nodes. +For example, for Llama-405b w8a8 on H100, TP8 on a single node or TP8PP2 on two nodes is usually the best choice. +The next thing to decide is how many numbers of GPU to serve the model. 
+Typically, the number of GPUs vs the performance follows the following pattern: + +| Number of GPUs | Performance | +| :-------------------------------------------------- | :---------------------------------------------------------------------------------------- | +| Cannot hold weights in VRAM | OOM | +| (Barely hold weights in VRAM) | (KV cache is too small to maintain large enough sequence length or reasonable batch size) | +| Minimum number with fair amount of KV cache | Best overall throughput/GPU, worst latency/user | +| Between minimum and maximum | Tradeoff between throughput/GPU and latency/user | +| Maximum number limited by communication scalability | Worst overall throughput/GPU, best latency/user | +| More than maximum | Communication overhead dominates, poor performance | + + +For decode-only engines, a larger number of GPUs sometimes leads to a larger KV cache per GPU and more decoding requests running in parallel, which leads to both better throughput/GPU and better latency/user. +For example, for Llama-3.3-70b NVFP4 quantization on B200 in vLLM with 0.9 free GPU memory fraction: + + +| TP Size | KV Cache Size (GB) | KV Cache per GPU (GB) | Per GPU Improvement over TP1 | +| ------: | -----------------: | --------------------: | ---------------------------: | +| 1 | 113 | 113 | 1.00x | +| 2 | 269 | 135 | 1.19x | +| 4 | 578 | 144 | 1.28x | + +The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency tests using [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main) and comparing the results with the SLA. +AIPerf is pre-installed in the dynamo container. + + +If you are unfamiliar with AIPerf, please see this helpful [tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorial.md) to get you started. + + +Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size. 
+For prefill engines, usually a small batch size and large `max_num_token` is preferred. +For decode engines, usually a large batch size and medium `max_num_token` is preferred. +For details on tuning the `max_num_token` and max_batch_size, see the next section. + +For block size, if the block size is too small, it leads to small memory chunks in the P->D KV cache transfer and poor performance. +Too small block size also leads to memory fragmentation in the attention calculation, but the impact is usually insignificant. +If the block size is too large, it leads to low prefix cache hit ratio. +For most dense models, we find block size 128 is a good choice. + + +## Disaggregated Router + +Disaggregated router decides whether to prefill a request in the remote prefill engine or locally in the decode engine using chunked prefill. +For most frameworks, when chunked prefill is enabled and one forward iteration gets a mixture of prefilling and decoding request, three kernels are launched: + + 1. The attention kernel for context tokens (context_fmha kernel in TRTLLM). + + 2. The attention kernel for decode tokens (xqa kernel in TRTLLM). + + 3. Dense kernel for the combined active tokens in prefills and decodes. + +### Prefill Engine + +In the prefill engine, the best strategy is to operate at the smallest batch size that saturates the GPUs so that the average time to first token (TTFT) is minimized. +For example, for Llama3.3-70b NVFP4 quantization on B200 TP1 in vLLM, the below figure shows the prefill time with different isl (prefix caching is turned off): + +![Combined bar and line chart showing "Prefill Time". Bar chart represents TTFT (Time To First Token) in milliseconds against ISL (Input Sequence Length). The line chart shows TTFT/ISL (milliseconds per token) against ISL.](../../assets/img/prefill-time.png) + +For isl less than 1000, the prefill efficiency is low because the GPU is not fully saturated. 
+For isl larger than 4000, the prefill time per token increases because the attention takes longer to compute with a longer history. + +Currently, prefill engines in Dynamo operate at a batch size of 1. +To make sure prefill engine is saturated, users can set `max-local-prefill-length` to the saturation point to make sure prefill engine is optimal. + +### Decode Engine + +In the decode engine, maximum batch size and maximum number of tokens affects the size of intermediate tensors. +With a larger batch size and number of tokens, the size of intermediate tensors increases and the size of KV cache decreases. +TensorRT-LLM (TRTLLM) has a good [summary](https://nvidia.github.io/TensorRT-LLM/reference/memory.html) on the memory footprint where similar ideas also applies to other LLM frameworks. + +With chunked prefill enabled, the maximum number of tokens controls the longest prefill that can be piggybacked to decode and control the inter-token latency (ITL). +For the same prefill requests, a large maximum number of tokens leads to fewer but longer stalls in the generation, while a small maximum number of tokens leads to more but shorter stalls in the generation. +However, chunked prefill is currently not supported in Dynamo (vLLM backend). +Hence, the current best strategy is to set the maximum batch size to the optimized KV cache size and set the maximum number of tokens to the maximum local prefill length + maximum batch size (since one decode request has one active token). + + +## Number of Prefill and Decode Engines + +The best dynamo knob choices depends on the operating condition of the model. +Based on the load, we define three operating conditions: + + 1. **Low load**: + The endpoint is hit by a single user (single-stream) most of the time. + + 2. **Medium load**: + The endpoint is hit by multiple users, but the KV cache of the decode engines is never fully utilized. + + 3. 
**High load**: + The endpoint is hit by multiple users and the requests are queued up due to no available KV cache in the decode engines. + +At low load, disaggregation would not benefit much as prefill and decode are usually computed separately. +It is usually better to use a single monolithic engine. + +At medium load, disaggregation allows better ITL compared with prefill-prioritized and chunked prefill engines and better TTFT compared with chunked prefill engine and decode-only engine for each user. +Dynamo users can adjust the number of prefill and decode engines based on TTFT and ITL SLA. + +At high load where KV cache capacity is the bottleneck, disaggregation has the following effect on the KV cache usage in the decode engines: + + * Increase the total amount of KVcache: + + * Being able to use greater TP values in decode engines leads to more KV cache per GPU and higher prefix cache hit rate. + + * When the requests is prefilled remotely, the decode engine does not need to maintain its KV cache (currently not implemented in Dynamo). + + * Lower ITL reduces the decode time and allow the same amount of KV cache to serve more requests. + + * Decrease the total amount of KV cache: + + * Some GPUs are configured as prefill engines whose KV cache is not used in the decode phase. + +Since Dynamo currently allocates the KV blocks immediately when the decode engine get the requests, +it is advisable to use as few prefill engines as possible (even no prefill engine) to maximize the available KV cache in decode engines. +To prevent queueing at prefill engines, users can set a large `max-local-prefill-length` and piggyback more prefill requests at decode engines. diff --git a/fern/pages/planner/load-planner.md b/fern/pages/planner/load-planner.md new file mode 100644 index 00000000000..c88fc15c179 --- /dev/null +++ b/fern/pages/planner/load-planner.md @@ -0,0 +1,63 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Load-based Planner" +--- + +This document covers the load-based planner in `examples/llm/components/planner.py`. + + +Load-based planner is inoperable as vllm, sglang, and trtllm examples all do not use prefill queues. Please use SLA planner for now. + + + +Bare metal deployment with local connector is deprecated. The only option to deploy load-based planner is via k8s. We will update the examples in this document soon. + + +## Load-based Scaling Up/Down Prefill/Decode Workers + +To adjust the number of prefill/decode workers, planner monitors the following metrics: +* Prefill worker: planner monitors the number of requests pending in the prefill queue to estimate the prefill workload. +* Decode/aggregated worker: planner monitors the average KV cache utilization rate to estimate the decode/aggregated workload. + +Every `metric-pulling-interval`, planner gathers the aforementioned metrics. Every `adjustment-interval`, planner compares the aggregated metrics in this interval with pre-set thresholds and decides whether to scale up/down prefill/decode workers. To avoid over-compensation, planner only changes the number of workers by 1 in one adjustment interval. In addition, when the number of workers is being adjusted, the planner blocks the metric pulling and adjustment. + +To scale up a prefill/decode worker, planner just needs to launch the worker in the correct namespace. The auto-discovery mechanism picks up the workers and adds them to the routers. To scale down a prefill worker, planner sends a SIGTERM signal to the prefill worker. The prefill worker stores the signal and exits when it finishes the current request pulled from the prefill queue. This ensures that no remote prefill request is dropped. To scale down a decode worker, planner revokes the etcd lease of the decode worker. 
When the etcd lease is revoked, the corresponding decode worker is immediately removed from the router and won't get any new requests. The decode worker then finishes all the current requests in their original stream and exits gracefully. + +There are two additional rules set by planner to prevent over-compensation: +1. After a new decode worker is added, since it needs time to populate the kv cache, planner doesn't scale down the number of decode workers in the next `NEW_DECODE_WORKER_GRACE_PERIOD=3` adjustment intervals. +1. We do not scale up prefill worker if the prefill queue size is estimated to reduce below the `--prefill-queue-scale-up-threshold` within the next `NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD=3` adjustment intervals following the trend observed in the current adjustment interval. + +## SLA-based Scaling Up/Down Prefill/Decode Workers + +See [SLA-Driven Profiling](../benchmarks/sla-driven-profiling.md) for more details. + +## Usage + +The planner integration with the new frontend + worker architecture is currently a work in progress. This documentation will be updated with the new deployment patterns and code examples once the planner component has been fully adapted to the new workflow. 
+ +Configuration options: +* `namespace` (str, default: "dynamo"): Target namespace for planner operations +* `environment` (str, default: "local"): Target environment (local, kubernetes) +* `no-operation` (bool, default: false): Run in observation mode only +* `log-dir` (str, default: None): Tensorboard log directory +* `adjustment-interval` (int, default: 30): Seconds between adjustments +* `metric-pulling-interval` (int, default: 1): Seconds between metric pulls +* `max-gpu-budget` (int, default: 8): Maximum GPUs for all workers +* `min-gpu-budget` (int, default: 1): Minimum GPUs per worker type +* `decode-kv-scale-up-threshold` (float, default: 0.9): KV cache threshold for scale-up +* `decode-kv-scale-down-threshold` (float, default: 0.5): KV cache threshold for scale-down +* `prefill-queue-scale-up-threshold` (float, default: 0.5): Queue threshold for scale-up +* `prefill-queue-scale-down-threshold` (float, default: 0.2): Queue threshold for scale-down +* `decode-engine-num-gpu` (int, default: 1): GPUs per decode engine +* `prefill-engine-num-gpu` (int, default: 1): GPUs per prefill engine + +Run as standalone process: +```bash +PYTHONPATH=/workspace/examples/llm python components/planner.py --namespace=dynamo --served-model-name=vllm --no-operation --log-dir=log/planner +``` + +Monitor metrics with Tensorboard: +```bash +tensorboard --logdir= +``` diff --git a/fern/pages/planner/planner-intro.md b/fern/pages/planner/planner-intro.md new file mode 100644 index 00000000000..4a9b2fdeae9 --- /dev/null +++ b/fern/pages/planner/planner-intro.md @@ -0,0 +1,24 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Planner" +--- + +The planner monitors the state of the system and adjusts workers to +ensure that the system runs efficiently. 
+ +Currently, the planner can scale the number of vllm workers up and down +based on the kv cache load and prefill queue size: + +Key features include: + +- **SLA-based scaling** that uses predictive modeling and performance + interpolation to proactively meet TTFT and ITL targets +- **Graceful scaling** that ensures no requests are dropped during + scale-down operations + + +**New to SLA Planner?** Start with the [SLA Planner Quick Start Guide](sla-planner-quickstart.md) for a complete, step-by-step workflow. + +**Prerequisites**: SLA-based planner requires pre-deployment profiling (2-4 hours on real silicon or a few minutes using simulator) before deployment. The Quick Start guide includes everything you need. + diff --git a/fern/pages/planner/sla-planner-quickstart.md b/fern/pages/planner/sla-planner-quickstart.md new file mode 100644 index 00000000000..cca2e216e51 --- /dev/null +++ b/fern/pages/planner/sla-planner-quickstart.md @@ -0,0 +1,554 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "SLA-Driven Profiling and Planner Deployment Quick Start Guide" +--- + +Complete workflow to deploy SLA-optimized Dynamo models using DynamoGraphDeploymentRequests (DGDR). This guide shows how to automatically profile models and deploy them with optimal configurations that meet your Service Level Agreements (SLAs). + + +**Prerequisites**: This guide assumes you have a Kubernetes cluster with GPU nodes and have completed the [Dynamo Platform installation](../kubernetes/installation-guide.md). + + +## Overview + +The DGDR workflow automates the entire process from SLA specification to deployment: + +1. **Define SLAs**: Specify performance requirements (TTFT, ITL) and model information in a DGDR Custom Resource +2. **Automatic Profiling**: The Dynamo Operator automatically profiles your model to find optimal configurations +3. 
**Auto-Deploy**: The system automatically deploys the optimal configuration that meets your SLAs + +```mermaid +flowchart TD + A[Create DGDR] --> B[DGDR Controller] + B --> C{Profiling Method} + C -->|Online| D[Run Profiling Job
2-4 hours] + C -->|Offline/AIC| E[AI Configurator
20-30 seconds] + D --> F[Generate DGD Config] + E --> F + F --> G[Auto-Deploy DGD] + G --> H[Monitor & Scale] + + style A fill:#e1f5fe + style D fill:#fff3e0 + style E fill:#e8f5e8 + style G fill:#f3e5f5 + style H fill:#fff8e1 +``` + +## What is a DynamoGraphDeploymentRequest (DGDR)? + +A **DynamoGraphDeploymentRequest (DGDR)** is a Kubernetes Custom Resource that serves as the primary interface for users to request model deployments with specific performance and resource constraints. Think of it as a "deployment order" where you specify: + +- **What** model you want to deploy (`model`) +- **How** it should perform (SLA targets: `ttft`, `itl`) +- **Where** it should run (optional GPU preferences) +- **Which** backend to use (`backend`: vllm, sglang, or trtllm) +- **Which** images to use (`profilingConfig.profilerImage`, `deploymentOverrides.workersImage`) + +The Dynamo Operator watches for DGDRs and automatically: +1. Discovers available GPU resources in your cluster +2. Runs profiling (online or offline) to find optimal configurations +3. Generates an optimized DynamoGraphDeployment (DGD) configuration +4. 
Deploys the DGD to your cluster + +**Key Benefits:** +- **Declarative**: Specify what you want, not how to achieve it +- **Automated**: No manual profiling job setup or result processing +- **SLA-Driven**: Ensures deployments meet your performance requirements +- **Integrated**: Works seamlessly with the Dynamo Operator + +## Prerequisites + +Before creating a DGDR, ensure: +- **Dynamo platform installed** with the operator running (see [Installation Guide](../kubernetes/installation-guide.md)) +- **[kube-prometheus-stack](../kubernetes/observability/metrics.md) installed and running** (required for SLA planner) +- **Image pull secrets configured** if using private registries (typically `nvcr-imagepullsecret` for NVIDIA images) +- **Sufficient GPU resources** available in your cluster for profiling +- **Runtime images available** that contain both profiler and runtime components + +### Container Images + +Each DGDR requires you to specify container images for the profiling and deployment process: + +**profilingConfig.profilerImage** (Required): +Specifies the container image used for the profiling job itself. This image must contain the profiler code and dependencies needed for SLA-based profiling. + +**deploymentOverrides.workersImage** (Optional): +Specifies the container image used for DynamoGraphDeployment worker components (frontend, workers, planner). This image is used for: +- Temporary DGDs created during online profiling (for performance measurements) +- The final DGD deployed after profiling completes + +If `workersImage` is omitted, the image from the base config file (e.g., `disagg.yaml`) is used. You may use our public images (0.6.1 and later) or build and push your own. 
+ +```yaml +spec: + profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" + deploymentOverrides: + workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Optional +``` + +## Quick Start: Deploy with DGDR + +### Step 1: Create Your DGDR + +Dynamo provides sample DGDR configurations in `benchmarks/profiler/deploy/`. You can use these as starting points: + +**Available Sample DGDRs:** +- **`profile_sla_dgdr.yaml`**: Standard online profiling for dense models +- **`profile_sla_aic_dgdr.yaml`**: Fast offline profiling using AI Configurator (TensorRT-LLM) +- **`profile_sla_moe_dgdr.yaml`**: Online profiling for MoE models (SGLang) + +Or, you can create your own DGDR for your own needs: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeploymentRequest +metadata: + name: my-model-deployment # Change the name + namespace: default # Change the namespace +spec: + model: "Qwen/Qwen3-0.6B" # Update to your model + backend: vllm # Backend: vllm, sglang, or trtllm + + profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Required + config: + sla: + isl: 3000 # Adjust to your workload + osl: 150 # Adjust to your workload + ttft: 200 # Your target (ms) + itl: 20 # Your target (ms) + + sweep: + use_ai_configurator: false # Set to true for fast profiling (TensorRT-LLM only) + + deploymentOverrides: + workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Optional + + autoApply: true # Auto-deploy after profiling +``` + + +For detailed explanations of all configuration options (SLA, hardware, sweep, AIC, planner), see the [DGDR Configuration Reference](../benchmarks/sla-driven-profiling.md#dgdr-configuration-reference). + + +### Step 2: Apply the DGDR + +The rest of this quickstart will use the DGDR sample that uses AIC profiling. If you use a different DGDR file and/or name, be sure to adjust the commands accordingly. 
+ +```bash +export NAMESPACE=your-namespace +kubectl apply -f benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml -n $NAMESPACE +``` + +The Dynamo Operator will immediately begin processing your request. + +### Step 3: Monitor Progress + +Watch the DGDR status: + +```bash +# View status +kubectl get dgdr -n $NAMESPACE + +# Detailed status +kubectl describe dgdr sla-aic -n $NAMESPACE + +# Watch profiling job logs +kubectl logs -f job/profile-sla-aic -n $NAMESPACE +``` + +**DGDR Status States:** +- `Pending`: Initial state, preparing to profile +- `Profiling`: Running profiling job (20-30 seconds for AIC, 2-4 hours for online) +- `Deploying`: Generating and applying DGD configuration +- `Ready`: DGD successfully deployed and running +- `Failed`: Error occurred (check events for details) + + +With AI Configurator, profiling completes in **20-30 seconds**! This is much faster than online profiling which takes 2-4 hours. + + +### Step 4: Access Your Deployment + +Once the DGDR reaches `Ready` state, your model is deployed and ready to serve: + +```bash +# Find the frontend service +kubectl get svc -n $NAMESPACE | grep trtllm-disagg + +# Port-forward to access locally +kubectl port-forward svc/trtllm-disagg-frontend 8000:8000 -n $NAMESPACE + +# Test the endpoint +curl http://localhost:8000/v1/models +``` + +### Step 5 (Optional): Access the Planner Grafana Dashboard + +If you want to monitor the SLA Planner's decision-making in real-time, you can deploy the Planner Grafana dashboard. + +```bash +kubectl apply -n monitoring -f deploy/observability/k8s/grafana-planner-dashboard-configmap.yaml +``` + +Follow the instructions in [Dynamo Metrics Collection on Kubernetes](../kubernetes/observability/metrics.md) to access the Grafana UI and select the **Dynamo Planner Dashboard**. 
+ +The dashboard displays: +- **Worker Counts & GPU Usage**: Current prefill/decode worker counts and cumulative GPU hours +- **Observed Metrics**: Real-time TTFT, ITL, request rate, and sequence lengths from Prometheus +- **Predicted Metrics**: Planner's load predictions and recommended replica counts +- **Correction Factors**: How the planner adjusts predictions based on observed vs expected performance + + +Use the **Namespace** dropdown at the top of the dashboard to filter metrics for your specific deployment namespace. + + +## DGDR Configuration Details + +### Required Fields + +| Field | Type | Description | +|-------|------|-------------| +| `spec.model` | string | Model identifier (e.g., "meta-llama/Llama-3-70b") | +| `spec.backend` | enum | Inference backend: `vllm`, `sglang`, or `trtllm` | +| `spec.profilingConfig.profilerImage` | string | Container image for profiling job | +| `spec.profilingConfig.config.sla` | object | SLA targets (isl, osl, ttft, itl) | + +### Optional Fields + +| Field | Type | Description | +|-------|------|-------------| +| `spec.deploymentOverrides.workersImage` | string | Container image for DGD worker components. If omitted, uses image from base config file. 
| +| `spec.autoApply` | boolean | Automatically deploy DGD after profiling (default: false) | +| `spec.deploymentOverrides` | object | Customize metadata (name, namespace, labels, annotations) and image for auto-created DGD | + +### SLA Configuration + +The `sla` section defines performance requirements and workload characteristics: + +```yaml +sla: + isl: 3000 # Average input sequence length (tokens) + osl: 150 # Average output sequence length (tokens) + ttft: 200 # Target Time To First Token (milliseconds, float) + itl: 20 # Target Inter-Token Latency (milliseconds, float) +``` + +**Choosing SLA Values:** +- **ISL/OSL**: Based on your expected traffic patterns +- **TTFT**: First token latency target (lower = more GPUs needed) +- **ITL**: Token generation latency target (lower = more GPUs needed) +- **Trade-offs**: Tighter SLAs require more GPU resources + +### Profiling Methods + +Choose between **online profiling** (real measurements, 2-4 hours) or **offline profiling** with AI Configurator (estimated, 20-30 seconds): + +```yaml +# Online Profiling (Default) +sweep: + use_ai_configurator: false + +# Offline Profiling (AI Configurator - TensorRT-LLM only) +sweep: + use_ai_configurator: true + aic_system: h200_sxm + aic_hf_id: Qwen/Qwen3-32B + aic_backend_version: "0.20.0" +``` + + +For detailed comparison, supported configurations, and limitations, see [SLA-Driven Profiling Documentation](../benchmarks/sla-driven-profiling.md#profiling-method). + + +### Hardware Configuration + +For details on hardware configuration and GPU discovery options, see [Hardware Configuration in SLA-Driven Profiling](../benchmarks/sla-driven-profiling.md#hardware-configuration). 
+ +### Advanced Configuration + +#### Using Existing DGD Configs (Recommended for Custom Setups) + +If you have an existing DynamoGraphDeployment config (e.g., from `examples/backends/*/deploy/disagg.yaml` or custom recipes), you can reference it via ConfigMap: + +**Step 1: Create ConfigMap from your DGD config file:** + +```bash +kubectl create configmap deepseek-r1-config \ + --from-file=disagg.yaml=/path/to/your/disagg.yaml \ + --namespace $NAMESPACE \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +**Step 2: Reference the ConfigMap in your DGDR:** + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeploymentRequest +metadata: + name: deepseek-r1 +spec: + model: deepseek-ai/DeepSeek-R1 + backend: sglang + + profilingConfig: + profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1" + configMapRef: + name: deepseek-r1-config + key: disagg.yaml # Must match the key used in --from-file + config: + sla: + isl: 4000 + osl: 500 + ttft: 300 + itl: 10 + sweep: + use_ai_configurator: true + aic: + system: h200_sxm + model_name: DEEPSEEK_V3 + backend_version: "0.20.0" + + deploymentOverrides: + workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1" + + autoApply: true +``` + +> **What's happening**: The profiler uses the DGD config from the ConfigMap as a **base template**, then optimizes it based on your SLA targets. The controller automatically injects `spec.model` into `deployment.model` and `spec.backend` into `engine.backend` in the final configuration. + +#### Inline Configuration (Simple Use Cases) + +For simple use cases without a custom DGD config, provide profiler configuration directly. 
The profiler will auto-generate a basic DGD configuration from your `model` and `backend`: + +```yaml +profilingConfig: + config: + # SLA targets (required for profiling) + sla: + isl: 8000 # Input sequence length + osl: 200 # Output sequence length + ttft: 200.0 # Time To First Token (ms) + itl: 10.0 # Inter-Token Latency (ms) + + # Hardware constraints (optional) + hardware: + min_num_gpus_per_engine: 2 + max_num_gpus_per_engine: 8 + gpu_type: h200_sxm + + # Profiling sweep settings (optional) + sweep: + prefill_interpolation_granularity: 16 # Number of samples for prefill ISL sweep + decode_interpolation_granularity: 6 # Number of samples for decode sweep +``` + +> **Note**: `engine.config` is a **file path** to a DGD YAML file, not inline configuration. Use ConfigMapRef (recommended) or leave it unset to auto-generate. + +#### Planner Configuration Passthrough +Add planner-specific settings. Planner arguments use a `planner_` prefix: + +```yaml +profilingConfig: + config: + planner: + planner_min_endpoint: 2 +``` + +## Understanding Profiling Results + +For details about the profiling process, performance plots, and interpolation data, see [SLA-Driven Profiling Documentation](../benchmarks/sla-driven-profiling.md). + +## Advanced Topics + +### Mocker Deployment + +Instead of a real DGD that uses GPU resources, you can deploy a mocker deployment that uses simulated engines rather than GPUs. Mocker is available in all backend images and uses profiling data to simulate realistic GPU timing behavior. It is useful for: +- Large-scale experiments without GPU resources +- Testing Planner behavior and infrastructure +- Validating deployment configurations + +To deploy mocker instead of the real backend, set `useMocker: true`: + +```yaml +spec: + model: + backend: trtllm # Real backend for profiling (vllm, sglang, or trtllm) + useMocker: true # Deploy mocker instead of real backend + + profilingConfig: + profilerImage: "nvcr.io/nvidia/dynamo/trtllm-runtime:" + ... 
+ autoApply: true +``` + +Profiling still runs against the real backend (via GPUs or AIC) to collect performance data. The mocker deployment then uses this data to simulate realistic timing behavior. + +### DGDR Immutability + +DGDRs are **immutable** - if you need to update SLAs or configuration: + +1. Delete the existing DGDR: `kubectl delete dgdr sla-aic` +2. Create a new DGDR with updated specifications + +### Manual Deployment Control + +There are two ways to manually control deployment after profiling: + +#### Option 1: Use DGDR-Generated Configuration (Recommended) + +Disable auto-deployment to review the generated DGD before applying: + +```yaml +spec: + autoApply: false +``` + +Then manually extract and apply the generated DGD: + +```bash +# Extract generated DGD from DGDR status +kubectl get dgdr sla-aic -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' | kubectl apply -f - + +# Or save to file first for review/modification +kubectl get dgdr sla-aic -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' > my-dgd.yaml + +vi my-dgd.yaml +kubectl apply -f my-dgd.yaml -n $NAMESPACE +``` + +The generated DGD includes optimized configurations and the SLA planner component. The required `planner-profile-data` ConfigMap is automatically created when profiling completes, so the DGD will deploy successfully. 
+ +#### Option 2: Use Standalone Planner Templates (Advanced) + +For advanced use cases, you can manually deploy using the standalone planner templates in `examples/backends/*/deploy/disagg_planner.yaml`: + +```bash +# After profiling completes, profiling data is automatically stored in ConfigMaps + +# OPTIONAL: Inspect profiling results stored in ConfigMaps +# View the generated DGD configuration +kubectl get configmap dgdr-output- -n $NAMESPACE -o yaml + +# View the planner profiling data (JSON format) +kubectl get configmap planner-profile-data -n $NAMESPACE -o yaml + +# Update the PROMETHEUS_ENDPOINT environment variable in the planner template +# to match your cluster's Prometheus service location (see comments in the template) + +# Update backend planner manifest as needed, then deploy +kubectl apply -f examples/backends//deploy/disagg_planner.yaml -n $NAMESPACE +``` + +> **Note**: The standalone templates are provided as examples and may need customization for your model and requirements. The DGDR-generated configuration (Option 1) is recommended as it's automatically tuned to your profiling results and SLA targets. +> +> **Important - Prometheus Configuration**: The planner queries Prometheus to get frontend request metrics for scaling decisions. If you see errors like "Failed to resolve prometheus service", ensure the `PROMETHEUS_ENDPOINT` environment variable in your planner configuration correctly points to your Prometheus service. See the comments in the example templates for details. 
+ +### Relationship to DynamoGraphDeployment (DGD) + +- **DGDR**: High-level "intent" - what you want deployed +- **DGD**: Low-level "implementation" - how it's deployed + +The DGDR controller generates a DGD that: +- Uses optimal TP configurations from profiling +- Includes SLA planner for autoscaling +- Has deployment and engine settings tuned for your SLAs + +The generated DGD is tracked via labels: +```yaml +metadata: + labels: + dgdr.nvidia.com/name: sla-aic + dgdr.nvidia.com/namespace: your-namespace +``` + +### Accessing Detailed Profiling Artifacts + +By default, profiling jobs save essential data to ConfigMaps for planner integration. For advanced users who need access to detailed artifacts (logs, performance plots, AIPerf results, etc), configure the DGDR to use `dynamo-pvc`. This is optional and will not affect the functionality of profiler or Planner. + +**What's available in ConfigMaps (always created):** +- Generated DGD configuration +- Profiling data for Planner (`.json` files) + +**What's available in PVC if attached to DGDR (optional):** +- Performance plots (PNGs) +- DGD configuration and logs of all services for each profiled deployment +- AIPerf profiling artifacts for each AIPerf run +- Raw profiling data (`.npz` files) +- Profiler log + +**Setup:** + +1. Set up the benchmarking PVC: +```bash +export NAMESPACE=your-namespace +deploy/utils/setup_benchmarking_resources.sh +``` + +2. Add `outputPVC` to your DGDR's `profilingConfig`: +```yaml +spec: + profilingConfig: + outputPVC: "dynamo-pvc" + config: + # ... rest of config +``` + +3. 
After profiling completes, access results: +```bash +kubectl apply -f deploy/utils/manifests/pvc-access-pod.yaml -n $NAMESPACE +kubectl wait --for=condition=Ready pod/pvc-access-pod -n $NAMESPACE --timeout=60s +kubectl cp $NAMESPACE/pvc-access-pod:/data ./profiling-results +kubectl delete pod pvc-access-pod -n $NAMESPACE +``` + +## Troubleshooting + +### Quick Diagnostics + +```bash +# Check DGDR status and events +kubectl describe dgdr sla-aic -n $NAMESPACE + +# Check operator logs +kubectl logs -n $NAMESPACE -l app.kubernetes.io/name=dynamo-operator --tail=100 + +# Check profiling job logs +kubectl logs -l job-name=profile-sla-aic -n $NAMESPACE +``` + +### Common Issues + +| Issue | Quick Fix | +|-------|-----------| +| **DGDR stuck in Pending** | Check GPU availability: `kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'` | +| **Image pull errors** | Verify secret exists: `kubectl get secret nvcr-imagepullsecret -n $NAMESPACE` | +| **Profiling fails** | Check job logs: `kubectl logs -l job-name=profile-sla-aic -n $NAMESPACE` | +| **SLA cannot be met** | Relax TTFT/ITL targets or add more GPUs | +| **DGD not deployed** | Verify `autoApply: true` in DGDR spec | + + +For comprehensive troubleshooting including AI Configurator constraints, performance debugging, and backend-specific issues, see [SLA-Driven Profiling Troubleshooting](../benchmarks/sla-driven-profiling.md#troubleshooting). + + +## Configuration Reference + +For comprehensive documentation of all DGDR configuration options, see the [DGDR Configuration Reference](../benchmarks/sla-driven-profiling.md#dgdr-configuration-reference). 
+ +This includes detailed explanations of: +- **SLA Configuration**: ISL, OSL, TTFT, ITL with use cases and trade-offs +- **Hardware Configuration**: GPU constraints and search space control +- **Sweep Configuration**: Profiling behavior and interpolation settings +- **AI Configurator Configuration**: System types, model mappings, backend versions +- **Planner Configuration**: Autoscaling and adjustment parameters +- **Complete Examples**: Full DGDRs for online, offline (AIC), and MoE profiling + +## Related Documentation + +- [DGDR API Reference](../kubernetes/api-reference.md) +- [Pre-Deployment Profiling Details](../benchmarks/sla-driven-profiling.md) +- [SLA Planner Architecture](sla-planner.md) +- [Dynamo Operator Guide](../kubernetes/dynamo-operator.md) diff --git a/fern/pages/planner/sla-planner.md b/fern/pages/planner/sla-planner.md new file mode 100644 index 00000000000..61b6336a7df --- /dev/null +++ b/fern/pages/planner/sla-planner.md @@ -0,0 +1,190 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "SLA-based Planner" +--- + + +**New to SLA Planner?** For a complete workflow including profiling and deployment, see the [SLA Profiling + Planner Quick Start Guide](sla-planner-quickstart.md). + + +This document covers information regarding the SLA-based planner in `examples/common/utils/planner_core.py`. + +The SLA (Service Level Agreement)-based planner is an intelligent autoscaling system that monitors system performance and adjusts the number of prefill and decode workers to meet specified TTFT and ITL targets. Unlike the load-based planner that scales based on resource utilization thresholds, the SLA planner uses predictive modeling and performance interpolation to proactively scale the workers. + + +Currently, SLA-based planner only supports disaggregated setup. + + + +Bare metal deployment with local connector is deprecated. 
Please deploy the SLA planner in k8s. + + +## Architecture Overview + +**Components:** +- **Frontend**: Serves requests and exposes `/metrics` +- **Prometheus**: Scrapes frontend metrics every 5s (by default, can be updated in the podmonitor manifest) +- **Planner**: Queries Prometheus and adjusts worker scaling every adjustment interval +- **Workers**: prefill and backend workers handle inference + +The adjustment interval can be defined in the planner manifest as an argument. The default interval value can be found in this [file](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/planner/defaults.py). + +```mermaid +flowchart LR + Frontend --"/metrics"--> Prometheus + Planner --"query API"--> Prometheus + Planner --"scaling decisions"--> Workers + Frontend -.->|"requests"| Workers +``` + +## Features + +* **SLA-driven scaling**: Automatically scales prefill/decode workers to meet TTFT and ITL targets +* **Predictive load forecasting**: Uses ARIMA, Prophet, or constant predictors to forecast future load +* **Performance interpolation**: Leverages profiling results data from pre-deployment profiling for accurate scaling decisions +* **Correction factors**: Adapts to real-world performance deviations from profiled data + +## Design + +The SLA planner consists of several key components: + +1. **Load Predictors**: Forecast future request patterns (number of requests, input/output sequence lengths) +2. **Performance Interpolators**: Estimate TTFT and ITL based on profiled performance data +3. **Correction Factors**: Adjust predictions based on observed vs. expected performance +4. **Scaling Logic**: Calculate optimal number of prefill/decode replicas to meet SLA targets + +## SLA-Driven Pre-Deployment Profiling + +**Prerequisite**: SLA-based planner requires pre-deployment profiling to be completed before deployment. 
The profiling process analyzes your model's performance characteristics to determine optimal tensor parallelism configurations and scaling parameters that the planner will use during operation. + +See [Pre-Deployment Profiling](../benchmarks/sla-driven-profiling.md) for detailed instructions on running the profiling process. + +## Load Prediction + +The SLA planner uses a load predictor to predict the number of requests, ISL, and OSL in the next adjustment interval. Currently, three load prediction models are supported: + +### Constant Predictor +- **Use case**: Stable and long prediction interval +- **Behavior**: Assumes next load equals current load +- **Configuration**: `load-predictor: "constant"` + +### ARIMA Predictor +- **Use case**: Time-series data with trends and seasonality +- **Behavior**: Uses auto-ARIMA to fit optimal model parameters +- **Configuration**: `load-predictor: "arima"` + +### Prophet Predictor +- **Use case**: Complex seasonal patterns and trend changes +- **Behavior**: Facebook's [Prophet](https://facebook.github.io/prophet/) model for time-series forecasting +- **Configuration**: `load-predictor: "prophet"` + +## Scaling Algorithm + +SLA planner uses a sophisticated scaling algorithm. At each adjustment interval, SLA planner performs the following operations: + +### 1. Metric Collection +Every adjustment interval, collect: +- Average Time to First Token (TTFT) +- Average Inter-Token Latency (ITL) +- Request count and duration +- Input/Output sequence lengths + +### 2. Correction Factor Calculation +Using the collected metrics, SLA planner applies the interpolator to find out the expected TTFT/ITL and calibrate the interpolation model. This step is important because the actual TTFT/ITL can often be different from the ideal values: +- **TTFT**: actual TTFT heavily depends on request queueing and prefix cache hit rate (if KV reuse is enabled).
For example, if all requests arrive at the beginning of the adjustment interval, they queue heavily and TTFT will be significantly higher. If prefix cache hit rate is very high, the actual number of tokens in the prefill will be very low and TTFT will be significantly lower. +- **ITL**: actual ITL may be affected by chunked small prefill requests in the decode engine. +- **Metric variances**: large variances in request rate, ISL, and OSL may lead to inaccurate estimation of the TTFT/ITL since the SLA planner only considers the average when interpolating. + +SLA planner calculates the correction factor with +- **Prefill correction**: `actual_ttft / expected_ttft` +- **Decode correction**: `actual_itl / expected_itl` + +### 3. Load Prediction +SLA planner forecasts these metrics in the next interval using the load predictor +- Number of requests +- Input sequence length +- Output sequence length + +### 4. Calculating Number of Replicas + +**Prefill replicas**: SLA planner assumes the prefill correction factor has a linear effect on the prefill throughput per GPU as prefill is single-batched. +``` +predicted_load = next_requests * next_isl / interval * min(1, prefill_correction) +prefill_replicas = ceil(predicted_load / interpolated_throughput / gpus_per_engine) +``` + +**Decode replicas**: +``` +# 1. apply d_correction_factor to the ITL SLA +corrected_itl = self.args.itl / self.d_correction_factor +# 2. reversely find out what is best throughput/gpu that can achieve corrected_itl under the predicted context length
pred_decode_thpt_per_gpu = self.decode_interpolator.find_best_throughput_per_gpu( + itl=corrected_itl, + context_length=next_isl + next_osl / 2 +) +# 3. compute number of decode replicas needed +next_num_d = math.ceil(next_num_req * next_osl / self.args.adjustment_interval / pred_decode_thpt_per_gpu / self.args.decode_engine_num_gpu) +``` + +### 5.
Scaling + +Finally, SLA planner applies the change by scaling up/down the number of prefill and decode workers to the calculated number of replica in the next interval. + + +SLA-planner scales up/down the P/D engines non-blockingly. If `adjustment-interval` is too short, the previous scaling operations may not finish before the new scaling operations are issued. Make sure to set a large enough `adjustment-interval`. + + +## Deploying + +For complete deployment instructions, see the [SLA Planner Quick Start Guide](sla-planner-quickstart.md). + + +The SLA planner requires a frontend that reports metrics at the `/metrics` HTTP endpoint with the number of requests, ISL, OSL, TTFT, and ITL in the correct format. The dynamo frontend provides these metrics automatically. + + +### Virtual Deployment + +The SLA planner supports virtual deployment mode for customized environments (e.g., customized cluster) through the `VirtualConnector`. This connector enables the planner to communicate scaling decisions without directly managing the deployment infrastructure. + +The `VirtualConnector` acts as a bridge between the SLA planner and external deployment environments. Instead of directly scaling Kubernetes resources, it writes scaling decisions and waits for the deployment environment to acknowledge completion. + +#### Scaling Decision Flow + +1. **Decision Generation**: The planner calculates optimal worker counts +2. **Change Detection**: The planner skips scaling if the target counts match current counts, logging: `"No scaling needed (prefill=X, decode=Y)"` +3. **Readiness Check**: Before making new decisions, the planner verifies that previous scaling operations have completed by checking if `scaled_decision_id >= decision_id` +4. **Timeout Handling**: If a scaling decision isn't acknowledged within 30 minutes (1800 seconds), the planner proceeds with new decisions anyway +5. 
**Completion Tracking**: The planner can optionally wait for scaling completion confirmation (blocking mode) + +#### Configuration + +To use virtual deployment mode: + +```yaml +environment: "virtual" +backend: "vllm" # or "sglang" +``` + +#### Deployment Environment Requirements + +The external deployment environment must use `VirtualConnectorClient`: + +``` +from dynamo._core import DistributedRuntime, VirtualConnectorClient + +client = VirtualConnectorClient(distributed_runtime, namespace) +``` + +1. **Monitor Planner**: Continuously watch for scaling decisions: `await client.wait()`. This blocks until there is a change. +2. **Parse Decisions**: Read `num_prefill_workers` and `num_decode_workers` values: `decision = await client.get()` +3. **Execute Scaling**: Apply the scaling decisions to the actual deployment infrastructure +4. **Acknowledge Completion**: Mark the decision completed when scaling is finished: `await client.complete(decision)` + +A scaling decision (returned by `client.get()`) contains the following fields, which are -1 if not set yet: +- `num_prefill_workers`: Integer specifying the target number of prefill workers +- `num_decode_workers`: Integer specifying the target number of decode workers +- `decision_id`: Integer with incremental ID for each scaling decision + +See `components/planner/test/test_virtual_connector.py` for a full example. + diff --git a/fern/pages/reference/cli.md b/fern/pages/reference/cli.md new file mode 100644 index 00000000000..f23a0927323 --- /dev/null +++ b/fern/pages/reference/cli.md @@ -0,0 +1,412 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Run" +--- + +`dynamo-run` is a Rust binary that lets you easily run a model, explore the Dynamo components, and demonstrates the Rust API. It supports the `mistral.rs` engines, as well as testing engines `echo` and `mocker`. 
+ +It is primarily for development and rapid prototyping. For production use we recommend the Python wrapped components, see the main project README. + +## Basics + +Usage: See `dynamo-run --help` + +Example: `dynamo-run Qwen/Qwen3-0.6B` + +Set the environment variable `DYN_LOG` to adjust the logging level; for example, `export DYN_LOG=debug`. It has the same syntax as `RUST_LOG`. + +To adjust verbosity, use `-v` to enable debug logging or `-vv` to enable full trace logging. For example: + +```bash +dynamo-run in=http out=mistralrs -v # enables debug logging +``` + +### Use model from Hugging Face + +To automatically download Qwen3 4B from Hugging Face (16 GiB download) and to start it in interactive text mode: +``` +dynamo-run Qwen/Qwen3-4B +``` + +The general format for HF download follows this pattern: +``` +dynamo-run out= +``` + +For gated models (such as meta-llama/Llama-3.2-3B-Instruct), you must set an `HF_TOKEN` environment variable. + +The parameter can be the ID of a HuggingFace repository (which will be downloaded) or a folder containing safetensors, config.json, or similar (perhaps a locally checked out HuggingFace repository). + +### Run a model from local file + +To run a model from local file: +- Download the model from Hugging Face +- Run the model from local file + +See the following sections for details. + +#### Download model from Hugging Face +This model available from Hugging Face should be high quality and fast on almost any machine: https://huggingface.co/Qwen/Qwen3-0.6B + +To run the model: + +*Text interface* +``` +dynamo-run Qwen/Qwen3-0.6B +``` + +You can also pipe a prompt into `dynamo-run`: +``` +echo 'What is the capital of Tuvalu?' 
| dynamo-run Qwen/Qwen3-0.6B --context-length 4096 +``` + +*HTTP interface* +``` +dynamo-run in=http out=mistralrs Qwen/Qwen3-0.6B +``` +You can also list models or send a request: + +*List the models* +``` +curl localhost:8080/v1/models +``` + +*Send a request* +``` +curl -d '{"model": "Qwen/Qwen3-0.6B", "max_completion_tokens": 2049, "messages":[{"role":"user", "content": "What is the capital of South Africa?" }]}' -H 'Content-Type: application/json' http://localhost:8080/v1/chat/completions +``` + +## Distributed System + +You can run the ingress side (HTTP server and pre-processing) on one machine, for example a CPU node, and the worker on a different machine (a GPU node). + +You will need [etcd](https://etcd.io/) and [nats](https://nats.io) with jetstream installed and accessible from both nodes. For development I run NATS like this: `nats-server -js --trace --store_dir $(mktemp -d)`. + +**Node 1:** OpenAI compliant HTTP server, optional pre-processing, worker discovery: + +``` +dynamo-run in=http out=auto +``` + +**Node 2:** Engine. Receives and returns requests over the network: + +``` +dynamo-run in=dyn://llama3B.backend.generate out=mistralrs ~/llms/Llama-3.2-3B-Instruct +``` + +This uses etcd to auto-discover the model and NATS to talk to it. You can +run multiple instances on the same endpoint; it picks one based on the +`--router-mode` (round-robin by default if left unspecified). + +Run `dynamo-run --help` for more options. + +### Network names + +The `in=dyn://` URLs have the format `dyn://namespace.component.endpoint`. For quickstart just use any string `dyn://test`, `dynamo-run` will default any missing parts for you. The pieces matter for a larger system. + +* *Namespace*: A pipeline. Usually a model. e.g "llama_8b". Just a name. +* *Component*: A load balanced service needed to run that pipeline. "backend", "prefill", "decode", "preprocessor", "draft", etc. This typically has some configuration (which model to use, for example). 
+* *Endpoint*: Like a URL. "generate", "load_metrics". +* *Instance*: A process. Unique. Dynamo assigns each one a unique instance_id. The thing that is running is always an instance. Namespace/component/endpoint can refer to multiple instances. + +If you run two models, that is two pipelines. An exception would be if doing speculative decoding. The draft model is part of the pipeline of a bigger model. + +If you run two instances of the same model ("data parallel") they are the same namespace+component+endpoint but different instances. The router will spread traffic over all the instances of a namespace+component+endpoint. If you have four prefill workers in a pipeline, they all have the same namespace+component+endpoint and are automatically assigned unique instance_ids. + +Example 1: Data parallel load balanced, one model one pipeline two instances. +``` +Node 1: dynamo-run in=dyn://qwen3-32b.backend.generate /data/Qwen3-32B +Node 2: dynamo-run in=dyn://qwen3-32b.backend.generate /data/Qwen3-32B +``` + +Example 2: Two models, two pipelines. +``` +Node 1: dynamo-run in=dyn://qwen3-32b.backend.generate /data/Qwen3-32B +Node 2: dynamo-run in=dyn://llama3-1-8b.backend.generate /data/Llama-3.1-8B-Instruct/ +``` + +Example 3: Different endpoints. + +The KV metrics publisher in VLLM adds a `load_metrics` endpoint to the current component. If the `llama3-1-8b.backend` component above is using patched vllm it will also expose `llama3-1-8b.backend.load_metrics`. + +Example 4: Multiple component in a pipeline. + +In the P/D disaggregated setup you would have `deepseek-distill-llama8b.prefill.generate` (possibly multiple instances of this) and `deepseek-distill-llama8b.decode.generate`. + +For output it is always only `out=auto`. This tells Dynamo to auto-discover the instances, group them by model, and load balance appropriately (depending on `--router-mode` flag). 
+ +### KV-aware routing + +``` +dynamo-run in=http out=auto --router-mode kv +``` + +The only difference from the distributed system above is `--router-mode kv`. vllm announces when a KV block is created or removed. The Dynamo router finds the worker with the best match for those KV blocks and directs the traffic to that node. + +For performance testing, compare a typical workload with `--router-mode random|round-robin` to see if it can benefit from KV-aware routing. + +The KV-aware routing arguments: + +- `--kv-overlap-score-weight`: Sets the amount of weighting on overlaps with prefix caches, which directly contributes to the prefill cost. A large weight is expected to yield a better TTFT (at the expense of worse ITL). When set to 0, prefix caches are not considered at all (falling back to pure load balancing behavior on the active blocks). + +- `--router-temperature`: Sets the temperature when randomly selecting workers to route to via softmax sampling on the router cost logits. Setting it to 0 recovers the deterministic behavior where the min logit is picked. + +- `--use-kv-events`: Sets whether to listen to KV events for maintaining the global view of cached blocks. If true, the router uses KV events to track block creation and deletion from workers. If false, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Set false if your backend engine does not emit KV events. + +### Request Migration + +In a [Distributed System](#distributed-system), you can enable [request migration](../fault-tolerance/request-migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker: + +```bash +dynamo-run in=dyn://... out= ... --migration-limit=3 +``` + +This allows a request to be migrated up to 3 times before failing. 
See the [Request Migration Architecture](../fault-tolerance/request-migration.md) documentation for details on how this works. + +### Request Cancellation + +When using the HTTP interface (`in=http`), if the HTTP request connection is dropped by the client, Dynamo automatically cancels the downstream request to the worker. This ensures that computational resources are not wasted on generating responses that are no longer needed. + +For detailed information about how request cancellation works across the system, see the [Request Cancellation Architecture](../fault-tolerance/request-cancellation.md) documentation. + +## Development + +`dynamo-run` is also an example of what can be built in Rust with the `dynamo-llm` and `dynamo-runtime` crates. The following guide shows how to build from source with all the features. + +### Step 1: Install libraries +**Ubuntu:** +``` +sudo apt install -y build-essential libhwloc-dev libudev-dev pkg-config libssl-dev libclang-dev protobuf-compiler python3-dev cmake +``` + +**macOS:** +- [Homebrew](https://brew.sh/) +``` +# if brew is not installed on your system, install it +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" +``` +- [Xcode](https://developer.apple.com/xcode/) + +``` +brew install cmake protobuf + +## Check that Metal is accessible +xcrun -sdk macosx metal +``` +If Metal is accessible, you should see an error like `metal: error: no input files`, which confirms it is installed correctly. 
+ +### Step 2: Install Rust +``` +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +source $HOME/.cargo/env +``` + +### Step 3: Build + +- Linux with GPU and CUDA (tested on Ubuntu): +``` +cargo build --features cuda +``` + +- macOS with Metal: +``` +cargo build --features metal +``` + +- CPU only: +``` +cargo build +``` + +Optionally you can run `cargo build` from any location with arguments: + +``` +--target-dir /path/to/target_directory # specify target_directory with write privileges +--manifest-path /path/to/project/Cargo.toml # if cargo build is run outside of `launch/` directory +``` + +The binary is called `dynamo-run` in `target/debug` +``` +cd target/debug +``` + +Build with `--release` for a smaller binary and better performance, but longer build times. The binary will be in `target/release`. + + +## Engines + +The input defaults to `in=text`. The output defaults to `out=mistralrs` engine, unless it is disabled with `--no-default-features` in which case an engine that echo's back your input is used. + +### mistralrs + +[mistral.rs](https://github.com/EricLBuehler/mistral.rs) is a pure Rust engine that is fast to run and fast to load, and runs well on CPU as well as GPU. For those reasons it is the default engine. + +``` +dynamo-run Qwen/Qwen3-4B +``` + +is equivalent to + +``` +dynamo-run in=text out=mistralrs Qwen/Qwen3-4B +``` + +If you have multiple GPUs, `mistral.rs` does automatic tensor parallelism. You do not need to pass any extra flags to dynamo-run to enable it. + +### Mocker engine + +The mocker engine is a mock vLLM implementation designed for testing and development purposes. 
It simulates realistic token generation timing without requiring actual model inference, making it useful for: + +- Testing distributed system components without GPU resources +- Benchmarking infrastructure and networking overhead +- Developing and debugging Dynamo components +- Load testing and performance analysis + +**Basic usage:** + +The `--model-path` is required but can point to any valid model path - the mocker doesn't actually load the model weights (but the pre-processor needs the tokenizer). The arguments `block_size`, `num_gpu_blocks`, `max_num_seqs`, `max_num_batched_tokens`, `enable_prefix_caching`, and `enable_chunked_prefill` are common arguments shared with the real VLLM engine. + +And below are arguments that are mocker-specific: +- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. +- `dp_size`: Number of data parallel workers to simulate (default: 1) +- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real VLLM engine but cannot be passed as an engine arg. + +```bash +echo '{"speedup_ratio": 10.0}' > mocker_args.json +dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json +dynamo-run in=http out=auto --router-mode kv +``` + +### echo + +The `echo` engine echoes the prompt back as the response. + +``` +dynamo-run in=http out=echo --model-name my_model +``` + +The echo engine uses a configurable delay between tokens to simulate generation speed. You can adjust this using the `DYN_TOKEN_ECHO_DELAY_MS` environment variable: + +``` +# Set token echo delay to 1ms (1000 tokens per second) +DYN_TOKEN_ECHO_DELAY_MS=1 dynamo-run in=http out=echo +``` + +The default delay is 10ms, which produces approximately 100 tokens per second. 
+ +### Other engines, multi-node, production + +`vllm`, `sglang` and `trtllm` production grade engines are available in `examples/backends`. They run as Python components, using the Rust bindings. See the main README. + +`dynamo-run` is an exploration, development and prototyping tool, as well as an example of using the Rust API. Multi-node and production setups should be using the main engine components. + +## Batch mode + +`dynamo-run` can take a jsonl file full of prompts and evaluate them all: + +``` +dynamo-run in=batch:prompts.jsonl out=mistralrs +``` + +The input file should look like this: +``` +{"text": "What is the capital of France?"} +{"text": "What is the capital of Spain?"} +``` + +Each one is passed as a prompt to the model. The output is written back to the same folder in `output.jsonl`. At the end of the run some statistics are printed. +The output looks like this: +``` +{"text":"What is the capital of France?","response":"The capital of France is Paris.","tokens_in":7,"tokens_out":7,"elapsed_ms":1566} +{"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855} +``` + +## Writing your own engine in Python + +The [dynamo](https://pypi.org/project/ai-dynamo/) Python library allows you to build your own engine and attach it to Dynamo. All of the main backend components in `examples/backends/` work like this. + +The Python file must do three things: +1. Decorate a function to get the runtime +2. Register on the network +3. Attach a request handler + +``` +from dynamo.llm import ModelInput, ModelType, register_llm +from dynamo.runtime import DistributedRuntime, dynamo_worker + + # 1. Decorate a function to get the runtime + # + @dynamo_worker() + async def worker(runtime: DistributedRuntime): + + # 2. 
Register ourselves on the network + # + component = runtime.namespace("namespace").component("component") + model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B" + model_input = ModelInput.Tokens # or ModelInput.Text if engine handles pre-processing + model_type = ModelType.Chat # or ModelType.Chat | ModelType.Completions if model can be deployed on chat and completions endpoints + endpoint = component.endpoint("endpoint") + # Optional last param to register_llm is model_name. If not present derives it from model_path + await register_llm(model_input, model_type, endpoint, model_path) + + # Initialize your engine here + # engine = ... + + # 3. Attach request handler + # + await endpoint.serve_endpoint(RequestHandler(engine).generate) + +class RequestHandler: + + def __init__(self, engine): + ... + + async def generate(self, request): + # Call the engine + # yield result dict + ... + +if __name__ == "__main__": + uvloop.install() + asyncio.run(worker()) +``` + + +The `model_path` can be: +- A HuggingFace repo ID, optionally prefixed with `hf://`. It is downloaded and cached locally. +- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`. + +The `model_input` can be: +- ModelInput.Tokens. Your engine expects pre-processed input (token IDs). Dynamo handles tokenization and pre-processing. +- ModelInput.Text. Your engine expects raw text input and handles its own tokenization and pre-processing. + +The `model_type` can be: +- ModelType.Chat. Your `generate` method receives a `request` and must return a response dict of type [OpenAI Chat Completion](https://platform.openai.com/docs/api-reference/chat). +- ModelType.Completions. Your `generate` method receives a `request` and must return a response dict of the older [Completions](https://platform.openai.com/docs/api-reference/completions). 
+ +`register_llm` can also take the following kwargs: +- `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name, or the folder name. +- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM. +- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16. +- `user_data`: Optional dictionary containing custom metadata for worker behavior (e.g., LoRA configuration). Defaults to None. + +Here are some example engines: + +- Backend: + * [vllm](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_vllm.py) + * [sglang](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_sglang.py) +- Chat: + * [sglang](https://github.com/ai-dynamo/dynamo/blob/main/lib/bindings/python/examples/hello_world/server_sglang_tok.py) + +More fully-featured Python engines are in `examples/backends`. + +## Debugging + +`dynamo-run` and `dynamo-runtime` support [tokio-console](https://github.com/tokio-rs/console). Build with the feature to enable: +``` +cargo build --features cuda,tokio-console -p dynamo-run +``` + +The listener uses the default tokio console port, and all interfaces (0.0.0.0). + diff --git a/fern/pages/reference/glossary.md b/fern/pages/reference/glossary.md new file mode 100644 index 00000000000..49df07f75a5 --- /dev/null +++ b/fern/pages/reference/glossary.md @@ -0,0 +1,91 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "NVIDIA Dynamo Glossary" +--- + +## B +**Block** - A fixed-size chunk of tokens (typically 16 or 64 tokens) used for efficient KV cache management and memory allocation, serving as the fundamental unit for techniques like PagedAttention. 
+ +## C +**Component** - The fundamental deployable unit in Dynamo. A discoverable service entity that can host multiple endpoints and typically maps to a Docker container (such as VllmWorker, Router, Processor). + +**Conditional Disaggregation** - Dynamo's intelligent decision-making process within disaggregated serving that determines whether a request is processed locally or sent to a remote prefill engine based on prefill length and queue status. + +## D +**Decode Phase** - The second phase of LLM inference that generates output tokens one at a time. + +**Disaggregated Serving** - Dynamo's core architecture that separates prefill and decode phases into specialized engines to maximize GPU throughput and improve performance. + +**Distributed Runtime** - Dynamo's Rust-based core system that manages service discovery, communication, and component lifecycle across distributed clusters. + +**Dynamo** - NVIDIA's high-performance distributed inference framework for Large Language Models (LLMs) and generative AI models, designed for multinode environments with disaggregated serving and cache-aware routing. + +**Dynamo Kubernetes Platform** - A Kubernetes platform providing managed deployment experience for Dynamo inference graphs. + +## E +**Endpoint** - A specific network-accessible API within a Dynamo component, such as `generate` or `load_metrics`. + +## F +**Frontend** - Dynamo's API server component that receives user requests and provides OpenAI-compatible HTTP endpoints. + +## G +**Graph** - A collection of interconnected Dynamo components that form a complete inference pipeline with request paths (single-in) and response paths (many-out for streaming). A graph can be packaged into a Dynamo Artifact for deployment. + +## I +**Instance** - A running process with a unique `instance_id`. 
Multiple instances can serve the same namespace, component, and endpoint for load balancing.
+ +**Planner** - Dynamo component that performs dynamic resource scaling based on real-time demand signals and system metrics. + +**Prefill Phase** - The first phase of LLM inference that processes the input prompt and generates KV cache. + +**Prefix Caching** - Optimization technique that reuses previously computed KV cache for common prompt prefixes. + +**Processor** - Dynamo component that handles request preprocessing, tokenization, and routing decisions. + +## R +**RadixAttention** - Technique from SGLang that uses a prefix tree structure for efficient KV cache matching, insertion, and eviction. + +**RDMA (Remote Direct Memory Access)** - Technology that allows direct memory access between distributed systems, used for efficient KV cache transfers. + +## S +**SGLang** - Fast LLM inference framework with native embedding support and RadixAttention. + +## T +**Tensor Parallelism (TP)** - Model parallelism technique where model weights are distributed across multiple GPUs. + +**TensorRT-LLM** - NVIDIA's optimized LLM inference engine with multinode MPI distributed support. + +**Time-To-First-Token (TTFT)** - The latency from receiving a request to generating the first output token. + +## V +**vLLM** - High-throughput LLM serving engine with distributed tensor/pipeline parallelism and PagedAttention. + +## W +**Wide Expert Parallelism (WideEP)** - Mixture-of-Experts deployment strategy that spreads experts across many GPUs (e.g., 64-way EP) so each GPU hosts only a few experts. + +## X +**xPyD (x Prefill y Decode)** - Dynamo notation describing disaggregated serving configurations where x prefill workers serve y decode workers. Dynamo supports runtime-reconfigurable xPyD. 
diff --git a/fern/pages/reference/support-matrix.md b/fern/pages/reference/support-matrix.md new file mode 100644 index 00000000000..86c4356f9d2 --- /dev/null +++ b/fern/pages/reference/support-matrix.md @@ -0,0 +1,123 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Dynamo Support Matrix" +--- + +This document provides the support matrix for Dynamo, including hardware, software and build instructions. + +## Hardware Compatibility + +| **CPU Architecture** | **Status** | +| :------------------- | :----------- | +| **x86_64** | Supported | +| **ARM64** | Supported | + + +### GPU Compatibility + +If you are using a **GPU**, the following GPU models and architectures are supported: + +| **GPU Architecture** | **Status** | +| :----------------------------------- | :--------- | +| **NVIDIA Blackwell Architecture** | Supported | +| **NVIDIA Hopper Architecture** | Supported | +| **NVIDIA Ada Lovelace Architecture** | Supported | +| **NVIDIA Ampere Architecture** | Supported | + +## Platform Architecture Compatibility + +**Dynamo** is compatible with the following platforms: + +| **Operating System** | **Version** | **Architecture** | **Status** | +| :------------------- | :---------- | :--------------- | :----------- | +| **Ubuntu** | 22.04 | x86_64 | Supported | +| **Ubuntu** | 24.04 | x86_64 | Supported | +| **Ubuntu** | 24.04 | ARM64 | Supported | +| **CentOS Stream** | 9 | x86_64 | Experimental | + + +Wheels are built using a manylinux_2_28-compatible environment and they have been validated on CentOS 9 and Ubuntu (22.04, 24.04). +Compatibility with other Linux distributions is expected but has not been officially verified yet. + + + +KV Block Manager is supported only with Python 3.12. Python 3.12 support is currently limited to Ubuntu 24.04. 
+ + +## Software Compatibility + +### Runtime Dependency + +| **Python Package** | **Version** | glibc version | CUDA Version | +| :----------------- | :---------- | :------------------------------------ | :----------- | +| ai-dynamo | 0.8.0 | >=2.28 | | +| ai-dynamo-runtime | 0.8.0 | >=2.28 (Python 3.12 has known issues) | | +| NIXL | 0.8.0 | >=2.27 | >=11.8 | + +### Build Dependency + +The following table shows the dependency versions included with each Dynamo release: + +| **Dependency** | **main (ToT)** | **v0.8.0 (unreleased)** | **v0.7.1** | **v0.7.0.post1** | **v0.7.0** | +| :------------- | :------------- | :---------------------- | :--------- | :--------------- | :--------- | +| SGLang | 0.5.7 | 0.5.7 | 0.5.3.post4| 0.5.3.post4 | 0.5.3.post4| +| TensorRT-LLM | 1.2.0rc6 | 1.2.0rc6 | 1.2.0rc3 | 1.2.0rc3 | 1.2.0rc2 | +| vLLM | 0.13.0 | 0.12.0 | 0.11.0 | 0.11.0 | 0.11.0 | +| NIXL | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | 0.8.0 | + + +**main (ToT)** reflects the current development branch. **v0.8.0** is the upcoming release (planned for January 14, 2025) and not yet available. + + + + +Specific versions of TensorRT-LLM supported by Dynamo are subject to change. Currently TensorRT-LLM does not support Python 3.11 so installation of the ai-dynamo[trtllm] will fail. 
+ + +### CUDA Support by Framework +| **Dynamo Version** | **SGLang** | **TensorRT-LLM** | **vLLM** | +| :------------------- | :-----------------------| :-----------------------| :-----------------------| +| **Dynamo 0.7.1** | CUDA 12.8 | CUDA 13.0 | CUDA 12.9 | + +## Cloud Service Provider Compatibility + +### AWS + +| **Host Operating System** | **Version** | **Architecture** | **Status** | +| :------------------------ | :---------- | :--------------- | :--------- | +| **Amazon Linux** | 2023 | x86_64 | Supported¹ | + + +There is a known issue with the TensorRT-LLM framework when running the AL2023 container locally with `docker run --network host ...` due to a [bug](https://github.com/mpi4py/mpi4py/discussions/491#discussioncomment-12660609) in mpi4py. To avoid this issue, replace the `--network host` flag with more precise networking configuration by mapping only the necessary ports (e.g., 4222 for nats, 2379/2380 for etcd, 8000 for frontend). + + +## Build Support + +**Dynamo** currently provides build support in the following ways: + +- **Wheels**: We distribute Python wheels of Dynamo and KV Block Manager: + - [ai-dynamo](https://pypi.org/project/ai-dynamo/) + - [ai-dynamo-runtime](https://pypi.org/project/ai-dynamo-runtime/) + - **New as of Dynamo v0.7.0:** [kvbm](https://pypi.org/project/kvbm/) as a standalone implementation. 
+ +- **Dynamo Runtime Images**: We distribute multi-arch images (x86 & ARM64 compatible) of the Dynamo Runtime for each of the LLM inference frameworks on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo): + - [SGLang](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/sglang-runtime) + - [TensorRT-LLM](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/tensorrtllm-runtime) + - [vLLM](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/vllm-runtime) + +- **Dynamo Kubernetes Operator Images**: We distribute multi-arch images (x86 & ARM64 compatible) of the Dynamo Operator on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo): + - [kubernetes-operator](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/containers/kubernetes-operator) to simplify deployments of Dynamo Graphs. + +- **Helm Charts**: [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo) hosts the helm charts supporting Kubernetes deployments of Dynamo: + - [Dynamo CRDs](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/helm-charts/dynamo-crds) + - [Dynamo Platform](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/helm-charts/dynamo-platform) + - [Dynamo Graph](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/helm-charts/dynamo-graph) + +- **Rust Crates**: + - [dynamo-runtime](https://crates.io/crates/dynamo-runtime/) + - [dynamo-async-openai](https://crates.io/crates/dynamo-async-openai/) + - [dynamo-parsers](https://crates.io/crates/dynamo-parsers/) + - [dynamo-llm](https://crates.io/crates/dynamo-llm/) + +Once you've confirmed that your platform and architecture are compatible, you can install **Dynamo** by following the instructions in the [Quick Start Guide](https://github.com/ai-dynamo/dynamo/blob/main/README.md#installation). 
diff --git a/fern/pages/router/README.md b/fern/pages/router/README.md new file mode 100644 index 00000000000..002268d326f --- /dev/null +++ b/fern/pages/router/README.md @@ -0,0 +1,251 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KV Router" +--- + +## Overview + +The Dynamo KV Router intelligently routes requests by evaluating their computational costs across different workers. It considers both decoding costs (from active blocks) and prefill costs (from newly computed blocks). Optimizing the KV Router is critical for achieving maximum throughput and minimum latency in distributed inference setups. + +## Quick Start + +### Python / CLI Deployment + +To launch the Dynamo frontend with the KV Router: + +```bash +python -m dynamo.frontend --router-mode kv --http-port 8000 +``` + +This command: +- Launches the Dynamo frontend service with KV routing enabled +- Exposes the service on port 8000 (configurable) +- Automatically handles all backend workers registered to the Dynamo endpoint + +Backend workers register themselves using the `register_llm` API, after which the KV Router automatically: +- Tracks the state of all registered workers +- Makes routing decisions based on KV cache overlap +- Balances load across available workers + +### Kubernetes Deployment + +To enable the KV Router in a Kubernetes deployment, add the `DYN_ROUTER_MODE` environment variable to your frontend service: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment +spec: + services: + Frontend: + dynamoNamespace: my-namespace + componentType: frontend + replicas: 1 + envs: + - name: DYN_ROUTER_MODE + value: kv # Enable KV Smart Router + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 + Worker: + # ... worker configuration ... 
+``` + +**Key Points:** +- Set `DYN_ROUTER_MODE=kv` on the **Frontend** service only +- Workers automatically report KV cache events to the router +- No worker-side configuration changes needed + +**Complete K8s Examples:** +- [TRT-LLM aggregated router example](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/deploy/agg_router.yaml) +- [vLLM aggregated router example](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/agg_router.yaml) +- [SGLang aggregated router example](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/deploy/agg_router.yaml) +- [Distributed inference tutorial](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/kubernetes/Distributed_Inference/agg_router.yaml) + +**For A/B Testing and Advanced K8s Setup:** +See the comprehensive [KV Router A/B Benchmarking Guide](../benchmarks/kv-router-ab-testing.md) for step-by-step instructions on deploying, configuring, and benchmarking the KV router in Kubernetes. + +## Configuration Options + +### CLI Arguments (Python Deployment) + +The KV Router supports several key configuration options: + +- **`--router-mode kv`**: Enable KV cache-aware routing (required) + +- **`--kv-cache-block-size <size>`**: Sets the KV cache block size (default: backend-specific). Larger blocks reduce overlap detection granularity but improve memory efficiency. This should match your backend configuration. 
+ +- **`--router-temperature <temperature>`**: Controls routing randomness (default: 0.0) + - `0.0`: Deterministic selection of the best worker + - `> 0.0`: Probabilistic selection using softmax sampling + - Higher values increase randomness, helping prevent worker saturation + +- **`--kv-events` / `--no-kv-events`**: Controls how the router tracks cached blocks (default: `--kv-events`) + - `--kv-events`: Uses real-time events from workers for accurate cache tracking + - `--no-kv-events`: Uses approximation based on routing decisions (lower overhead, less accurate) + +- **`--kv-overlap-score-weight <weight>`**: Balance between prefill and decode optimization (default: 1.0) + - Higher values (> 1.0): Prioritize reducing prefill cost (better TTFT) + - Lower values (< 1.0): Prioritize decode performance (better ITL) + +For a complete list of available options: +```bash +python -m dynamo.frontend --help +``` + +### Kubernetes Environment Variables + +All CLI arguments can be configured via environment variables in Kubernetes deployments. 
Use the `DYN_` prefix with uppercase parameter names: + +| CLI Argument | K8s Environment Variable | Default | Description | +|--------------|-------------------------|---------|-------------| +| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` | Enable KV router | +| `--router-temperature ` | `DYN_ROUTER_TEMPERATURE=` | `0.0` | Routing randomness | +| `--kv-cache-block-size ` | `DYN_KV_CACHE_BLOCK_SIZE=` | Backend-specific | KV cache block size | +| `--no-kv-events` | `DYN_KV_EVENTS=false` | `true` | Disable KV event tracking | +| `--kv-overlap-score-weight ` | `DYN_KV_OVERLAP_SCORE_WEIGHT=` | `1.0` | Prefill vs decode weight | +| `--http-port ` | `DYN_HTTP_PORT=` | `8000` | HTTP server port | + +### Example with Advanced Configuration + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-deployment +spec: + services: + Frontend: + dynamoNamespace: my-namespace + componentType: frontend + replicas: 1 + envs: + - name: DYN_ROUTER_MODE + value: kv + - name: DYN_ROUTER_TEMPERATURE + value: "0.5" # Add some randomness to prevent worker saturation + - name: DYN_KV_OVERLAP_SCORE_WEIGHT + value: "1.5" # Prioritize TTFT over ITL + - name: DYN_KV_CACHE_BLOCK_SIZE + value: "16" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 +``` + +### Alternative: Using Command Args in K8s + +You can also pass CLI arguments directly in the container command: + +```yaml +extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.0 + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --router-mode kv --router-temperature 0.5 --http-port 8000" +``` + +**Recommendation:** Use environment variables for easier configuration management and consistency with Dynamo's K8s patterns. + +## KV Router Architecture + +The KV Router tracks two key metrics for each worker: + +1. 
**Potential Active Blocks**: The number of blocks that would be used for decoding if a request is routed to a worker. This includes both existing active blocks and new blocks from the incoming request. + +2. **Potential New Prefill Blocks**: The number of tokens that need to be computed from scratch on a worker, calculated as: + - New prefill tokens = Total input tokens - (Overlap blocks × Block size) + - Potential prefill blocks = New prefill tokens / Block size + +### Block Tracking Mechanisms + +The router maintains block information through two complementary systems: + +- **Active Decoding Blocks**: Tracked locally by the router throughout the request lifecycle: + - Incremented when adding a new request + - Updated during token generation + - Decremented upon request completion + +- **Cached Blocks**: Maintained globally by the KvIndexer using a prefix tree built from worker-reported KV events. This provides accurate overlap information for routing decisions. + +## Cost Function + +The KV Router's routing decision is based on a simple cost function: + +``` +logit = kv_overlap_score_weight × potential_prefill_blocks + potential_active_blocks +``` + +Where: +- Lower logit values are better (less computational cost) +- The router uses softmax sampling with optional temperature to select workers + +### Key Parameter: kv-overlap-score-weight + +The `kv-overlap-score-weight` parameter (default: 1.0) controls the balance between prefill and decode optimization: + +- **Higher values (> 1.0)**: Emphasize reducing prefill cost + - Prioritizes routing to workers with better cache hits + - Optimizes for Time To First Token (TTFT) + - Best for workloads where initial response latency is critical + +- **Lower values (< 1.0)**: Emphasize decode performance + - Distributes active decoding blocks more evenly + - Optimizes for Inter-Token Latency (ITL) + - Best for workloads with long generation sequences + +## KV Events vs. 
Approximation Mode + +The router uses KV events from workers by default to maintain an accurate global view of cached blocks. You can disable this with the `--no-kv-events` flag: + +- **With KV Events (default)**: + - Calculates overlap accurately using actual cached blocks + - Provides higher accuracy with event processing overhead + - Recommended for production deployments + +- **Without KV Events (--no-kv-events)**: + - Router predicts cache state based on routing decisions with TTL-based expiration and pruning + - Tracks blocks from recent requests with configurable time-to-live + - Reduces overhead at the cost of routing accuracy + - Suitable for testing or when event processing becomes a bottleneck + +## Tuning Guidelines + +### 1. Understand Your Workload Characteristics + +- **Prefill-heavy workloads** (long prompts, short generations): Increase `kv-overlap-score-weight` +- **Decode-heavy workloads** (short prompts, long generations): Decrease `kv-overlap-score-weight` + +### 2. Monitor Key Metrics + +The router logs the cost calculation for each worker: +``` +Formula for worker_1: 125.3 = 1.0 * 100.5 + 25.0 (cached_blocks: 15) +``` + +This shows: +- Total cost (125.3) +- Overlap weight × prefill blocks (1.0 × 100.5) +- Active blocks (25.0) +- Cached blocks that contribute to overlap (15) + +### 3. Temperature-Based Routing + +The `router_temperature` parameter controls routing randomness: +- **0.0 (default)**: Deterministic selection of the best worker +- **> 0.0**: Probabilistic selection, higher values increase randomness +- Useful for preventing worker saturation and improving load distribution + +### 4. Iterative Optimization + +1. Begin with default settings +2. Monitor TTFT and ITL metrics +3. Adjust `kv-overlap-score-weight` to meet your performance goals: + - To reduce TTFT: Increase the weight + - To reduce ITL: Decrease the weight +4. 
If you observe severe load imbalance, increase the temperature setting diff --git a/fern/pages/router/kv-cache-routing.md b/fern/pages/router/kv-cache-routing.md new file mode 100644 index 00000000000..3fb8c910733 --- /dev/null +++ b/fern/pages/router/kv-cache-routing.md @@ -0,0 +1,730 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "KV Cache Routing" +--- + +This document explains how Dynamo's Key-Value (KV) cache routing optimizes large language model inference by intelligently directing requests to workers with the most relevant cached data, while maintaining load balance through worker utilization metrics. + +To enable KV cache aware routing start the frontend node like this: +``` +python -m dynamo.frontend --router-mode kv +``` + +When KV blocks are created or removed, the engine notifies the Dynamo router, which then identifies the worker with the best matching blocks and routes traffic accordingly. + +To evaluate the benefits of KV-aware routing, compare your workload's performance using `--router-mode random|round-robin` against KV-aware routing. + +The main KV-aware routing arguments: + +- `--kv-overlap-score-weight`: Controls the importance of prefix cache overlaps in prefill cost calculations. Higher values improve Time To First Token (TTFT) at the cost of Inter-Token Latency (ITL). When set to 0, the router ignores prefix caches and uses pure load balancing. Defaults to 1. + +- `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness. + +- `--no-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. 
When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events). + +- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - those are synchronized through JetStream events. + +- `--router-reset-states`: When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. By default (when this flag is not provided), the router persists state across restarts, downloading any available snapshot from NATS object store and continuing to consume events from where it left off. This enables routers to maintain KV cache awareness across restarts. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate. + +- `--router-snapshot-threshold`: Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart. + +- `--no-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. 
Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management. + +- `--active-decode-blocks-threshold`: Initial threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache block utilization. When a worker's KV cache active blocks exceed this percentage of total blocks, it will be marked as busy and excluded from routing. If not set, blocks-based busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines emit `ForwardPassMetrics`. The threshold can be dynamically updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)). + +- `--active-prefill-tokens-threshold`: Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled. + +- `--router-ttl`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate. + +- `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1048576 (2^20 blocks) when `--no-kv-events` is used. This prevents unbounded memory growth in long-running deployments. + +- `--router-prune-target-ratio`: Target size ratio to prune down to when `--router-max-tree-size` is exceeded. 
For example, with a value of 0.8 (default) and max tree size of 1048576, the router will prune down to approximately 838860 blocks when the threshold is exceeded. Defaults to 0.8 when `--no-kv-events` is used. This creates headroom before the next pruning cycle. + +>[!Note] +> **State persistence** depends on the event transport mode: +> - **JetStream mode** (default): State persists across router restarts via JetStream and NATS object store snapshots. +> - **NATS Core with Local Indexer mode** (`--enable-local-indexer` on workers): State persists on workers—router rebuilds state by querying workers on startup. +> - **No KV events** (`--no-kv-events`): State persistence is not supported. +> +> **Request plane is independent of KV event transport.** +> `DYN_REQUEST_PLANE` controls how **requests** are sent (TCP/HTTP/NATS), but KV-aware routing still uses **NATS** for KV events in both JetStream and NATS Core + Local Indexer modes. +> If you run with `DYN_REQUEST_PLANE=tcp` (or `http`) and KV events enabled (default), you must also configure NATS, e.g. `NATS_SERVER=nats://...`. +> Only `--no-kv-events` removes the NATS requirement. +> +> When `--kv-overlap-score-weight` is set to 0, no KvIndexer is created and prefix matching is disabled (pure load balancing). When `--no-kv-events` is set, a KvIndexer is still created but no event subscriber is launched to consume KV events from workers. Instead, the router predicts cache state based on its own routing decisions with TTL-based expiration and pruning. In both cases, it's recommended to disable your backend workers from publishing events through `KvEventPublisher` to avoid event accumulation in JetStream. WIP to enable disabling publishing of KV events completely in these cases. +> +> The cli args `--router-ttl`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. 
When KV events are enabled (default), the router relies on worker-side eviction events and these parameters are ignored. + +## Prerequisites and Limitations + +>[!Note] +> **KV Router Requirements**: The KV router currently works only with **dynamic endpoints** that are registered via [`register_llm()`](../development/backend-guide.md) with `model_input=ModelInput.Tokens`. Your backend handler receives pre-tokenized requests with `token_ids` instead of raw text. + +**Current Limitations (WIP):** +- **Static endpoints**: Not yet supported. The KV router requires dynamic model discovery via etcd to track worker instances and their KV cache states. +- **Multimodal models**: Not yet supported. The KV router currently tracks token-based blocks only. + +**What this means for your setup:** +1. Backend workers must call `register_llm()` with `model_input=ModelInput.Tokens` (see [Backend Guide](../development/backend-guide.md) or [example implementations](https://github.com/ai-dynamo/dynamo/tree/main/lib/bindings/python/examples/hello_world)) +2. Your handler receives requests with pre-tokenized `token_ids`, not raw text or multimodal inputs +3. You cannot use `--static-endpoint` mode with KV routing (use dynamic discovery instead) + +For basic model registration without KV routing, you can use `--router-mode round-robin` or `--router-mode random` with both static and dynamic endpoints. + +## Disaggregated Serving (Prefill and Decode) + +Dynamo supports disaggregated serving where prefill (prompt processing) and decode (token generation) are handled by separate worker pools. When you register workers with `ModelType.Prefill` (see [Backend Guide](../development/backend-guide.md)), the frontend automatically detects them and activates an internal prefill router. + +### Automatic Prefill Router Activation + +The prefill router is automatically created when: +1. A decode model is registered (e.g., via `register_llm()` with `ModelType.Chat | ModelType.Completions`) +2. 
A prefill worker is detected with the same model name and `ModelType.Prefill` + +**Key characteristics of the prefill router:** +- **Always disables active block tracking** (`track_active_blocks=false`) since prefill workers don't perform decode +- **Seamlessly integrated** into the request pipeline between preprocessing and decode routing +- **Falls back gracefully** to decode-only mode if prefill fails or no prefill workers are available + +### Setup Example + +When both workers are registered, requests are automatically routed. + +```python +# Decode worker registration (in your decode worker) +decode_endpoint = runtime.namespace("dynamo").component("decode").endpoint("generate") + +await register_llm( + model_input=ModelInput.Tokens, + model_type=ModelType.Chat | ModelType.Completions, + endpoint=decode_endpoint, + model_name="meta-llama/Llama-2-7b-hf", + # ... other parameters +) + +await decode_endpoint.serve_endpoint(decode_handler.generate) + +# Prefill worker registration (in your prefill worker) +prefill_endpoint = runtime.namespace("dynamo").component("prefill").endpoint("generate") + +await register_llm( + model_input=ModelInput.Tokens, + model_type=ModelType.Prefill, # <-- Mark as prefill worker + endpoint=prefill_endpoint, + model_name="meta-llama/Llama-2-7b-hf", # Must match decode model name + # ... other parameters +) + +await prefill_endpoint.serve_endpoint(prefill_handler.generate) +``` + + +The unified frontend with automatic prefill routing is currently enabled for vLLM and TensorRT-LLM backends. For SGLang (work in progress), you need to launch a separate standalone router as the prefill router targeting the prefill endpoints. See example script: [`examples/backends/sglang/launch/disagg_router.sh`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/launch/disagg_router.sh). 
+ + +### Request Flow + +The following diagram shows an overview of the major components in disaggregated serving: + +```mermaid +graph TD + HTTP[HTTP] + ROUTER[Router] + PREFILL[Prefill Worker] + DECODE[Decode Worker] + + classDef worker_style fill:#f3e5f5,stroke:#333,stroke-width:2px,color:#333; + classDef router_style fill:#2e8b57,stroke:#333,stroke-width:2px,color:#fff; + + class PREFILL,DECODE worker_style + class ROUTER router_style + + HTTP <--> |"request/response"| ROUTER + ROUTER --> |"1. send to prefill"| PREFILL + PREFILL --> |"2. return NIXL metadata"| ROUTER + ROUTER --> |"3. send with metadata"| DECODE + DECODE --> |"4. stream response"| ROUTER + + PREFILL -.-> |"publish kv events"| ROUTER + + linkStyle 0,1,2,3,4 stroke:#8b4513,stroke-width:2px + linkStyle 5 stroke:#2196f3,stroke-width:2px +``` + +## Overview + +The KV-aware router operates on two key principles to optimize request routing: + +### Global KV Cache State Synchronization + +KV events from engines are collected by the router to maintain a global view of cached blocks across all workers. The router supports two event transport modes: + +#### Mode 1: JetStream (Default) + +KV events are sent to a persistent NATS JetStream. Each KV router/indexer replica acts as a durable consumer, pulling messages from this shared stream. This architecture ensures consistency across router replicas and persistence across restarts. + +- **Best for**: Production deployments requiring durability and multi-replica router consistency +- **Tradeoffs**: Requires JetStream setup; slightly higher latency due to persistence guarantees + +```mermaid +graph TD + subgraph Engines + E1[Engine 1
KVPublisher] + E2[Engine 2
KVPublisher] + E3[Engine 3
KVPublisher] + end + + subgraph "NATS JetStream" + JS[(Persistent KV Events Stream
- Block created
- Block removed)] + end + + subgraph "NATS Object Store" + OS[(Radix Tree
State Snapshot)] + end + + subgraph "Router Replicas" + R1[Router 1
KVIndexer] + R2[Router 2
KVIndexer] + end + + E1 -->|Publish Events| JS + E2 -->|Publish Events| JS + E3 -->|Publish Events| JS + + JS -->|Consume as Durable Consumer| R1 + JS -->|Consume as Durable Consumer| R2 + JS -->|Periodic Snapshot| OS + + style JS fill:#e1f5fe,stroke:#333,color:#333 + style OS fill:#e1f5fe,stroke:#333,color:#333 + style E1 fill:#f3e5f5,stroke:#333,color:#333 + style E2 fill:#f3e5f5,stroke:#333,color:#333 + style E3 fill:#f3e5f5,stroke:#333,color:#333 + style R1 fill:#2e8b57,stroke:#333,color:#fff + style R2 fill:#2e8b57,stroke:#333,color:#fff + + linkStyle 0,1,2,3,4,5 stroke:#2196f3,stroke-width:2px +``` + +#### Mode 2: NATS Core with Local Indexer + +When workers are started with `--enable-local-indexer`, each worker maintains its own local radix tree (local indexer) and publishes events over NATS Core (fire-and-forget pub/sub) instead of JetStream. Each worker assigns monotonically increasing event IDs to its events. The router detects gaps in event sequences and recovers missed events by querying the worker's local indexer directly. + +- **Best for**: Lower-latency setups; simpler deployments without JetStream; single-router scenarios +- **Tradeoffs**: State persists on workers (not centralized); recovery depends on workers being available +- **Enable with**: `--enable-local-indexer` flag on workers (vLLM, mocker) + +```mermaid +graph TD + subgraph Engines + E1[Engine 1
LocalKvIndexer] + E2[Engine 2
LocalKvIndexer] + E3[Engine 3
LocalKvIndexer] + end + + subgraph "NATS Core" + NC[KV Events Pub/Sub
- Block created
- Block removed] + end + + subgraph "Router Replicas" + R1[Router 1
KVIndexer] + R2[Router 2
KVIndexer] + end + + E1 -->|Publish Events| NC + E2 -->|Publish Events| NC + E3 -->|Publish Events| NC + + NC -->|Subscribe| R1 + NC -->|Subscribe| R2 + + style NC fill:#e1f5fe,stroke:#333,color:#333 + style E1 fill:#f3e5f5,stroke:#333,color:#333 + style E2 fill:#f3e5f5,stroke:#333,color:#333 + style E3 fill:#f3e5f5,stroke:#333,color:#333 + style R1 fill:#2e8b57,stroke:#333,color:#fff + style R2 fill:#2e8b57,stroke:#333,color:#fff + + linkStyle 0,1,2,3,4 stroke:#2196f3,stroke-width:2px +``` + +**How gap detection works:** +1. Each worker assigns monotonically increasing event IDs starting from 0 +2. The router tracks the last received event ID per worker +3. If an event arrives with `event_id > last_id + 1`, the router detects a gap +4. The router queries the worker's local indexer for the missing event range `[last_id+1, event_id-1]` +5. On worker discovery (Added event), the router dumps the worker's entire local indexer state + +**Startup behavior:** +- When a worker is discovered, the router queries and ingests its full local indexer state +- When a worker is removed, the router removes all its blocks from the global radix tree + +>[!Note] +> The router automatically selects the transport mode based on worker configuration. If all connected workers have `enable_local_indexer=true`, the router uses NATS Core mode. Otherwise, it uses JetStream mode. + +### Local Active Block Management with Replica Sync + +Second, in addition to cached blocks, each router replica needs to track active blocks (blocks being used for ongoing generation) as load metrics. Since this information is highly time-sensitive, it should be predicted immediately when: +- The router receives and routes a request +- The first token is generated (prefill complete) +- The response ends (request freed) + +This is managed locally in each router via a "slot manager". 
To maintain consistency across the system, router replicas synchronize these local predictions with each other through NATS core messaging. + +```mermaid +sequenceDiagram + participant C1 as Client 1 + participant R1 as Router 1
(Slot Manager) + participant R2 as Router 2
(Slot Manager) + participant C2 as Client 2 + + Note over R1,R2: Router Replica Sync Enabled + + C1->>R1: Request A + activate R1 + R1->>R1: Predict blocks & route to worker + R1-->>R2: Sync: AddRequest(A) + + C2->>R2: Request B + activate R2 + R2->>R2: Predict blocks & route to worker + R2-->>R1: Sync: AddRequest(B) + + R1->>R1: First token received
(prefill complete) + R1-->>R2: Sync: MarkPrefillCompleted(A) + R1->>C1: Stream response + + R2->>R2: First token received
(prefill complete) + R2-->>R1: Sync: MarkPrefillCompleted(B) + R2->>C2: Stream response + + R1->>R1: Response complete
(free blocks) + R1-->>R2: Sync: Free(A) + deactivate R1 + + R2->>R2: Response complete
(free blocks) + R2-->>R1: Sync: Free(B) + deactivate R2 + + Note over R1,R2: Both routers have consistent
view of active blocks +``` + +This dual-layer approach—persistent global KV cache state via JetStream and ephemeral active block synchronization via router replicas—enables the system to make optimal routing decisions that balance cache reuse with load distribution. + +## Basic Routing +Dynamo supports several routing strategies when sending requests from one component to another component's endpoint. + +First, we must create a client tied to a components endpoint, we can do this using the labels defined above. Here we are getting a client tied to the `generate` endpoint of the `VllmWorker` component. + +```python +client = namespace('dynamo').component('VllmWorker').endpoint('generate').client() +``` + +We can then use the default routing methods exposed by the client class to send requests to the `VllmWorker` component. + +- **Random routing**: Default strategy, available via `client.generate()` or `client.random()` +- **Round-robin routing**: Cycles through available workers via `client.round_robin()` +- **Direct routing**: Explicitly targets a specific worker via `client.direct(input, component_id)` + +KV Cache routing uses direct routing with a special worker selection algorithm. + +## Serving Multiple Router Replicas + +For improved fault tolerance, you can launch multiple frontend + router replicas. Since the frontend and router are currently tied together, you'll need to use different HTTP ports for each instance. (The separation of the frontend and Router is WIP.) + +### Router State Management + +The KV Router tracks two types of state (see [KV Router Architecture](README.md) for details): + +1. **Prefix blocks (cached KV blocks)**: Maintained in a radix tree, tracking which blocks are cached on each worker. This state is **persistent** - backed by NATS JetStream events and object store snapshots. New router replicas automatically sync this state on startup, ensuring consistent cache awareness across restarts. + +2. 
**Active blocks (decoding blocks)**: Tracks blocks currently being used for active generation requests. This state is **ephemeral** - when a new router replica starts, it begins with zero active block knowledge but becomes eventually consistent as it handles requests. + +### Enabling Router Replica Synchronization + +```bash +# Router replica 1 +python -m dynamo.frontend --router-mode kv --port 8000 --router-replica-sync + +# Router replica 2 (can be started later) +python -m dynamo.frontend --router-mode kv --port 8001 --router-replica-sync +``` + +The `--router-replica-sync` flag enables active block synchronization between replicas: +- Active blocks are shared via NATS core messaging (fire-and-forget) +- Replicas exchange routing decisions to maintain consistent load estimates +- A new replica starts with zero active blocks but quickly converges through request handling, both on its own and through active syncing with other replicas + +Without this flag, each replica maintains its own isolated view of active blocks, potentially leading to suboptimal routing. 
+ +### Persistence and Recovery + +Persistence behavior depends on which event transport mode is active: + +**JetStream Mode (default):** +- Prefix blocks are stored in NATS JetStream with 1-hour retention +- Snapshots saved to NATS object store at configurable thresholds +- New replicas automatically restore this state on startup +- You can launch a third Router replica even if the first two are down, and it will recover the full prefix state + +```bash +python -m dynamo.frontend --router-mode kv --port 8002 --router-replica-sync +``` + +**NATS Core with Local Indexer Mode:** +- State persists on workers—events are fire-and-forget but workers retain their local indexer state +- On startup, the router queries each worker's local indexer to rebuild state +- Recovery depends on workers being available; if a worker is down, its blocks cannot be recovered +- Simpler infrastructure (no JetStream required) but less resilient + +>[!Note] +> If you need to start with a fresh state in JetStream mode, you have two options: +> 1. **Recommended**: Use a different namespace/component (see [Distributed Runtime](../design-docs/distributed-runtime.md)) which will start a new stream and NATS object store path +> 2. **Use with caution**: Launch a router with the `--router-reset-states` flag, which will purge the entire stream and radix snapshot. This should only be done when launching the first router replica in a component, as it can bring existing router replicas into an inconsistent state. + +## Understanding KV Cache +The leading Large Language Models (LLMs) today are auto-regressive and based off of the [transformer architecture](https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf). One key inference optimization technique is to cache the already computed keys and values and to reuse them for the future tokens. 
This is called the [KV Cache](https://developer.nvidia.com/blog/mastering-llm-techniques-inference-optimization/#key-value_caching). + +### KV Cache Optimizations +Every inference framework will have a KV Cache for each worker. A popular inference framework library is [vLLM](https://github.com/vllm-project/vllm) where a key contribution was [PagedAttention](https://arxiv.org/abs/2309.06180), which allowed them to manage KV Cache in an efficient way by chunking requests into blocks. + +Another popular inference framework, [SGLang](https://github.com/sgl-project/sglang), contributed [RadixAttention](https://arxiv.org/abs/2312.07104) which introduced a +prefix tree which allows for efficient matching, inserting and eviction of KV Cache blocks. The prefix tree structure popularized KV Cache reuse. + +In Dynamo, we introduce a KVPublisher which emits KV Cache events that occur at each worker and a KVIndexer which keeps track of these events globally. + +To get a feel for how KV Cache management works on a single worker with KV Cache reuse turned on and where the KVPublisher gets plugged in, we can walk through the KV Block management flow: +1. Request tokenization: The incoming prompt is converted into tokens +2. Block partitioning: The token sequence is divided into fixed-size blocks (e.g., 16 or 64 tokens per block) +3. Block hashing: Each block of tokens is hashed to create a unique identifier +4. Cache lookup: + - For each block, the system checks if a matching block already exists in the KV cache + - If a match is found, the existing KV cache block is reused + - If no match is found, the system proceeds to the next step +5. Resource allocation: + - For blocks without matches, the system attempts to allocate new memory space + - If sufficient memory is available, allocate memory space and proceed to step 7 + - If memory is constrained, proceed to step 6 +6. 
Cache eviction (when necessary): + - The system applies an eviction policy (e.g., LRU, LFU) to identify blocks for removal + - Selected blocks are evicted from the cache + - **KVPublisher emits a KV removed event notifying KVIndexer about the removed block.** + - Alternatively, some systems may offload less-frequently used blocks to CPU memory. +7. KV computation: + - For new blocks, the model computes key and value tensors + - These tensors are stored in the newly allocated cache blocks + - **KVPublisher emits a kv stored event notifying KVIndexer about newly stored blocks**. + +Further details can be found for: [TRT-LLM](https://developer.nvidia.com/blog/introducing-new-kv-cache-reuse-optimizations-in-nvidia-tensorrt-llm/), [vLLM](https://docs.vllm.ai/en/latest/design/automatic_prefix_caching.html#design-automatic-prefix-caching) and [SGLang](https://lmsys.org/blog/2024-01-17-sglang/). + +## KV Cache Routing and Load Balancing +```mermaid +graph TD + T[Tokens] --> R[KV Aware Router] + + R -.-> W1["Worker 1
Cached: 2 blocks
Prefill: 8 blks
Decode: 10 blks"] + R ==>|Selected| W2["Worker 2
Cached: 5 blocks
Prefill: 5 blks
Decode: 5 blks"] + R -.-> W3["Worker 3
Cached: 8 blocks
Prefill: 2 blks
Decode: 9 blks"] + + style T fill:#fff3e0,stroke:#333,color:#333 + style R fill:#2e8b57,stroke:#333,color:#fff + style W1 fill:#f3e5f5,stroke:#333,color:#333 + style W2 fill:#c8e6c9,stroke:#333,color:#333 + style W3 fill:#f3e5f5,stroke:#333,color:#333 + + linkStyle 0,1,2,3 stroke:#8b4513,stroke-width:2px +``` + +KV Cache reuse introduces complexity to LLM serving load balancing. While it can significantly reduce computation costs, routing strategies that ignore worker-specific KV states can lead to: +- Missed cache reuse opportunities due to suboptimal worker selection +- System throughput degradation from uneven request distribution across workers + +The router uses a cost function that considers both the prefill cost (influenced by cached blocks) and the decode load to make optimal routing decisions: + +### Cost Calculation + +1. **Prefill blocks**: Calculated by dividing the number of tokens requiring prefill processing by the block size. The system predicts this based on input tokens and available cached blocks per worker, updating the count when the first output token signals prefill completion. + +2. **Decode blocks**: Estimated from the request's input tokens and each worker's active sequences. The count updates when requests complete and their blocks are freed. + +3. **Cost formula**: `cost = overlap_score_weight * prefill_blocks + decode_blocks` + - Lower costs indicate better routing choices + - `overlap_score_weight` balances cache hit optimization against load distribution + - Higher weights favor cache reuse (improving TTFT), while lower weights prioritize even load distribution (improving ITL) + +### Worker Selection + +The router selects the worker with the lowest cost. When `router_temperature` is set to a non-zero value, the router uses softmax sampling on the normalized cost logits to introduce randomness in the selection, which can help with load distribution. 
+ +Example calculation with `overlap_score_weight = 1.0`: +- Worker 1: cost = 1.0 * 8 + 10 = 18 +- **Worker 2: cost = 1.0 * 5 + 5 = 10** (selected - lowest cost) +- Worker 3: cost = 1.0 * 2 + 9 = 11 + +## Events + +### KVPublisher +The KVPublisher can be initialized and then called in the inference framework where blocks are allocated and removed. + +The two types of events are: +- KV stored event +- KV removed event + +The publisher can be initialized and used through C bindings or Python bindings. + +### Deterministic Event IDs + +Engines do not need to emit deterministic block identifiers in KV events, as the router uses local block hashes (computed from token content) for tracking and matching blocks across workers. However, it is strongly preferred that engines do emit deterministic block identifiers, as this keeps the KvIndexer's internal lookup table smaller and more efficient. To ensure deterministic behavior, all workers should use identical engine versions/configuration. If your engine relies on Python's builtin `hash()` for any event IDs, set `PYTHONHASHSEED=0`; otherwise this setting has no effect. + +### KVIndexer +The KVIndexer builds and maintains a global view of cached blocks in a prefix tree. We modify the original prefix tree by also storing the worker id on each node. This is so we can return the number of matched blocks for each worker. + +The KVIndexer has a method `find_matches_for_request`, which takes in tokens and returns a dictionary with keys of worker id and values of the number of matched KV Blocks. + +### Inter-Router Communication + +In distributed deployments with multiple routers, each router maintains visibility over only a portion of the total requests. To ensure consistent routing decisions, routers synchronize their states through three event types: + +1. **AddRequest**: Notifies other routers when a request is assigned to a worker. 
Includes request ID, worker ID, token sequence blocks, and overlap score to track block usage across the system. + +2. **MarkPrefillCompleted**: Signals when a request moves from prefill to decode phase, allowing routers to update their worker load calculations by excluding completed prefill tokens. + +3. **Free**: Indicates request completion and resource release, enabling accurate block reference counting across all routers. + +Each event carries a unique router ID to prevent self-event processing. This asynchronous communication system ensures optimal routing decisions by maintaining consistent KV cache state across all routers, even as they handle different request streams. + +## Using KvPushRouter Python API + +Instead of launching the KV Router via command line, you can create a `KvPushRouter` object directly in Python. This allows per-request routing configuration overrides. + +>[!Warning] +> **Multiple Routers in Same Process**: If you need to run multiple `KvPushRouter` instances for fault tolerance or load distribution, you must launch them in **separate processes** (e.g., using `python -m dynamo.frontend` with different ports). Creating multiple `KvPushRouter` objects in the same Python process is not supported - they share the same cancellation token from the component's primary lease, so dropping one router will cancel all routers in that process. For in-process routing, use a single `KvPushRouter` instance. + +### Methods + +The `KvPushRouter` provides the following methods: + +- **`generate(token_ids, model, ...)`**: Route and execute a request, returning an async stream of responses. Automatically handles worker selection, state tracking, and lifecycle management. + +- **`best_worker(token_ids, router_config_override=None, request_id=None)`**: Query which worker would be selected for given tokens. Returns `(worker_id, dp_rank, overlap_blocks)`. 
+ - Without `request_id`: Query-only, doesn't update router state + - With `request_id`: Updates router state to track the request. **Note**: If used with `request_id`, you must call `mark_prefill_complete()` and `free()` at the appropriate lifecycle points to maintain accurate load tracking + +- **`best_worker_id(token_ids, router_config_override=None, request_id=None)`**: **[DEPRECATED - use `best_worker()` instead]** Query which worker would be selected for given tokens. Returns `(worker_id, overlap_blocks)`. + - Without `request_id`: Query-only, doesn't update router state + - With `request_id`: Updates router state to track the request. **Note**: If used with `request_id`, you must call `mark_prefill_complete()` and `free()` at the appropriate lifecycle points to maintain accurate load tracking + +- **`get_potential_loads(token_ids)`**: Get detailed load information for all workers, including potential prefill tokens and active decode blocks. Returns a list of load dictionaries. + +- **`mark_prefill_complete(request_id)`**: Signal that a request has completed its prefill phase. Only used for [manual lifecycle management](#2-manual-state-management-advanced) when using `best_worker_id()` for manual routing instead of `generate()`. + +- **`free(request_id)`**: Signal that a request has completed and its resources should be released. Only used for [manual lifecycle management](#2-manual-state-management-advanced) when using `best_worker_id()` for manual routing instead of `generate()`. + +- **`dump_events()`**: Dump all KV cache events from the router's indexer as a JSON string. Useful for debugging and analysis. 
+ +### Setup + +First, launch your backend engines: +```bash +python -m dynamo.vllm --model meta-llama/Llama-2-7b-hf +``` + +### Example Script + +```python +import asyncio +from dynamo._core import DistributedRuntime, KvPushRouter, KvRouterConfig + +async def main(): + # Get runtime and create endpoint + runtime = DistributedRuntime.detached() + namespace = runtime.namespace("dynamo") + component = namespace.component("backend") + endpoint = component.endpoint("generate") + + # Create KV router + kv_router_config = KvRouterConfig() + router = KvPushRouter( + endpoint=endpoint, + block_size=16, + kv_router_config=kv_router_config + ) + + # Your input tokens + token_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + # Generate with per-request routing override + stream = await router.generate( + token_ids=token_ids, + model="meta-llama/Llama-2-7b-hf", + stop_conditions={ + "max_tokens": 20, # Generate exactly 20 tokens + "ignore_eos": True, # Don't stop at EOS token + }, + sampling_options={ + "temperature": 0.7, + "top_p": 0.9, + }, + router_config_override={ + "overlap_score_weight": 2.0, # Prioritize cache hits for this request + "router_temperature": 0.5, # Add routing randomness + } + ) + + # Collect generated tokens + generated_tokens = [] + async for response in stream: + if isinstance(response, dict) and "token_ids" in response: + generated_tokens.extend(response["token_ids"]) + + print(f"Generated {len(generated_tokens)} tokens: {generated_tokens}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Routing Patterns + +The `KvPushRouter` supports multiple usage patterns depending on your control requirements: + +#### 1. Automatic Routing (Recommended) +Call `generate()` directly and let the router handle everything: +```python +stream = await router.generate(token_ids=tokens, model="model-name") +``` +- **Best for**: Most use cases +- **Router automatically**: Selects best worker, updates state, routes request, tracks lifecycle + +#### 2. 
Manual State Management (Advanced) +Use `best_worker_id(request_id=...)` to select and track, then manage the request yourself: +```python +worker_id, overlap = await router.best_worker_id(tokens, request_id="req-123") +response = await client.generate(tokens, request_id="req-123") +# await anext(response) # Get first token +await router.mark_prefill_complete("req-123") # After first token +# async for _ in response: # Continue generating +# ... +await router.free("req-123") # After completion +``` +- **Best for**: Custom request handling with router state tracking +- **Requires**: Calling `mark_prefill_complete()` and `free()` at correct lifecycle points +- **Caution**: Incorrect lifecycle management degrades load balancing accuracy + +#### 3. Hierarchical Router Probing +Query without state updates, then route through a chosen router: +```python +# Probe multiple routers without updating state +worker_id_1, overlap_1 = await router_1.best_worker_id(tokens) # No request_id +worker_id_2, overlap_2 = await router_2.best_worker_id(tokens) + +# Pick the best router (and its selected worker) based on results +chosen_router = router_1 if overlap_1 > overlap_2 else router_2 +chosen_worker_id = worker_id_1 if overlap_1 > overlap_2 else worker_id_2 +stream = await chosen_router.generate(tokens, model="model-name", worker_id=chosen_worker_id) +``` +- **Best for**: Multi-tier deployments (e.g., Envoy Gateway routing to multiple router groups) +- **Advantage**: Query multiple routers before committing to one + +#### 4. 
Custom Load-Based Routing +Use `get_potential_loads()` to implement custom routing logic: +```python +loads = await router.get_potential_loads(tokens) +# Apply custom logic (e.g., weighted scoring, constraints) +best_worker = min(loads, key=lambda x: custom_cost_fn(x)) +stream = await router.generate(tokens, model="model-name", worker_id=best_worker['worker_id']) +``` +- **Best for**: Custom optimization strategies beyond the built-in cost function +- **Advantage**: Full control over worker selection logic +- **See also**: Detailed example below in "Custom Routing Example: Minimizing TTFT" + +All patterns support `router_config_override` to adjust routing behavior per-request without recreating the router. + +### Custom Routing Example: Minimizing TTFT + +Here's an example of using `get_potential_loads()` to implement custom routing that minimizes Time To First Token (TTFT) by selecting the worker with the least prefill work: + +```python +import asyncio +from dynamo._core import DistributedRuntime, KvPushRouter, KvRouterConfig + +async def minimize_ttft_routing(): + # Setup router + runtime = DistributedRuntime.detached() + namespace = runtime.namespace("dynamo") + component = namespace.component("backend") + endpoint = component.endpoint("generate") + + router = KvPushRouter( + endpoint=endpoint, + block_size=16, + kv_router_config=KvRouterConfig() + ) + + # Your input tokens + token_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + # Get potential loads for all workers + potential_loads = await router.get_potential_loads(token_ids) + + # Find worker with minimum prefill tokens (best for TTFT) + best_worker = min(potential_loads, key=lambda x: x['potential_prefill_tokens']) + + print(f"Worker loads: {potential_loads}") + print(f"Selected worker {best_worker['worker_id']} with {best_worker['potential_prefill_tokens']} prefill tokens") + + # Route directly to the selected worker + stream = await router.generate( + token_ids=token_ids, + model="meta-llama/Llama-2-7b-hf", + 
worker_id=best_worker['worker_id'], # Force routing to optimal worker + stop_conditions={"max_tokens": 20} + ) + + # Process response + async for response in stream: + if isinstance(response, dict) and "token_ids" in response: + print(f"Generated tokens: {response['token_ids']}") + +if __name__ == "__main__": + asyncio.run(minimize_ttft_routing()) +``` + +This approach gives you complete control over routing decisions, allowing you to optimize for different metrics based on your specific requirements. As some examples: + +- **Minimize TTFT**: Select worker with lowest `potential_prefill_tokens` +- **Maximize cache reuse**: Use `best_worker_id()` which considers both prefill and decode loads +- **Balance load**: Consider both `potential_prefill_tokens` and `potential_decode_blocks` together + +See [KV Router Architecture](README.md) for performance tuning details. + +## Dynamic Threshold Configuration + +The busy thresholds can be updated at runtime without restarting the frontend. The frontend exposes HTTP endpoints at `/busy_threshold`: + +**Get or set a model's thresholds (POST):** +```bash +# Set both thresholds for a model +curl -X POST http://localhost:8000/busy_threshold \ + -H "Content-Type: application/json" \ + -d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}' +# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000} + +# Set only active decode blocks threshold +curl -X POST http://localhost:8000/busy_threshold \ + -H "Content-Type: application/json" \ + -d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85}' +# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": null} + +# Get current thresholds (omit threshold fields) +curl -X POST http://localhost:8000/busy_threshold \ + -H "Content-Type: application/json" \ + -d 
'{"model": "meta-llama/Llama-2-7b-hf"}' +# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000} +# Or if not configured: {"model": "...", "active_decode_blocks_threshold": null, "active_prefill_tokens_threshold": null} +``` + +**List all configured thresholds (GET):** +```bash +curl http://localhost:8000/busy_threshold +# Response: {"thresholds": [{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}]} +``` diff --git a/fern/versions/next.yml b/fern/versions/next.yml new file mode 100644 index 00000000000..ecb533dfc5c --- /dev/null +++ b/fern/versions/next.yml @@ -0,0 +1,292 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Navigation structure for Latest version +# Matching https://docs.nvidia.com/dynamo/latest/ + +navigation: + # ==================== Getting Started ==================== + - section: Getting Started + contents: + - page: Quickstart + path: ../pages/getting-started/quickstart.md + - page: Installation + path: ../pages/getting-started/installation.md + - page: Support Matrix + path: ../pages/reference/support-matrix.md + - page: Examples + path: ../pages/getting-started/examples.md + + # ==================== Kubernetes Deployment ==================== + - section: Kubernetes Deployment + contents: + - section: Deployment Guide + contents: + - page: Kubernetes Quickstart + path: ../pages/kubernetes/README.md + - page: Detailed Installation Guide + path: ../pages/kubernetes/installation-guide.md + - page: Dynamo Operator + path: ../pages/kubernetes/dynamo-operator.md + - page: Minikube Setup + path: ../pages/kubernetes/deployment/minikube-setup.md + - page: Managing Models with DynamoModel + path: ../pages/kubernetes/deployment/dynamomodel-guide.md + - section: Observability (K8s) + contents: + - page: Metrics + path: ../pages/kubernetes/observability/metrics.md + - page: Logging + path: ../pages/kubernetes/observability/logging.md + - section: Multinode + contents: + - page: Multinode Deployments + path: ../pages/kubernetes/deployment/multinode-deployment.md + - page: Grove + path: ../pages/kubernetes/grove.md + + # ==================== User Guides ==================== + - section: User Guides + contents: + - page: Tool Calling + path: ../pages/agents/tool-calling.md + - page: Multimodality Support + path: ../pages/multimodal/index.md + - page: Finding Best Initial Configs + path: ../pages/performance/aiconfigurator.md + - page: Dynamo Benchmarking Guide + path: ../pages/benchmarks/benchmarking.md + - page: Tuning Disaggregated Performance + path: ../pages/performance/tuning.md + - page: Writing Python Workers in Dynamo + path: ../pages/development/runtime-guide.md 
+ - section: Observability (Local) + contents: + - page: Overview + path: ../pages/observability/README.md + - page: Prometheus + Grafana Setup + path: ../pages/observability/prometheus-grafana.md + - page: Metrics + path: ../pages/observability/metrics.md + - page: Metrics Developer Guide + path: ../pages/observability/metrics-developer-guide.md + - page: Health Checks + path: ../pages/observability/health-checks.md + - page: Tracing + path: ../pages/observability/tracing.md + - page: Logging + path: ../pages/observability/logging.md + - section: Fault Tolerance + contents: + - page: Overview + path: ../pages/fault-tolerance/README.md + - page: Request Migration + path: ../pages/fault-tolerance/request-migration.md + - page: Request Cancellation + path: ../pages/fault-tolerance/request-cancellation.md + - page: Graceful Shutdown + path: ../pages/fault-tolerance/graceful-shutdown.md + - page: Request Rejection + path: ../pages/fault-tolerance/request-rejection.md + - page: Testing + path: ../pages/fault-tolerance/testing.md + - page: Glossary + path: ../pages/reference/glossary.md + + # ==================== Components ==================== + - section: Components + contents: + - section: Backends + contents: + - page: vLLM + path: ../pages/backends/vllm/README.md + - page: SGLang + path: ../pages/backends/sglang/README.md + - page: TensorRT-LLM + path: ../pages/backends/trtllm/README.md + - page: Router + path: ../pages/router/README.md + - section: Planner + contents: + - page: Overview + path: ../pages/planner/planner-intro.md + - page: SLA Planner Quick Start + path: ../pages/planner/sla-planner-quickstart.md + - page: SLA-Driven Profiling + path: ../pages/benchmarks/sla-driven-profiling.md + - page: SLA-based Planner + path: ../pages/planner/sla-planner.md + - section: KVBM + contents: + - page: Overview + path: ../pages/kvbm/kvbm-intro.md + - page: Motivation + path: ../pages/kvbm/kvbm-motivation.md + - page: Architecture + path: 
../pages/kvbm/kvbm-architecture.md + - page: Components + path: ../pages/kvbm/kvbm-components.md + - page: Design Deep Dive + path: ../pages/kvbm/kvbm-design-deepdive.md + - page: Integrations + path: ../pages/kvbm/kvbm-integrations.md + - page: KVBM in vLLM + path: ../pages/kvbm/vllm-setup.md + - page: KVBM in TRTLLM + path: ../pages/kvbm/trtllm-setup.md + - page: LMCache Integration + path: ../pages/backends/vllm/LMCache-Integration.md + - page: Further Reading + path: ../pages/kvbm/kvbm-reading.md + + # ==================== Design Docs ==================== + - section: Design Docs + contents: + - page: Overall Architecture + path: ../pages/design-docs/architecture.md + - page: Architecture Flow + path: ../pages/design-docs/dynamo-flow.md + - page: Disaggregated Serving + path: ../pages/design-docs/disagg-serving.md + - page: Distributed Runtime + path: ../pages/design-docs/distributed-runtime.md + - page: Event Plane + path: ../pages/design-docs/event-plane.md + + # ==================== Additional Resources ==================== + # Hidden section - these pages are accessible via direct URL but not shown in navigation + - section: Additional Resources + hidden: true + contents: + - section: Advanced Kubernetes + contents: + - page: Create Deployment + path: ../pages/kubernetes/deployment/create-deployment.md + - page: Autoscaling + path: ../pages/kubernetes/autoscaling.md + - page: Service Discovery + path: ../pages/kubernetes/service-discovery.md + - page: Model Caching with Fluid + path: ../pages/kubernetes/model-caching-with-fluid.md + - page: FluxCD + path: ../pages/kubernetes/fluxcd.md + - page: Webhooks + path: ../pages/kubernetes/webhooks.md + - page: API Reference + path: ../pages/kubernetes/api-reference.md + - section: Multimodal Details + contents: + - page: vLLM + path: ../pages/multimodal/vllm.md + - page: SGLang + path: ../pages/multimodal/sglang.md + - page: TensorRT-LLM + path: ../pages/multimodal/trtllm.md + - section: Router Details + contents: 
+ - page: KV Cache Routing + path: ../pages/router/kv-cache-routing.md + - section: Benchmarks + contents: + - page: KV Router A/B Testing + path: ../pages/benchmarks/kv-router-ab-testing.md + - section: Frontends + contents: + - page: KServe + path: ../pages/frontends/kserve.md + - section: Development + contents: + - page: Backend Guide + path: ../pages/development/backend-guide.md + - section: Guides + contents: + - page: Request Plane + path: ../pages/guides/request-plane.md + - page: Jail Stream + path: ../pages/guides/jail-stream-readme.md + - page: Load Planner + path: ../pages/planner/load-planner.md + - page: CLI Reference + path: ../pages/reference/cli.md + - section: API Reference + contents: + - section: NIXL Connect + contents: + - page: Overview + path: ../pages/api/nixl-connect/README.md + - page: Connector + path: ../pages/api/nixl-connect/connector.md + - page: Device + path: ../pages/api/nixl-connect/device.md + - page: Device Kind + path: ../pages/api/nixl-connect/device-kind.md + - page: Descriptor + path: ../pages/api/nixl-connect/descriptor.md + - page: Read Operation + path: ../pages/api/nixl-connect/read-operation.md + - page: Write Operation + path: ../pages/api/nixl-connect/write-operation.md + - page: Readable Operation + path: ../pages/api/nixl-connect/readable-operation.md + - page: Writable Operation + path: ../pages/api/nixl-connect/writable-operation.md + - page: Operation Status + path: ../pages/api/nixl-connect/operation-status.md + - page: RDMA Metadata + path: ../pages/api/nixl-connect/rdma-metadata.md + - section: Backend Details + contents: + - section: vLLM + contents: + - page: DeepSeek-R1 + path: ../pages/backends/vllm/deepseek-r1.md + - page: GPT-OSS + path: ../pages/backends/vllm/gpt-oss.md + - page: Multi-Node + path: ../pages/backends/vllm/multi-node.md + - page: Speculative Decoding + path: ../pages/backends/vllm/speculative-decoding.md + - page: Prompt Embeddings + path: ../pages/backends/vllm/prompt-embeddings.md + - 
page: Prometheus + path: ../pages/backends/vllm/prometheus.md + - section: SGLang + contents: + - page: GPT-OSS + path: ../pages/backends/sglang/gpt-oss.md + - page: Disaggregation + path: ../pages/backends/sglang/sglang-disaggregation.md + - page: Expert Distribution (EPLB) + path: ../pages/backends/sglang/expert-distribution-eplb.md + - page: HiCache Example + path: ../pages/backends/sglang/sgl-hicache-example.md + - page: Profiling + path: ../pages/backends/sglang/profiling.md + - page: Prometheus + path: ../pages/backends/sglang/prometheus.md + - section: TensorRT-LLM + contents: + - page: GPT-OSS + path: ../pages/backends/trtllm/gpt-oss.md + - page: KV Cache Transfer + path: ../pages/backends/trtllm/kv-cache-transfer.md + - page: Gemma3 Sliding Window + path: ../pages/backends/trtllm/gemma3-sliding-window-attention.md + - page: Llama4 + Eagle + path: ../pages/backends/trtllm/llama4-plus-eagle.md + - page: Multinode Examples + path: ../pages/backends/trtllm/multinode/multinode-examples.md + - page: Prometheus + path: ../pages/backends/trtllm/prometheus.md