diff --git a/.github/filters.yaml b/.github/filters.yaml
index 7ba093ed889..8e728173048 100644
--- a/.github/filters.yaml
+++ b/.github/filters.yaml
@@ -17,6 +17,7 @@ all:
docs:
- 'docs/**'
+ - 'fern/**'
- '**/*.md'
- '**/*.rst'
- '**/*.txt'
diff --git a/docs/kubernetes/dynamo_operator.md b/docs/kubernetes/dynamo_operator.md
index 4e926841333..43e1de8d8fe 100644
--- a/docs/kubernetes/dynamo_operator.md
+++ b/docs/kubernetes/dynamo_operator.md
@@ -86,6 +86,7 @@ helm install dynamo-test dynamo-platform-${RELEASE_VERSION}.tgz \
--create-namespace \
--set dynamo-operator.namespaceRestriction.enabled=true \
--set dynamo-operator.controllerManager.manager.image.tag=v2.0.0-beta
+```
**Observability:**
diff --git a/fern/.gitignore b/fern/.gitignore
new file mode 100644
index 00000000000..3dc347093d9
--- /dev/null
+++ b/fern/.gitignore
@@ -0,0 +1,5 @@
+**/.preview
+**/.definition
+
+# Include logos
+!*.svg
diff --git a/fern/assets/img/architecture.png b/fern/assets/img/architecture.png
new file mode 100644
index 00000000000..c285b70aa6f
Binary files /dev/null and b/fern/assets/img/architecture.png differ
diff --git a/fern/assets/img/disagg-perf-benefit.png b/fern/assets/img/disagg-perf-benefit.png
new file mode 100644
index 00000000000..099a0e8bc54
Binary files /dev/null and b/fern/assets/img/disagg-perf-benefit.png differ
diff --git a/fern/assets/img/dynamo-deploy.png b/fern/assets/img/dynamo-deploy.png
new file mode 100644
index 00000000000..aaea1906806
Binary files /dev/null and b/fern/assets/img/dynamo-deploy.png differ
diff --git a/fern/assets/img/dynamo-flow.png b/fern/assets/img/dynamo-flow.png
new file mode 100644
index 00000000000..672bc6309d9
Binary files /dev/null and b/fern/assets/img/dynamo-flow.png differ
diff --git a/fern/assets/img/favicon.png b/fern/assets/img/favicon.png
new file mode 100644
index 00000000000..308a5e6e209
Binary files /dev/null and b/fern/assets/img/favicon.png differ
diff --git a/fern/assets/img/frontpage-architecture.png b/fern/assets/img/frontpage-architecture.png
new file mode 100644
index 00000000000..1c8c8d42da5
Binary files /dev/null and b/fern/assets/img/frontpage-architecture.png differ
diff --git a/fern/assets/img/frontpage-banner.png b/fern/assets/img/frontpage-banner.png
new file mode 100644
index 00000000000..e06c52ba609
Binary files /dev/null and b/fern/assets/img/frontpage-banner.png differ
diff --git a/fern/assets/img/frontpage-gpu-evolution.png b/fern/assets/img/frontpage-gpu-evolution.png
new file mode 100644
index 00000000000..8c483620c1a
Binary files /dev/null and b/fern/assets/img/frontpage-gpu-evolution.png differ
diff --git a/fern/assets/img/frontpage-gpu-vertical.png b/fern/assets/img/frontpage-gpu-vertical.png
new file mode 100644
index 00000000000..169beee9af6
Binary files /dev/null and b/fern/assets/img/frontpage-gpu-vertical.png differ
diff --git a/fern/assets/img/grafana-disagg-trace.png b/fern/assets/img/grafana-disagg-trace.png
new file mode 100644
index 00000000000..1e41bc4d4ec
Binary files /dev/null and b/fern/assets/img/grafana-disagg-trace.png differ
diff --git a/fern/assets/img/grafana-dynamo-composite.png b/fern/assets/img/grafana-dynamo-composite.png
new file mode 100644
index 00000000000..eba18e0b06d
Binary files /dev/null and b/fern/assets/img/grafana-dynamo-composite.png differ
diff --git a/fern/assets/img/grafana-k8s.png b/fern/assets/img/grafana-k8s.png
new file mode 100644
index 00000000000..2c9ea3018b4
Binary files /dev/null and b/fern/assets/img/grafana-k8s.png differ
diff --git a/fern/assets/img/h100-decode-performance.png b/fern/assets/img/h100-decode-performance.png
new file mode 100644
index 00000000000..dfc2c7a2e7a
Binary files /dev/null and b/fern/assets/img/h100-decode-performance.png differ
diff --git a/fern/assets/img/h100-prefill-performance.png b/fern/assets/img/h100-prefill-performance.png
new file mode 100644
index 00000000000..0d5b499403f
Binary files /dev/null and b/fern/assets/img/h100-prefill-performance.png differ
diff --git a/fern/assets/img/itl-interpolation.png b/fern/assets/img/itl-interpolation.png
new file mode 100644
index 00000000000..356c986555b
Binary files /dev/null and b/fern/assets/img/itl-interpolation.png differ
diff --git a/fern/assets/img/kv-cache-mgr-design.png b/fern/assets/img/kv-cache-mgr-design.png
new file mode 100644
index 00000000000..18cae8e0c4e
Binary files /dev/null and b/fern/assets/img/kv-cache-mgr-design.png differ
diff --git a/fern/assets/img/kv-cache-mgr.png b/fern/assets/img/kv-cache-mgr.png
new file mode 100644
index 00000000000..a8bec363586
Binary files /dev/null and b/fern/assets/img/kv-cache-mgr.png differ
diff --git a/fern/assets/img/kv-routing.png b/fern/assets/img/kv-routing.png
new file mode 100644
index 00000000000..a24d38de810
Binary files /dev/null and b/fern/assets/img/kv-routing.png differ
diff --git a/fern/assets/img/kvbm-agg-performance.png b/fern/assets/img/kvbm-agg-performance.png
new file mode 100644
index 00000000000..3d2863cc2d1
Binary files /dev/null and b/fern/assets/img/kvbm-agg-performance.png differ
diff --git a/fern/assets/img/kvbm-architecture.png b/fern/assets/img/kvbm-architecture.png
new file mode 100644
index 00000000000..90ae3ac1ef1
Binary files /dev/null and b/fern/assets/img/kvbm-architecture.png differ
diff --git a/fern/assets/img/kvbm-components.png b/fern/assets/img/kvbm-components.png
new file mode 100644
index 00000000000..afa9f2dd25a
Binary files /dev/null and b/fern/assets/img/kvbm-components.png differ
diff --git a/fern/assets/img/kvbm-data-flows.png b/fern/assets/img/kvbm-data-flows.png
new file mode 100644
index 00000000000..2358da2a2d7
Binary files /dev/null and b/fern/assets/img/kvbm-data-flows.png differ
diff --git a/fern/assets/img/kvbm-integrations.png b/fern/assets/img/kvbm-integrations.png
new file mode 100644
index 00000000000..ffb64f3874e
Binary files /dev/null and b/fern/assets/img/kvbm-integrations.png differ
diff --git a/fern/assets/img/kvbm-internal-arch.png b/fern/assets/img/kvbm-internal-arch.png
new file mode 100644
index 00000000000..3e5c9c76e34
Binary files /dev/null and b/fern/assets/img/kvbm-internal-arch.png differ
diff --git a/fern/assets/img/kvbm-metrics-grafana.png b/fern/assets/img/kvbm-metrics-grafana.png
new file mode 100644
index 00000000000..b68b707ab06
Binary files /dev/null and b/fern/assets/img/kvbm-metrics-grafana.png differ
diff --git a/fern/assets/img/kvbm-offload.png b/fern/assets/img/kvbm-offload.png
new file mode 100644
index 00000000000..0da6af2a1d9
Binary files /dev/null and b/fern/assets/img/kvbm-offload.png differ
diff --git a/fern/assets/img/kvbm-onboard-disk2device.png b/fern/assets/img/kvbm-onboard-disk2device.png
new file mode 100644
index 00000000000..2354e7e2bac
Binary files /dev/null and b/fern/assets/img/kvbm-onboard-disk2device.png differ
diff --git a/fern/assets/img/kvbm-onboard-host2device.png b/fern/assets/img/kvbm-onboard-host2device.png
new file mode 100644
index 00000000000..fe8ad9f575d
Binary files /dev/null and b/fern/assets/img/kvbm-onboard-host2device.png differ
diff --git a/fern/assets/img/nvidia-logo-dark.svg b/fern/assets/img/nvidia-logo-dark.svg
new file mode 100644
index 00000000000..6798f1596ae
--- /dev/null
+++ b/fern/assets/img/nvidia-logo-dark.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/fern/assets/img/nvidia-logo.svg b/fern/assets/img/nvidia-logo.svg
new file mode 100644
index 00000000000..08844b77b9e
--- /dev/null
+++ b/fern/assets/img/nvidia-logo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/fern/assets/img/pd-interpolation.png b/fern/assets/img/pd-interpolation.png
new file mode 100644
index 00000000000..68c3e49fee0
Binary files /dev/null and b/fern/assets/img/pd-interpolation.png differ
diff --git a/fern/assets/img/planner-perf.png b/fern/assets/img/planner-perf.png
new file mode 100644
index 00000000000..9a8e8cf6271
Binary files /dev/null and b/fern/assets/img/planner-perf.png differ
diff --git a/fern/assets/img/planner-tensorboard.png b/fern/assets/img/planner-tensorboard.png
new file mode 100644
index 00000000000..a42127a9e4f
Binary files /dev/null and b/fern/assets/img/planner-tensorboard.png differ
diff --git a/fern/assets/img/prefill-time.png b/fern/assets/img/prefill-time.png
new file mode 100644
index 00000000000..25630974de4
Binary files /dev/null and b/fern/assets/img/prefill-time.png differ
diff --git a/fern/assets/img/prometheus-k8s.png b/fern/assets/img/prometheus-k8s.png
new file mode 100644
index 00000000000..e754f3d5d1d
Binary files /dev/null and b/fern/assets/img/prometheus-k8s.png differ
diff --git a/fern/assets/img/trace.png b/fern/assets/img/trace.png
new file mode 100644
index 00000000000..7cc6eb09b19
Binary files /dev/null and b/fern/assets/img/trace.png differ
diff --git a/fern/docs.yml b/fern/docs.yml
new file mode 100644
index 00000000000..ba8c319e4e6
--- /dev/null
+++ b/fern/docs.yml
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+instances:
+ - url: ai-dynamo.docs.buildwithfern.com
+
+title: NVIDIA Dynamo Documentation
+
+# Version configuration
+versions:
+ - display-name: Next
+ path: ./versions/next.yml
+
+# GitHub repository link in navbar
+navbar-links:
+ - type: github
+ value: https://github.com/ai-dynamo/dynamo
+
+# NVIDIA branding colors
+colors:
+ accent-primary:
+ dark: "#76B900"
+ light: "#4A7300"
+ background:
+ dark: "#1A1A1A"
+ light: "#FFFFFF"
+
+# Logo and favicon
+logo:
+ href: /
+ light: ./assets/img/nvidia-logo.svg
+ dark: ./assets/img/nvidia-logo-dark.svg
+ height: 50
+
+favicon: ./assets/img/favicon.png
diff --git a/fern/fern.config.json b/fern/fern.config.json
new file mode 100644
index 00000000000..be7914486ab
--- /dev/null
+++ b/fern/fern.config.json
@@ -0,0 +1,4 @@
+{
+ "organization": "ai-dynamo",
+ "version": "3.29.1"
+}
diff --git a/fern/pages/agents/tool-calling.md b/fern/pages/agents/tool-calling.md
new file mode 100644
index 00000000000..142c88dec18
--- /dev/null
+++ b/fern/pages/agents/tool-calling.md
@@ -0,0 +1,189 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Tool Calling with Dynamo"
+---
+
+You can connect Dynamo to external tools and services using function calling (also known as tool calling). By providing a list of available functions, Dynamo can choose
+to output function arguments for the relevant function(s) which you can execute to augment the prompt with relevant external information.
+
+Tool calling (AKA function calling) is controlled using the `tool_choice` and `tools` request parameters.
+
+
+## Prerequisites
+
+To enable this feature, you should set the following flag while launching the backend worker
+
+- `--dyn-tool-call-parser` : select the parser from the available parsers list using the below command
+
+```bash
+# can be vllm, sglang, trtllm, etc. based on your installation
+python -m dynamo.&lt;backend&gt; --help
+```
+
+
+If no tool call parser is provided by the user, Dynamo will try to use default tool call parsing based on `&lt;tool_call&gt;` and `&lt;|python_tag|&gt;` tool tags.
+
+
+
+If your model's default chat template doesn't support tool calling, but the model itself does, you can specify a custom chat template per worker
+with `python -m dynamo.&lt;backend&gt; --custom-jinja-template &lt;path-to-template&gt;`.
+
+
+
+Parser to Model Mapping
+
+| Parser Name | Supported Models |
+|-------------|-----------------------------------------------------------------------|
+| hermes | Qwen/Qwen2.5-*, Qwen/QwQ-32B, NousResearch/Hermes-2-Pro-*, NousResearch/Hermes-2-Theta-*, NousResearch/Hermes-3-* |
+| mistral | mistralai/Mistral-7B-Instruct-v0.3, Additional mistral function-calling models are compatible as well.|
+| llama3_json | meta-llama/Llama-3.1-*, meta-llama/Llama-3.2-* |
+| harmony | openai/gpt-oss-* |
+| nemotron_deci | nvidia/nemotron-* |
+| phi4 | Phi-4-* |
+| deepseek_v3 | deepseek-ai/DeepSeek-V3, deepseek-ai/DeepSeek-R1, deepseek-ai/DeepSeek-R1-0528 |
+| deepseek_v3_1 | deepseek-ai/DeepSeek-V3.1 |
+| pythonic | meta-llama/Llama-4-* |
+| jamba | ai21labs/AI21-Jamba-*-1.5, ai21labs/AI21-Jamba-*-1.6, ai21labs/AI21-Jamba-*-1.7 |
+
+
+## Examples
+
+### Launch Dynamo Frontend and Backend
+
+```bash
+# launch backend worker
+python -m dynamo.vllm --model openai/gpt-oss-20b --dyn-tool-call-parser harmony
+
+# launch frontend worker
+python -m dynamo.frontend
+```
+
+### Tool Calling Request Examples
+
+- Example 1
+```python
+from openai import OpenAI
+import json
+
+client = OpenAI(base_url="http://localhost:8081/v1", api_key="dummy")
+
+def get_weather(location: str, unit: str):
+ return f"Getting the weather for {location} in {unit}..."
+tool_functions = {"get_weather": get_weather}
+
+tools = [{
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "Get the current weather in a given location",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+ "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+ },
+ "required": ["location", "unit"]
+ }
+ }
+}]
+
+response = client.chat.completions.create(
+ model="openai/gpt-oss-20b",
+ messages=[{"role": "user", "content": "What's the weather like in San Francisco in Celsius?"}],
+ tools=tools,
+ tool_choice="auto",
+ max_tokens=10000
+)
+print(f"{response}")
+tool_call = response.choices[0].message.tool_calls[0].function
+print(f"Function called: {tool_call.name}")
+print(f"Arguments: {tool_call.arguments}")
+print(f"Result: {tool_functions[tool_call.name](**json.loads(tool_call.arguments))}")
+```
+
+- Example 2
+```python
+
+# Use tools defined in example 1
+
+time_tool = {
+ "type": "function",
+ "function": {
+ "name": "get_current_time_nyc",
+ "description": "Get the current time in NYC.",
+ "parameters": {}
+ }
+}
+
+
+tools.append(time_tool)
+
+messages = [
+ {"role": "user", "content": "What's the current time in New York?"}
+]
+
+
+response = client.chat.completions.create(
+ model="openai/gpt-oss-20b", #client.models.list().data[1].id,
+ messages=messages,
+ tools=tools,
+ tool_choice="auto",
+ max_tokens=100,
+)
+print(f"{response}")
+tool_call = response.choices[0].message.tool_calls[0].function
+print(f"Function called: {tool_call.name}")
+print(f"Arguments: {tool_call.arguments}")
+```
+
+- Example 3
+
+
+```python
+
+tools = [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_tourist_attractions",
+ "description": "Get a list of top tourist attractions for a given city.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "city": {
+ "type": "string",
+ "description": "The name of the city to find attractions for.",
+ }
+ },
+ "required": ["city"],
+ },
+ },
+ },
+]
+
+def get_messages():
+ return [
+ {
+ "role": "user",
+ "content": (
+ "I'm planning a trip to Tokyo next week. what are some top tourist attractions in Tokyo? "
+ ),
+ },
+ ]
+
+
+messages = get_messages()
+
+response = client.chat.completions.create(
+ model="openai/gpt-oss-20b",
+ messages=messages,
+ tools=tools,
+ tool_choice="auto",
+ max_tokens=100,
+)
+print(f"{response}")
+tool_call = response.choices[0].message.tool_calls[0].function
+print(f"Function called: {tool_call.name}")
+print(f"Arguments: {tool_call.arguments}")
+```
diff --git a/fern/pages/api/nixl-connect/README.md b/fern/pages/api/nixl-connect/README.md
new file mode 100644
index 00000000000..15e77d8d4b3
--- /dev/null
+++ b/fern/pages/api/nixl-connect/README.md
@@ -0,0 +1,171 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Dynamo NIXL Connect"
+---
+
+Dynamo NIXL Connect specializes in moving data between models/workers in a Dynamo Graph, and for the use cases where registration and memory regions need to be dynamic.
+Dynamo connect provides utilities for such use cases, using the NIXL-based I/O subsystem via a set of Python classes.
+The relaxed registration comes with some performance overheads, but simplifies the integration process.
+Especially for larger data transfer operations, such as between models in a multi-model graph, the overhead would be marginal.
+The `dynamo.nixl_connect` library can be imported by any Dynamo container hosted application.
+
+
+Dynamo NIXL Connect will pick the best available method of data transfer available to it.
+The available methods depend on the hardware and software configuration of the machines and network running the graph.
+GPU Direct RDMA operations require that both ends of the operation have:
+- NIC and GPU capable of performing RDMA operations
+- Device drivers that support GPU-NIC direct interactions (aka "zero copy") and RDMA operations
+- Network that supports InfiniBand or RoCE
+With any of the above not satisfied, GPU Direct RDMA will not be available to the graph's workers, and less-optimal methods will be utilized to ensure basic functionality.
+For additional information, please read this [GPUDirect RDMA](https://docs.nvidia.com/cuda/pdf/GPUDirect_RDMA.pdf) document.
+
+
+```python
+import dynamo.nixl_connect
+```
+
+All operations using the NIXL Connect library begin with the [`Connector`](connector.md) class and the type of operation required.
+There are four types of supported operations:
+
+ 1. **Register local readable memory**:
+
+ Register local memory buffer(s) with the NIXL subsystem to enable a remote worker to read from.
+
+ 2. **Register local writable memory**:
+
+ Register local memory buffer(s) with the NIXL subsystem to enable a remote worker to write to.
+
+ 3. **Read from registered, remote memory**:
+
+ Read remote memory buffer(s), registered by a remote worker to be readable, into local memory buffer(s).
+
+ 4. **Write to registered, remote memory**:
+
+    Write local memory buffer(s) to remote memory buffer(s) registered by a remote worker to be writable.
+
+When available, by connecting correctly paired operations, high-throughput GPU Direct RDMA data transfers can be completed.
+Given the list above, the correct pairing of operations would be 1 & 3 or 2 & 4.
+Where one side is a "(read|write)-able operation" and the other is its correctly paired "(read|write) operation".
+Specifically, a read operation must be paired with a readable operation, and a write operation must be paired with a writable operation.
+
+```mermaid
+sequenceDiagram
+ participant LocalWorker
+ participant RemoteWorker
+ participant NIXL
+
+ LocalWorker ->> NIXL: Register memory (Descriptor)
+ RemoteWorker ->> NIXL: Register memory (Descriptor)
+ LocalWorker ->> LocalWorker: Create Readable/WritableOperation
+ LocalWorker ->> RemoteWorker: Send NIXL metadata (via HTTP/TCP+NATS)
+ RemoteWorker ->> NIXL: Begin Read/WriteOperation with metadata
+ NIXL -->> RemoteWorker: Data transfer
+ RemoteWorker -->> LocalWorker: Notify completion (unblock awaiter)
+```
+
+## Examples
+
+### Generic Example
+
+In the diagram below, Local creates a [`WritableOperation`](writable-operation.md) intended to receive data from Remote.
+Local then sends metadata about the requested operation to Remote.
+Remote then uses the metadata to create a [`WriteOperation`](write-operation.md) which will perform the GPU Direct RDMA memory transfer, when available, from Remote's GPU memory to Local's GPU memory.
+
+```mermaid
+---
+title: Write Operation Between Two Workers (RDMA available)
+---
+flowchart LR
+ c1[Remote] --"3: .begin_write()"--- WriteOperation
+ WriteOperation e1@=="4: GPU Direct RDMA"==> WritableOperation
+ WritableOperation --"1: .create_writable()"--- c2[Local]
+ c2 e2@--"2: RDMA Metadata via HTTP"--> c1
+ e1@{ animate: true; }
+ e2@{ animate: true; }
+```
+
+
+When RDMA isn't available, the NIXL data transfer will still complete using non-accelerated methods.
+
+
+### Multimodal Example
+
+In the case of the [Dynamo Multimodal Disaggregated Example](../../multimodal/vllm.md):
+
+ 1. The HTTP frontend accepts a text prompt and a URL to an image.
+
+ 2. The prompt and URL are then enqueued with the Processor before being dispatched to the first available Decode Worker.
+
+ 3. Decode Worker then requests a Prefill Worker to provide key-value data for the LLM powering the Decode Worker.
+
+ 4. Prefill Worker then requests that the image be processed and provided as embeddings by the Encode Worker.
+
+ 5. Encode Worker acquires the image, processes it, performs inference on the image using a specialized vision model, and finally provides the embeddings to Prefill Worker.
+
+ 6. Prefill Worker receives the embeddings from Encode Worker and generates a key-value cache (KV$) update for Decode Worker's LLM and writes the update directly to the GPU memory reserved for the data.
+
+ 7. Finally, Decode Worker performs the requested inference.
+
+```mermaid
+---
+title: Multimodal Disaggregated Workflow
+---
+flowchart LR
+ p0[HTTP Frontend] i0@--"text prompt"-->p1[Processor]
+ p0 i1@--"url"-->p1
+ p1 i2@--"prompt"-->dw[Decode Worker]
+ p1 i3@--"url"-->dw
+ dw i4@--"prompt"-->pw[Prefill Worker]
+ dw i5@--"url"-->pw
+ pw i6@--"url"-->ew[Encode Worker]
+ ew o0@=="image embeddings"==>pw
+ pw o1@=="kv_cache updates"==>dw
+ dw o2@--"inference results"-->p0
+
+ i0@{ animate: true; }
+ i1@{ animate: true; }
+ i2@{ animate: true; }
+ i3@{ animate: true; }
+ i4@{ animate: true; }
+ i5@{ animate: true; }
+ i6@{ animate: true; }
+ o0@{ animate: true; }
+ o1@{ animate: true; }
+ o2@{ animate: true; }
+```
+
+
+In this example, it is the data transfer between the Prefill Worker and the Encode Worker that utilizes the Dynamo NIXL Connect library.
+The KV Cache transfer between Decode Worker and Prefill Worker utilizes a different connector that also uses the NIXL-based I/O subsystem underneath.
+
+
+#### Code Examples
+
+See [MultimodalPDWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) or [MultimodalDecodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/worker_handler.py) from our Multimodal example,
+for how they coordinate directly with the Encode Worker by creating a [`WritableOperation`](writable-operation.md),
+sending the operation's metadata via Dynamo's round-robin dispatcher, and awaiting the operation for completion before making use of the transferred data.
+
+See [MultimodalEncodeWorkerHandler](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/vllm/multimodal_handlers/encode_worker_handler.py) from our Multimodal example,
+for how the resulting embeddings are registered with the NIXL subsystem by creating a [`Descriptor`](descriptor.md),
+a [`WriteOperation`](write-operation.md) is created using the metadata provided by the requesting worker,
+and the worker awaits completion of the data transfer before yielding a response.
+
+
+## Python Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
+
+
+## References
+
+ - [NVIDIA Dynamo](https://developer.nvidia.com/dynamo) @ [GitHub](https://github.com/ai-dynamo/dynamo)
+ - [NVIDIA Inference Transfer Library (NIXL)](https://developer.nvidia.com/blog/introducing-nvidia-dynamo-a-low-latency-distributed-inference-framework-for-scaling-reasoning-ai-models/#nvidia_inference_transfer_library_nixl_low-latency_hardware-agnostic_communication%C2%A0) @ [GitHub](https://github.com/ai-dynamo/nixl)
+ - [Dynamo Multimodal Example](https://github.com/ai-dynamo/dynamo/tree/main/examples/multimodal)
+ - [NVIDIA GPU Direct](https://developer.nvidia.com/gpudirect)
diff --git a/fern/pages/api/nixl-connect/connector.md b/fern/pages/api/nixl-connect/connector.md
new file mode 100644
index 00000000000..28714eb2a20
--- /dev/null
+++ b/fern/pages/api/nixl-connect/connector.md
@@ -0,0 +1,179 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.Connector"
+---
+
+Core class for managing the connection between workers in a distributed environment.
+Use this class to create readable and writable operations, or read and write data to remote workers.
+
+This class provides a "pythonic" interface that uses the NIXL library to perform GPU Direct RDMA-accelerated data transfers, when available, between models hosted by different workers in a Dynamo graph.
+The connector provides two methods of moving data between workers:
+
+ - Preparing local memory to be written to by a remote worker.
+
+ - Preparing local memory to be read by a remote worker.
+
+In both cases, local memory is registered with the NIXL-based I/O subsystem via the [`Descriptor`](descriptor.md) class and provided to the connector.
+When RDMA is available, the connector then configures the RDMA subsystem to expose the memory for the requested operation and returns an operation control object;
+otherwise the connector will select the best available RDMA alternative.
+The operation control object, either a [`ReadableOperation`](readable-operation.md) or a [`WritableOperation`](writable-operation.md),
+provides NIXL metadata ([RdmaMetadata](rdma-metadata.md)) via its `.metadata()` method, functionality to query the operation's current state, as well as the ability to cancel the operation prior to its completion.
+
+The NIXL metadata must be provided to the remote worker expected to complete the operation.
+The metadata contains required information (identifiers, keys, etc.) which enables the remote worker to interact with the provided memory.
+
+
+NIXL metadata contains a worker's address as well as security keys to access specific registered memory descriptors.
+This data provides direct memory access between workers, and should be considered sensitive and therefore handled accordingly.
+
+
+
+## Example Usage
+
+```python
+ @async_on_start
+ async def async_init(self):
+ self.connector = dynamo.nixl_connect.Connector()
+```
+
+
+See [`ReadOperation`](read-operation.md#example-usage), [`ReadableOperation`](readable-operation.md#example-usage),
+[`WritableOperation`](writable-operation.md#example-usage), and [`WriteOperation`](write-operation.md#example-usage)
+for additional examples.
+
+
+
+## Methods
+
+### `begin_read`
+
+```python
+async def begin_read(
+ self,
+ remote_metadata: RdmaMetadata,
+ local_descriptors: Descriptor | list[Descriptor],
+) -> ReadOperation:
+```
+
+Creates a [`ReadOperation`](read-operation.md) for transferring data from a remote worker.
+
+To create the operation, the serialized request from a remote worker's [`ReadableOperation`](readable-operation.md)
+along with a matching set of local memory descriptors which reference memory intended to receive data from the remote worker
+must be provided.
+The serialized request must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Once created, data transfer will begin immediately.
+
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+
+Use [`.wait_for_completion()`](read-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error.
+
+### `begin_write`
+
+```python
+async def begin_write(
+ self,
+ local_descriptors: Descriptor | list[Descriptor],
+ remote_metadata: RdmaMetadata,
+) -> WriteOperation:
+```
+
+Creates a [`WriteOperation`](write-operation.md) for transferring data to a remote worker.
+
+To create the operation, the serialized request from a remote worker's [`WritableOperation`](writable-operation.md)
+along with a matching set of local memory descriptors which reference memory to be transferred to the remote worker
+must be provided.
+The serialized request must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Once created, data transfer will begin immediately.
+
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+
+Use [`.wait_for_completion()`](write-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error.
+
+### `create_readable`
+
+```python
+async def create_readable(
+ self,
+ local_descriptors: Descriptor | list[Descriptor],
+) -> ReadableOperation:
+```
+
+Creates a [`ReadableOperation`](readable-operation.md) for transferring data to a remote worker.
+
+To create the operation, a set of local memory descriptors must be provided that reference memory intended to be transferred to a remote worker.
+Once created, the memory referenced by the provided descriptors becomes immediately readable by a remote worker with the necessary metadata.
+The metadata required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method.
+Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+
+Use [`.wait_for_completion()`](readable-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error.
+
+### `create_writable`
+
+```python
+async def create_writable(
+ self,
+ local_descriptors: Descriptor | list[Descriptor],
+) -> WritableOperation:
+```
+
+Creates a [`WritableOperation`](writable-operation.md) for transferring data from a remote worker.
+
+To create the operation, a set of local memory descriptors must be provided which reference memory intended to receive data from a remote worker.
+Once created, the memory referenced by the provided descriptors becomes immediately writable by a remote worker with the necessary metadata.
+The metadata required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method.
+Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+
+Use [`.wait_for_completion()`](writable-operation.md#wait_for_completion) to block the caller until the operation has completed or encountered an error.
+
+
+## Properties
+
+### `hostname`
+
+```python
+@property
+def hostname(self) -> str:
+```
+
+Gets the name of the current worker's host.
+
+### `is_cuda_available`
+
+```python
+@cached_property
+def is_cuda_available(self) -> bool:
+```
+
+Gets `True` when CUDA is available for the selected array module (most likely CuPy); otherwise `False`.
+
+### `name`
+
+```python
+@property
+def name(self) -> str | None:
+```
+
+Gets the Dynamo component name used by the connector.
+
+
+## Related Classes
+
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/descriptor.md b/fern/pages/api/nixl-connect/descriptor.md
new file mode 100644
index 00000000000..adcd7a08a38
--- /dev/null
+++ b/fern/pages/api/nixl-connect/descriptor.md
@@ -0,0 +1,68 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.Descriptor"
+---
+
+Memory descriptor that ensures memory is registered with the NIXL-based I/O subsystem.
+Memory must be registered with the NIXL subsystem to enable interaction with the memory.
+
+Descriptor objects are administrative and do not copy, move, or otherwise modify the registered memory.
+
+There are four ways to create a descriptor:
+
+ 1. From a `torch.Tensor` object. Device information will be derived from the provided object.
+
+ 2. From a `tuple` containing either a NumPy or CuPy `ndarray` and information describing where the memory resides (Host/CPU vs GPU).
+
+ 3. From a Python `bytes` object. Memory is assumed to reside in CPU addressable host memory.
+
+ 4. From a `tuple` comprised of the address of the memory, its size in bytes, and device information.
+ An optional reference to a Python object can be provided to avoid garbage collection issues.
+
+
+## Methods
+
+### `register_memory`
+
+```python
+def register_memory(self, connector: Connector) -> None:
+```
+
+Instructs the descriptor to register its memory buffer with the NIXL-based I/O subsystem.
+
+Calling this method more than once on the same descriptor has no effect.
+
+When the descriptor is assigned to a NIXL operation, it will be automatically registered if it was not explicitly registered.
+
+
+## Properties
+
+### `device`
+
+```python
+@property
+def device(self) -> Device:
+```
+
+Gets a reference to the [`Device`](device.md) that contains the buffer the descriptor represents.
+
+### `size`
+
+```python
+@property
+def size(self) -> int:
+```
+
+Gets the size of the memory allocation the descriptor represents.
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/device-kind.md b/fern/pages/api/nixl-connect/device-kind.md
new file mode 100644
index 00000000000..c76eb4e77c7
--- /dev/null
+++ b/fern/pages/api/nixl-connect/device-kind.md
@@ -0,0 +1,30 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.DeviceKind(IntEnum)"
+---
+
+Represents the kind of device a [`Device`](device.md) object represents.
+
+
+## Values
+
+### `CUDA`
+
+CUDA addressable device (GPU) memory.
+
+### `HOST`
+
+System (CPU) memory.
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadOperation](read-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/device.md b/fern/pages/api/nixl-connect/device.md
new file mode 100644
index 00000000000..a1754ac6323
--- /dev/null
+++ b/fern/pages/api/nixl-connect/device.md
@@ -0,0 +1,50 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.Device"
+---
+
+`Device` class describes the device a given allocation resides in.
+Usually host (`"cpu"`) or GPU (`"cuda"`) memory.
+
+When a system contains multiple GPU devices, specific GPU devices can be identified by including their ordinal index number.
+For example, to reference the second GPU in a system `"cuda:1"` can be used.
+
+By default, when `"cuda"` is provided, it is assumed to be `"cuda:0"` or the first GPU enumerated by the system.
+
+
+## Properties
+
+### `id`
+
+```python
+@property
+def id(self) -> int:
+```
+
+Gets the identity, or ordinal, of the device.
+
+When the device is the [`HOST`](device-kind.md#host), this value is always `0`.
+
+When the device is a [`GPU`](device-kind.md#cuda), this value identifies a specific GPU.
+
+### `kind`
+
+```python
+@property
+def kind(self) -> DeviceKind:
+```
+
+Gets the [`DeviceKind`](device-kind.md) of device the instance references.
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [OperationStatus](operation-status.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/operation-status.md b/fern/pages/api/nixl-connect/operation-status.md
new file mode 100644
index 00000000000..a966fb2323d
--- /dev/null
+++ b/fern/pages/api/nixl-connect/operation-status.md
@@ -0,0 +1,46 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.OperationStatus(IntEnum)"
+---
+
+Represents the current state or status of an operation.
+
+
+## Values
+
+### `CANCELLED`
+
+The operation has been cancelled by the user or system.
+
+### `COMPLETE`
+
+The operation has been completed successfully.
+
+### `ERRORED`
+
+The operation has encountered an error and cannot be completed.
+
+### `IN_PROGRESS`
+
+The operation has been initialized and is in-progress (not completed, errored, or cancelled).
+
+### `INITIALIZED`
+
+The operation has been initialized and is ready to be processed.
+
+### `UNINITIALIZED`
+
+The operation has not been initialized yet and is not in a valid state.
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/rdma-metadata.md b/fern/pages/api/nixl-connect/rdma-metadata.md
new file mode 100644
index 00000000000..e909dc2b070
--- /dev/null
+++ b/fern/pages/api/nixl-connect/rdma-metadata.md
@@ -0,0 +1,35 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.RdmaMetadata"
+---
+
+A Pydantic type intended to provide JSON serialized NIXL metadata about a [`ReadableOperation`](readable-operation.md) or [`WritableOperation`](writable-operation.md) object.
+NIXL metadata contains detailed information about a worker process and how to access memory regions registered with the corresponding agent.
+This data is required to perform data transfers using the NIXL-based I/O subsystem.
+
+
+NIXL metadata contains information to connect corresponding backends across agents, as well as identification keys to access specific registered memory regions.
+This data provides direct memory access between workers, and should be considered sensitive and therefore handled accordingly.
+
+
+Use the respective class's `.metadata()` method to generate an `RdmaMetadata` object for an operation.
+
+
+Classes using `RdmaMetadata` objects must be paired correctly.
+[`ReadableOperation`](readable-operation.md) with [`ReadOperation`](read-operation.md), and
+[`WritableOperation`](writable-operation.md) with [`WriteOperation`](write-operation.md).
+Incorrect pairing will result in an error being raised.
+
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/read-operation.md b/fern/pages/api/nixl-connect/read-operation.md
new file mode 100644
index 00000000000..8dcce65cede
--- /dev/null
+++ b/fern/pages/api/nixl-connect/read-operation.md
@@ -0,0 +1,75 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.ReadOperation"
+---
+
+An operation which transfers data from a remote worker to the local worker.
+
+To create the operation, NIXL metadata ([RdmaMetadata](rdma-metadata.md)) from a remote worker's [`ReadableOperation`](readable-operation.md)
+along with a matching set of local [`Descriptor`](descriptor.md) objects which reference memory intended to receive data from the remote worker must be provided.
+The NIXL metadata must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Once created, data transfer will begin immediately.
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+
+
+## Example Usage
+
+```python
+ async def read_from_remote(
+ self,
+ remote_metadata: dynamo.nixl_connect.RdmaMetadata,
+ local_tensor: torch.Tensor
+ ) -> None:
+ descriptor = dynamo.nixl_connect.Descriptor(local_tensor)
+
+ with await self.connector.begin_read(remote_metadata, descriptor) as read_op:
+ # Wait for the operation to complete writing data from the remote worker to local_tensor.
+ await read_op.wait_for_completion()
+```
+
+
+## Methods
+
+### `cancel`
+
+```python
+def cancel(self) -> None:
+```
+
+Instructs the NIXL subsystem to cancel the operation.
+Completed operations cannot be cancelled.
+
+### `wait_for_completion`
+
+```python
+async def wait_for_completion(self) -> None:
+```
+
+Blocks the caller until the memory from the remote worker has been transferred to the provided buffers.
+
+
+## Properties
+
+### `status`
+
+```python
+@property
+def status(self) -> OperationStatus:
+```
+
+Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation.
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/readable-operation.md b/fern/pages/api/nixl-connect/readable-operation.md
new file mode 100644
index 00000000000..30c2d691dd2
--- /dev/null
+++ b/fern/pages/api/nixl-connect/readable-operation.md
@@ -0,0 +1,79 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.ReadableOperation"
+---
+
+An operation which enables a remote worker to read data from the local worker.
+
+To create the operation, a set of local [`Descriptor`](descriptor.md) objects must be provided that reference memory intended to be transferred to a remote worker.
+Once created, the memory referenced by the provided descriptors becomes immediately readable by a remote worker with the necessary metadata.
+The NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method.
+Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+
+
+## Example Usage
+
+```python
+ async def send_data(
+ self,
+ local_tensor: torch.Tensor
+ ) -> None:
+ descriptor = dynamo.nixl_connect.Descriptor(local_tensor)
+
+ with await self.connector.create_readable(descriptor) as read_op:
+ op_metadata = read_op.metadata()
+
+ # Send the metadata to the remote worker via sideband communication.
+ await self.notify_remote_data(op_metadata)
+ # Wait for the remote worker to complete its read operation of local_tensor.
+ # AKA send data to remote worker.
+ await read_op.wait_for_completion()
+```
+
+
+## Methods
+
+### `metadata`
+
+```python
+def metadata(self) -> RdmaMetadata:
+```
+
+Generates and returns the NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required for a remote worker to read from the operation.
+Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+### `wait_for_completion`
+
+```python
+async def wait_for_completion(self) -> None:
+```
+
+Blocks the caller until the operation has received a completion signal from a remote worker.
+
+
+## Properties
+
+### `status`
+
+```python
+@property
+def status(self) -> OperationStatus:
+```
+
+Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation.
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadOperation](read-operation.md)
+ - [WritableOperation](writable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/writable-operation.md b/fern/pages/api/nixl-connect/writable-operation.md
new file mode 100644
index 00000000000..765869a775b
--- /dev/null
+++ b/fern/pages/api/nixl-connect/writable-operation.md
@@ -0,0 +1,80 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.WritableOperation"
+---
+
+An operation which enables a remote worker to write data to the local worker.
+
+To create the operation, a set of local [`Descriptor`](descriptor.md) objects must be provided which reference memory intended to receive data from a remote worker.
+Once created, the memory referenced by the provided descriptors becomes immediately writable by a remote worker with the necessary metadata.
+The NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required to access the memory referenced by the provided descriptors is accessible via the operation's `.metadata()` method.
+Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+Cancellation is handled asynchronously.
+
+
+## Example Usage
+
+```python
+ async def recv_data(
+ self,
+ local_tensor: torch.Tensor
+ ) -> None:
+ descriptor = dynamo.nixl_connect.Descriptor(local_tensor)
+
+ with await self.connector.create_writable(descriptor) as write_op:
+ op_metadata = write_op.metadata()
+
+ # Send the metadata to the remote worker via sideband communication.
+ await self.request_remote_data(op_metadata)
+        # Wait for the remote worker to complete its write operation to local_tensor.
+ # AKA receive data from remote worker.
+ await write_op.wait_for_completion()
+```
+
+
+## Methods
+
+### `metadata`
+
+```python
+def metadata(self) -> RdmaMetadata:
+```
+
+Generates and returns the NIXL metadata ([RdmaMetadata](rdma-metadata.md)) required for a remote worker to write to the operation.
+Once acquired, the metadata needs to be provided to a remote worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+### `wait_for_completion`
+
+```python
+async def wait_for_completion(self) -> None:
+```
+
+Blocks the caller until the operation has received a completion signal from a remote worker.
+
+
+## Properties
+
+### `status`
+
+```python
+@property
+def status(self) -> OperationStatus:
+```
+
+Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation.
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WriteOperation](write-operation.md)
diff --git a/fern/pages/api/nixl-connect/write-operation.md b/fern/pages/api/nixl-connect/write-operation.md
new file mode 100644
index 00000000000..ba6a4b4ae2c
--- /dev/null
+++ b/fern/pages/api/nixl-connect/write-operation.md
@@ -0,0 +1,76 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "dynamo.nixl_connect.WriteOperation"
+---
+
+An operation which transfers data from the local worker to a remote worker.
+
+To create the operation, NIXL metadata ([RdmaMetadata](rdma-metadata.md)) from a remote worker's [`WritableOperation`](writable-operation.md)
+along with a matching set of local [`Descriptor`](descriptor.md) objects which reference memory to be transferred to the remote worker must be provided.
+The NIXL metadata must be transferred from the remote to the local worker via a secondary channel, most likely HTTP or TCP+NATS.
+
+Once created, data transfer will begin immediately.
+Disposal of the object will instruct the NIXL subsystem to cancel the operation,
+therefore the operation should be awaited until completed unless cancellation is intended.
+Cancellation is handled asynchronously.
+
+
+## Example Usage
+
+```python
+ async def write_to_remote(
+ self,
+ remote_metadata: dynamo.nixl_connect.RdmaMetadata,
+ local_tensor: torch.Tensor
+ ) -> None:
+ descriptor = dynamo.nixl_connect.Descriptor(local_tensor)
+
+ with await self.connector.begin_write(descriptor, remote_metadata) as write_op:
+ # Wait for the operation to complete writing local_tensor to the remote worker.
+ await write_op.wait_for_completion()
+```
+
+
+## Methods
+
+### `cancel`
+
+```python
+def cancel(self) -> None:
+```
+
+Instructs the NIXL subsystem to cancel the operation.
+Completed operations cannot be cancelled.
+
+### `wait_for_completion`
+
+```python
+async def wait_for_completion(self) -> None:
+```
+
+Blocks the caller until all provided buffers have been transferred to the remote worker.
+
+
+## Properties
+
+### `status`
+
+```python
+@property
+def status(self) -> OperationStatus:
+```
+
+Returns [`OperationStatus`](operation-status.md) which provides the current state (aka. status) of the operation.
+
+
+## Related Classes
+
+ - [Connector](connector.md)
+ - [Descriptor](descriptor.md)
+ - [Device](device.md)
+ - [OperationStatus](operation-status.md)
+ - [RdmaMetadata](rdma-metadata.md)
+ - [ReadOperation](read-operation.md)
+ - [ReadableOperation](readable-operation.md)
+ - [WritableOperation](writable-operation.md)
diff --git a/fern/pages/backends/sglang/README.md b/fern/pages/backends/sglang/README.md
new file mode 100644
index 00000000000..273bccf3f46
--- /dev/null
+++ b/fern/pages/backends/sglang/README.md
@@ -0,0 +1,273 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Running SGLang with Dynamo"
+---
+
+## Use the Latest Release
+
+We recommend using the latest stable release of dynamo to avoid breaking changes:
+
+[Latest Release](https://github.com/ai-dynamo/dynamo/releases/latest)
+
+You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with:
+
+```bash
+git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+```
+
+---
+
+## Table of Contents
+- [Feature Support Matrix](#feature-support-matrix)
+- [Dynamo SGLang Integration](#dynamo-sglang-integration)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Aggregated Serving](#aggregated-serving)
+- [Disaggregated Serving](#disaggregated-serving)
+- [Deploy on SLURM or Kubernetes](#deployment)
+
+## Feature Support Matrix
+
+### Core Dynamo Features
+
+| Feature | SGLang | Notes |
+|---------|--------|-------|
+| [**Disaggregated Serving**](../../design-docs/disagg-serving.md) | ✅ | |
+| [**Conditional Disaggregation**](../../design-docs/disagg-serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) |
+| [**KV-Aware Routing**](../../router/kv-cache-routing.md) | ✅ | |
+| [**SLA-Based Planner**](../../planner/sla-planner.md) | ✅ | |
+| [**Multimodal Support**](../../multimodal/sglang.md) | ✅ | |
+| [**KVBM**](../../kvbm/kvbm-architecture.md) | ❌ | Planned |
+
+
+## Dynamo SGLang Integration
+
+Dynamo SGLang integrates SGLang engines into Dynamo's distributed runtime, enabling advanced features like disaggregated serving, KV-aware routing, and request migration while maintaining full compatibility with SGLang's engine arguments.
+
+### Argument Handling
+
+Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine arguments work identically**. You can pass any SGLang argument (like `--model-path`, `--tp`, `--trust-remote-code`) directly to `dynamo.sglang`.
+
+#### Dynamo-Specific Arguments
+
+| Argument | Description | Default | SGLang Equivalent |
+|----------|-------------|---------|-------------------|
+| `--endpoint` | Dynamo endpoint in `dyn://namespace.component.endpoint` format | Auto-generated based on mode | N/A |
+| `--migration-limit` | Max times a request can migrate between workers for fault tolerance. See [Request Migration Architecture](../../fault-tolerance/request-migration.md). | `0` (disabled) | N/A |
+| `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` |
+| `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` |
+| `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A |
+| `--custom-jinja-template` | Use custom chat template for that model (takes precedence over default chat template in model repo) | `None` | `--chat-template` |
+
+#### Tokenizer Behavior
+
+- **Default (`--use-sglang-tokenizer` not set)**: Dynamo handles tokenization/detokenization via our blazing fast frontend and passes `input_ids` to SGLang
+- **With `--use-sglang-tokenizer`**: SGLang handles tokenization/detokenization, Dynamo passes raw prompts
+
+
+When using `--use-sglang-tokenizer`, only `v1/chat/completions` is available through Dynamo's frontend.
+
+
+### Request Cancellation
+
+When a user cancels a request (e.g., by disconnecting from the frontend), the request is automatically cancelled across all workers, freeing compute resources for other requests.
+
+#### Cancellation Support Matrix
+
+| | Prefill | Decode |
+|-|---------|--------|
+| **Aggregated** | ✅ | ✅ |
+| **Disaggregated** | ⚠️ | ✅ |
+
+
+⚠️ SGLang backend currently does not support cancellation during remote prefill phase in disaggregated mode.
+
+
+For more details, see the [Request Cancellation Architecture](../../fault-tolerance/request-cancellation.md) documentation.
+
+## Installation
+
+### Install latest release
+We suggest using uv to install the latest release of ai-dynamo[sglang]. You can install it with `curl -LsSf https://astral.sh/uv/install.sh | sh`
+
+
+Expand for instructions
+
+```bash
+# create a virtual env
+uv venv --python 3.12 --seed
+# install the latest release (which comes bundled with a stable sglang version)
+uv pip install "ai-dynamo[sglang]"
+```
+
+
+
+### Install editable version for development
+
+
+Expand for instructions
+
+This requires having rust installed. We also recommend having a proper installation of the cuda toolkit as sglang requires `nvcc` to be available.
+
+```bash
+# create a virtual env
+uv venv --python 3.12 --seed
+# build dynamo runtime bindings
+uv pip install maturin
+cd $DYNAMO_HOME/lib/bindings/python
+maturin develop --uv
+cd $DYNAMO_HOME
+# installs sglang supported version along with dynamo
+# include the prerelease flag to install flashinfer rc versions
+uv pip install -e .
+# install any sglang version >= 0.5.3.post2
+uv pip install "sglang[all]==0.5.3.post2"
+```
+
+
+
+### Using docker containers
+
+
+Expand for instructions
+
+We are in the process of shipping pre-built docker containers that contain installations of DeepEP, DeepGEMM, and NVSHMEM in order to support WideEP and P/D. For now, you can quickly build the container from source with the following command.
+
+```bash
+cd $DYNAMO_ROOT
+./container/build.sh \
+ --framework SGLANG \
+ --tag dynamo-sglang:latest \
+```
+
+And then run it using
+
+```bash
+docker run \
+ --gpus all \
+ -it \
+ --rm \
+ --network host \
+ --shm-size=10G \
+ --ulimit memlock=-1 \
+ --ulimit stack=67108864 \
+ --ulimit nofile=65536:65536 \
+ --cap-add CAP_SYS_PTRACE \
+ --ipc host \
+ dynamo-sglang:latest
+```
+
+
+
+## Quick Start
+
+Below we provide a guide that lets you run all of our common deployment patterns on a single node.
+
+### Start NATS and ETCD in the background
+
+Start using [Docker Compose](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml)
+
+```bash
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+
+Each example corresponds to a simple bash script that runs the OpenAI compatible server, processor, and optional router (written in Rust) and LLM engine (written in Python) in a single terminal. You can easily take each command and run them in separate terminals.
+Additionally - because we use sglang's argument parser, you can pass in any argument that sglang supports to the worker!
+
+
+
+### Aggregated Serving
+
+```bash
+cd $DYNAMO_HOME/examples/backends/sglang
+./launch/agg.sh
+```
+
+### Aggregated Serving with KV Routing
+
+```bash
+cd $DYNAMO_HOME/examples/backends/sglang
+./launch/agg_router.sh
+```
+
+### Aggregated Serving for Embedding Models
+
+Here's an example that uses the [Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B) model.
+
+```bash
+cd $DYNAMO_HOME/examples/backends/sglang
+./launch/agg_embed.sh
+```
+
+
+Send the following request to verify your deployment:
+
+```bash
+curl localhost:8000/v1/embeddings \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "Qwen/Qwen3-Embedding-4B",
+ "input": "Hello, world!"
+ }'
+```
+
+
+
+### Disaggregated serving
+
+See [SGLang Disaggregation](sglang-disaggregation.md) to learn more about how sglang and dynamo handle disaggregated serving.
+
+
+```bash
+cd $DYNAMO_HOME/examples/backends/sglang
+./launch/disagg.sh
+```
+
+### Disaggregated Serving with KV Aware Prefill Routing
+
+```bash
+cd $DYNAMO_HOME/examples/backends/sglang
+./launch/disagg_router.sh
+```
+
+### Disaggregated Serving with Mixture-of-Experts (MoE) models and DP attention
+
+You can use this configuration to test out disaggregated serving with dp attention and expert parallelism on a single node before scaling to the full DeepSeek-R1 model across multiple nodes.
+
+```bash
+# note this will require 4 GPUs
+cd $DYNAMO_HOME/examples/backends/sglang
+./launch/disagg_dp_attn.sh
+```
+
+### Testing the Deployment
+
+Send a test request to verify your deployment:
+
+```bash
+curl localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "Qwen/Qwen3-0.6B",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
+ }
+ ],
+ "stream": true,
+ "max_tokens": 30
+ }'
+```
+
+## Deployment
+
+We currently provide deployment examples for Kubernetes and SLURM.
+
+## Kubernetes
+- **[Deploying Dynamo with SGLang on Kubernetes](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/deploy/README.md)**
+
+## SLURM
+- **[Deploying Dynamo with SGLang on SLURM](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/slurm_jobs/README.md)**
diff --git a/fern/pages/backends/sglang/expert-distribution-eplb.md b/fern/pages/backends/sglang/expert-distribution-eplb.md
new file mode 100644
index 00000000000..507fd3edd2a
--- /dev/null
+++ b/fern/pages/backends/sglang/expert-distribution-eplb.md
@@ -0,0 +1,60 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Expert Parallelism Load Balancer (EPLB) in SGLang"
+---
+
+Mixture-of-Experts (MoE) models utilize a technique called Expert Parallelism (EP), where experts are distributed across multiple GPUs. While this allows for much larger and more powerful models, it can lead to an uneven workload distribution. Because the load on different experts may vary depending on the workload, some GPUs can become bottlenecks, forcing the entire system to wait. This imbalance leads to wasted compute cycles and increased memory usage.
+
+To address this, SGLang implements an Expert Parallelism Load Balancer (EPLB) inspired by the work in the DeepSeek-V3 paper. EPLB analyzes expert usage patterns and dynamically re-arranges the experts across the available GPUs to ensure a more balanced workload.
+
+## The EPLB Algorithm: Core Concepts
+
+The load balancing algorithm revolves around a few key ideas to achieve an optimal distribution of work.
+
+### Redundant Experts for Flexibility
+
+The core strategy is to create **redundant experts**. Instead of being limited to the model's original number of experts, EPLB can create duplicates of heavily-loaded experts. For example, if a model has 256 experts, you can configure EPLB to create an additional 32 "redundant" experts, bringing the total to 288. This pool of replicated experts is then strategically packed onto the available GPUs. A popular expert might be duplicated multiple times, while a moderately used expert might be grouped with several rarely used ones on a single GPU.
+
+### Group-Limited Routing for Efficiency
+
+Modern MoE models like DeepSeek-V3 use **group-limited expert routing**. In this design, experts are organized into groups, and routing decisions are constrained within these groups. EPLB can take advantage of this structure to reduce inter-node data traffic by attempting to place all experts from the same group onto the same node whenever possible.
+
+### Load Balancing Policies
+
+The algorithm comes with two policies for different scenarios:
+
+1. **Hierarchical Load Balancing**: This policy is used when the number of server nodes evenly divides the number of expert groups. It first harnesses the group-limited routing by packing expert groups onto nodes to balance the load between nodes. Then, within each node, it replicates and packs the experts onto individual GPUs to balance the load locally. This is often used during prefill where the expert-parallel size might be smaller.
+
+2. **Global Load Balancing**: In all other cases, a global policy is used. It replicates experts globally without regard to their group affiliation and packs them onto individual GPUs. This policy is more general and can be adopted during the decoding stage with a larger expert-parallel size.
+
+## How SGLang Implements EPLB
+
+SGLang provides a robust implementation of EPLB, allowing for dynamic, online rebalancing of expert locations based on real-world traffic.
+
+### Dynamic Rebalancing
+
+You can enable dynamic rebalancing by setting the `--enable-eplb` flag. When enabled, the `EPLBManager` runs in the background. It periodically triggers a rebalance after a certain number of requests, configured with `--eplb-rebalance-num-iterations`. At each rebalance, it computes a new expert placement plan based on the latest usage statistics and updates the model's expert locations on the fly.
+
+### Expert Usage Recording
+
+To make intelligent balancing decisions, SGLang needs to collect data on expert usage. The `ExpertDistributionRecorder` is responsible for this, and its behavior is controlled by the `--expert-distribution-recorder-mode` flag. This flag determines the granularity of the collected data. When `enable_eplb` is on, this mode defaults to `stat` to gather statistics for rebalancing. The available modes are:
+
+- **`per_token`**: This is the most detailed mode. It records the specific expert choices for every single token processed by the model. While it provides the richest data, it also has the highest performance overhead. The raw, unaggregated data for each forward pass is stored.
+
+- **`per_pass`**: In this mode, SGLang records the aggregated expert usage counts for each individual forward pass. The data is not aggregated across different passes, giving you a snapshot of expert popularity for each batch of requests.
+
+- **`stat`**: This mode also records the exact expert usage counts for each forward pass, but it then aggregates these counts across multiple passes (the number of passes is determined by `--expert-distribution-recorder-buffer-size`). This provides a moving average of expert usage statistics and is the default when EPLB is enabled.
+
+- **`stat_approx`**: This mode is similar to `stat` but gathers _approximate_ statistics, usually from the DeepEP dispatcher. This method has lower overhead than `stat` but is less precise, especially for small batch sizes. It is a good choice when performance is critical.
+
+The collected statistics are then fed into the rebalancing algorithm to generate a new expert placement plan.
+
+### Initializing with a Pre-computed Distribution
+
+While SGLang can start with a simple default layout and learn a better one over time, you can also provide it with a pre-computed expert distribution to start with. The `--init-expert-location` flag allows you to specify a file path (`.pt` or `.json`) or a JSON string containing an expert layout. This is useful if you have already analyzed a representative workload offline and want the server to start immediately with a balanced configuration. If this flag is not set, it defaults to a `trivial` sequential layout.
+
+### References and further reading
+
+- [SGLang Large Scale P/D + WideEP Deployment](https://lmsys.org/blog/2025-05-05-large-scale-ep/#expert-parallelism-load-balancer)
+- [Deepseek's EPLB repository](https://github.com/deepseek-ai/EPLB)
diff --git a/fern/pages/backends/sglang/gpt-oss.md b/fern/pages/backends/sglang/gpt-oss.md
new file mode 100644
index 00000000000..441ac804ae8
--- /dev/null
+++ b/fern/pages/backends/sglang/gpt-oss.md
@@ -0,0 +1,47 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Running gpt-oss-120b Disaggregated with SGLang"
+---
+
+The gpt-oss-120b guide for SGLang is largely identical to the [guide for vLLM](../vllm/gpt-oss.md).
+Please use the vLLM guide as a reference, with the differing deployment steps highlighted below:
+
+# Launch the Deployment
+
+Note that GPT-OSS is a reasoning model with tool calling support. To
+ensure the response is being processed correctly, the worker should be
+launched with proper `--dyn-reasoning-parser` and `--dyn-tool-call-parser`.
+
+**Start frontend**
+```bash
+python3 -m dynamo.frontend --http-port 8000 &
+```
+
+**Run decode worker**
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.sglang \
+ --model-path openai/gpt-oss-120b \
+ --served-model-name openai/gpt-oss-120b \
+ --tp 4 \
+ --trust-remote-code \
+ --skip-tokenizer-init \
+ --disaggregation-mode decode \
+ --disaggregation-transfer-backend nixl \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony
+```
+
+**Run prefill workers**
+```bash
+CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.sglang \
+ --model-path openai/gpt-oss-120b \
+ --served-model-name openai/gpt-oss-120b \
+ --tp 4 \
+ --trust-remote-code \
+ --skip-tokenizer-init \
+ --disaggregation-mode prefill \
+ --disaggregation-transfer-backend nixl \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony
+```
diff --git a/fern/pages/backends/sglang/profiling.md b/fern/pages/backends/sglang/profiling.md
new file mode 100644
index 00000000000..515551136dd
--- /dev/null
+++ b/fern/pages/backends/sglang/profiling.md
@@ -0,0 +1,43 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Profiling SGLang Workers in Dynamo"
+---
+
+Dynamo exposes profiling endpoints for SGLang workers via the system server's `/engine/*` routes. This allows you to start and stop PyTorch profiling on running inference workers without restarting them.
+
+These endpoints wrap SGLang's internal `TokenizerManager.start_profile()` and `stop_profile()` methods. See SGLang's documentation for the full list of supported parameters.
+
+## Quick Start
+
+1. **Start profiling:**
+
+```bash
+curl -X POST http://localhost:9090/engine/start_profile \
+ -H "Content-Type: application/json" \
+ -d '{"output_dir": "/tmp/profiler_output"}'
+```
+
+2. **Run some inference requests to generate profiling data**
+
+3. **Stop profiling:**
+
+```bash
+curl -X POST http://localhost:9090/engine/stop_profile
+```
+
+4. **View the traces:**
+
+The profiler outputs Chrome trace files in the specified `output_dir`. You can view them using:
+- Chrome's `chrome://tracing`
+- [Perfetto UI](https://ui.perfetto.dev/)
+- TensorBoard with the PyTorch Profiler plugin
+
+## Test Script
+
+A test script is provided at [`examples/backends/sglang/test_sglang_profile.py`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/test_sglang_profile.py) that demonstrates the full profiling workflow:
+
+```bash
+python examples/backends/sglang/test_sglang_profile.py
+```
+
diff --git a/fern/pages/backends/sglang/prometheus.md b/fern/pages/backends/sglang/prometheus.md
new file mode 100644
index 00000000000..c29858ea2e7
--- /dev/null
+++ b/fern/pages/backends/sglang/prometheus.md
@@ -0,0 +1,122 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "SGLang Prometheus Metrics"
+---
+
+## Overview
+
+When running SGLang through Dynamo, SGLang engine metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both SGLang engine metrics (prefixed with `sglang:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
+
+**For the complete and authoritative list of all SGLang metrics**, always refer to the [official SGLang Production Metrics documentation](https://docs.sglang.ai/references/production_metrics.html).
+
+**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).
+
+**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).
+
+## Environment Variables
+
+| Variable | Description | Default | Example |
+|----------|-------------|---------|---------|
+| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` |
+
+## Getting Started Quickly
+
+This is a single machine example.
+
+### Start Observability Stack
+
+For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](../../observability/README.md#getting-started-quickly) for instructions.
+
+### Launch Dynamo Components
+
+Launch a frontend and SGLang backend to test metrics:
+
+```bash
+# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var)
+$ python -m dynamo.frontend
+
+# Enable system metrics server on port 8081
+$ DYN_SYSTEM_PORT=8081 python -m dynamo.sglang --model --enable-metrics
+```
+
+Wait for the SGLang worker to start, then send requests and check metrics:
+
+```bash
+# Send a request
+curl -H 'Content-Type: application/json' \
+-d '{
+ "model": "",
+ "max_completion_tokens": 100,
+ "messages": [{"role": "user", "content": "Hello"}]
+}' \
+http://localhost:8000/v1/chat/completions
+
+# Check metrics from the worker
+curl -s localhost:8081/metrics | grep "^sglang:"
+```
+
+## Exposed Metrics
+
+SGLang exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All SGLang engine metrics use the `sglang:` prefix and include labels (e.g., `model_name`, `engine_type`, `tp_rank`, `pp_rank`) to identify the source.
+
+**Example Prometheus Exposition Format text:**
+
+```
+# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
+# TYPE sglang:prompt_tokens_total counter
+sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8128902.0
+
+# HELP sglang:generation_tokens_total Number of generation tokens processed.
+# TYPE sglang:generation_tokens_total counter
+sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7557572.0
+
+# HELP sglang:cache_hit_rate The cache hit rate
+# TYPE sglang:cache_hit_rate gauge
+sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
+```
+
+**Note:** The specific metrics shown above are examples and may vary depending on your SGLang version. Always inspect your actual `/metrics` endpoint or refer to the [official documentation](https://docs.sglang.ai/references/production_metrics.html) for the current list.
+
+### Metric Categories
+
+SGLang provides metrics in the following categories (all prefixed with `sglang:`):
+
+- **Throughput metrics** - Token processing rates
+- **Resource usage** - System resource consumption
+- **Latency metrics** - Request and token latency measurements
+- **Disaggregation metrics** - Metrics specific to disaggregated deployments (when enabled)
+
+**Note:** Specific metrics are subject to change between SGLang versions. Always refer to the [official documentation](https://docs.sglang.ai/references/production_metrics.html) or inspect the `/metrics` endpoint for your SGLang version.
+
+## Available Metrics
+
+The official SGLang documentation includes complete metric definitions with:
+- HELP and TYPE descriptions
+- Counter, Gauge, and Histogram metric types
+- Metric labels (e.g., `model_name`, `engine_type`, `tp_rank`, `pp_rank`)
+- Setup guide for Prometheus + Grafana monitoring
+- Troubleshooting tips and configuration examples
+
+For the complete and authoritative list of all SGLang metrics, see the [official SGLang Production Metrics documentation](https://docs.sglang.ai/references/production_metrics.html).
+
+## Implementation Details
+
+- SGLang uses multiprocess metrics collection via `prometheus_client.multiprocess.MultiProcessCollector`
+- Metrics are filtered by the `sglang:` prefix before being exposed
+- The integration uses Dynamo's `register_engine_metrics_callback()` function
+- Metrics appear after SGLang engine initialization completes
+
+## Related Documentation
+
+### SGLang Metrics
+- [Official SGLang Production Metrics](https://docs.sglang.ai/references/production_metrics.html)
+- [SGLang GitHub - Metrics Collector](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/metrics/collector.py)
+
+### Dynamo Metrics
+- [Dynamo Metrics Guide](../../observability/metrics.md) - Complete documentation on Dynamo runtime metrics
+- [Prometheus and Grafana Setup](../../observability/prometheus-grafana.md) - Visualization setup instructions
+- Dynamo runtime metrics (prefixed with `dynamo_*`) are available at the same `/metrics` endpoint alongside SGLang metrics
+ - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics)
+ - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants)
+ - Integration code: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration
diff --git a/fern/pages/backends/sglang/sgl-hicache-example.md b/fern/pages/backends/sglang/sgl-hicache-example.md
new file mode 100644
index 00000000000..d4ef20a2fe3
--- /dev/null
+++ b/fern/pages/backends/sglang/sgl-hicache-example.md
@@ -0,0 +1,64 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Enable SGLang Hierarchical Cache (HiCache)"
+---
+
+This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dynamo.
+
+## 1) Start the SGLang worker with HiCache enabled
+
+```bash
+python -m dynamo.sglang \
+ --model-path Qwen/Qwen3-0.6B \
+ --host 0.0.0.0 --port 8000 \
+ --page-size 64 \
+ --enable-hierarchical-cache \
+ --hicache-ratio 2 \
+ --hicache-write-policy write_through \
+ --hicache-storage-backend nixl \
+ --log-level debug \
+ --skip-tokenizer-init
+```
+
+- **--enable-hierarchical-cache**: Enables hierarchical KV cache/offload
+- **--hicache-ratio**: The ratio of the size of host KV cache memory pool to the size of device pool. Lower this number if your machine has less CPU memory.
+- **--hicache-write-policy**: Write policy (e.g., `write_through` for synchronous host writes)
+- **--hicache-storage-backend**: Host storage backend for HiCache (e.g., `nixl`). NIXL selects the concrete store automatically; see [PR #8488](https://github.com/sgl-project/sglang/pull/8488)
+
+
+Then, start the frontend:
+```bash
+python -m dynamo.frontend --http-port 8000
+```
+
+## 2) Send a single request
+
+```bash
+curl localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "Qwen/Qwen3-0.6B",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
+ }
+ ],
+ "stream": false,
+ "max_tokens": 30
+ }'
+```
+
+## 3) (Optional) Benchmarking
+
+Run the perf script:
+```bash
+bash -x $DYNAMO_ROOT/benchmarks/llm/perf.sh \
+ --model Qwen/Qwen3-0.6B \
+ --tensor-parallelism 1 \
+ --data-parallelism 1 \
+ --concurrency "2,4,8" \
+ --input-sequence-length 2048 \
+ --output-sequence-length 256
+```
diff --git a/fern/pages/backends/sglang/sglang-disaggregation.md b/fern/pages/backends/sglang/sglang-disaggregation.md
new file mode 100644
index 00000000000..e0be2afd126
--- /dev/null
+++ b/fern/pages/backends/sglang/sglang-disaggregation.md
@@ -0,0 +1,88 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "SGLang Disaggregated Serving"
+---
+
+This document explains how SGLang's disaggregated prefill-decode architecture works, both standalone and within Dynamo.
+
+## Overview
+
+Disaggregated serving separates the prefill and decode phases of LLM inference into different workers. This architecture allows for:
+- Independent scaling of prefill and decode resources
+- Better resource utilization (prefill is compute-bound, decode is memory-bound)
+- Efficient KV cache transfer between workers using RDMA
+
+## How Dynamo Integrates with SGLang Disaggregation
+
+**SGLang's standalone approach:**
+1. The load balancer receives a request from the client
+2. A random `(prefill, decode)` pair is selected from the pool of available workers
+3. Request is sent to both `prefill` and `decode` workers via asyncio tasks
+4. Internally disaggregation is done from prefill → decode
+
+**Dynamo's approach:**
+
+Because Dynamo has a discovery mechanism, we do not use a load balancer. Instead:
+1. Route to a decode worker first
+2. Choose a prefill worker via round-robin or KV-aware selection
+3. Send the request to both workers
+4. SGLang's bootstrap server (part of the `tokenizer_manager`) is used in conjunction with NIXL/Mooncake to handle the KV transfer
+
+## Disaggregation Flow
+
+The following diagram shows the complete request flow for disaggregated serving:
+
+```mermaid
+sequenceDiagram
+ participant Client
+ participant Decode
+ participant Prefill
+
+ Note over Decode,Prefill: 0. Setup Phase (One-Time)
+ Decode->>Prefill: Register RDMA connection info (base GPU memory pointers)
+ Note over Client,Prefill: Per-Request Phase
+ Client->>Decode: 1. Send request
+ Decode->>Prefill: 2. Forward request + get bootstrap_room
+ Prefill-->>Decode: Return bootstrap_room ID
+ Note over Decode: 3. Allocate GPU memory for KV cache
+ Decode->>Prefill: Send allocation info (page indices, metadata buffer)
+ Note over Prefill: 4. Prefill forward pass
+ par Decode polls
+ loop Poll transfer
+ Note over Decode: 5. Poll for KV arrival
+ end
+ and Prefill transfers
+ Note over Prefill: 6. RDMA write KV to decode
+ Prefill->>Decode: Transfer KV cache + metadata
+ end
+ Note over Prefill: 7. Poll RDMA handles
+ Note over Prefill: Transfer complete, deallocate metadata
+ Note over Decode: 8. KV received, start decode
+ loop Generate tokens
+ Note over Decode: Decode forward pass
+ Decode-->>Client: Stream output token
+ end
+```
+
+### Key Steps Explained
+
+**Setup Phase (One-Time)**
+- Decode workers register their RDMA connection information with prefill workers
+- This includes base GPU memory pointers for direct memory access
+
+**Per-Request Flow**
+1. **Request initiation**: Client sends request to decode worker
+2. **Bootstrap room allocation**: Decode forwards to prefill and receives a bootstrap_room ID for coordination
+3. **Memory allocation**: Decode allocates GPU memory pages for incoming KV cache
+4. **Prefill execution**: Prefill worker processes the prompt and generates KV cache
+5. **KV transfer**: Prefill uses RDMA to write KV cache directly to decode's GPU memory (while decode polls for completion)
+6. **Cleanup**: Prefill deallocates transfer metadata after confirming completion
+7. **Decode phase**: Decode worker generates tokens using the transferred KV cache
+8. **Streaming**: Tokens are streamed back to the client as they're generated
+
+### Performance Characteristics
+
+- **RDMA transfer**: Zero-copy GPU-to-GPU transfer with minimal CPU involvement
+- **Parallel operations**: Decode can poll while prefill transfers data
+- **One-time setup**: RDMA connections established once, reused for all requests
\ No newline at end of file
diff --git a/fern/pages/backends/trtllm/README.md b/fern/pages/backends/trtllm/README.md
new file mode 100644
index 00000000000..06b3fdb910e
--- /dev/null
+++ b/fern/pages/backends/trtllm/README.md
@@ -0,0 +1,284 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "LLM Deployment using TensorRT-LLM"
+---
+
+This directory contains examples and reference implementations for deploying Large Language Models (LLMs) in various configurations using TensorRT-LLM.
+
+## Use the Latest Release
+
+We recommend using the latest stable release of dynamo to avoid breaking changes:
+
+[Latest Release](https://github.com/ai-dynamo/dynamo/releases/latest)
+
+You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with:
+
+```bash
+git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+```
+
+---
+
+## Table of Contents
+- [Feature Support Matrix](#feature-support-matrix)
+- [Quick Start](#tensorrt-llm-quick-start)
+- [Single Node Examples](#single-node-examples)
+- [Advanced Examples](#advanced-examples)
+- [KV Cache Transfer](#kv-cache-transfer-in-disaggregated-serving)
+- [Client](#client)
+- [Benchmarking](#benchmarking)
+- [Multimodal Support](#multimodal-support)
+- [Logits Processing](#logits-processing)
+- [Performance Sweep](#performance-sweep)
+
+## Feature Support Matrix
+
+### Core Dynamo Features
+
+| Feature | TensorRT-LLM | Notes |
+|---------|--------------|-------|
+| [**Disaggregated Serving**](../../design-docs/disagg-serving.md) | ✅ | |
+| [**Conditional Disaggregation**](../../design-docs/disagg-serving.md#conditional-disaggregation) | 🚧 | Not supported yet |
+| [**KV-Aware Routing**](../../router/kv-cache-routing.md) | ✅ | |
+| [**SLA-Based Planner**](../../planner/sla-planner.md) | ✅ | |
+| [**Load Based Planner**](../../planner/load-planner.md) | 🚧 | Planned |
+| [**KVBM**](../../kvbm/kvbm-architecture.md) | ✅ | |
+
+### Large Scale P/D and WideEP Features
+
+| Feature | TensorRT-LLM | Notes |
+|--------------------|--------------|-----------------------------------------------------------------|
+| **WideEP** | ✅ | |
+| **DP Rank Routing**| ✅ | |
+| **GB200 Support** | ✅ | |
+
+## TensorRT-LLM Quick Start
+
+Below we provide a guide that lets you run all of the common deployment patterns on a single node.
+
+### Start NATS and ETCD in the background
+
+Start using [Docker Compose](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml)
+
+```bash
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+### Build container
+
+```bash
+# TensorRT-LLM uses git-lfs, which needs to be installed in advance.
+apt-get update && apt-get -y install git git-lfs
+
+# On an x86 machine:
+./container/build.sh --framework trtllm
+
+# On an ARM machine:
+./container/build.sh --framework trtllm --platform linux/arm64
+
+# Build the container with the default experimental TensorRT-LLM commit
+# WARNING: This is for experimental feature testing only.
+# The container should not be used in a production environment.
+./container/build.sh --framework trtllm --tensorrtllm-git-url https://github.com/NVIDIA/TensorRT-LLM.git --tensorrtllm-commit main
+```
+
+### Run container
+
+```bash
+./container/run.sh --framework trtllm -it
+```
+
+## Single Node Examples
+
+
+Below we provide some simple shell scripts that run the components for each configuration. Each shell script is simply running the `python3 -m dynamo.frontend ` to start up the ingress and using `python3 -m dynamo.trtllm ` to start up the workers. You can easily take each command and run them in separate terminals.
+
+
+For detailed information about the architecture and how KV-aware routing works, see the [KV Cache Routing documentation](../../router/kv-cache-routing.md).
+
+### Aggregated
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+./launch/agg.sh
+```
+
+### Aggregated with KV Routing
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+./launch/agg_router.sh
+```
+
+### Disaggregated
+
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+./launch/disagg.sh
+```
+
+### Disaggregated with KV Routing
+
+
+In the disaggregated workflow, requests are routed to the prefill worker to maximize KV cache reuse.
+
+
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+./launch/disagg_router.sh
+```
+
+### Aggregated with Multi-Token Prediction (MTP) and DeepSeek R1
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+
+export AGG_ENGINE_ARGS=./engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml
+export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
+# nvidia/DeepSeek-R1-FP4 is a large model
+export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
+./launch/agg.sh
+```
+
+Notes:
+- There is a noticeable latency for the first two inference requests. Please send warm-up requests before starting the benchmark.
+- MTP performance may vary depending on the acceptance rate of predicted tokens, which is dependent on the dataset or queries used while benchmarking. Additionally, `ignore_eos` should generally be omitted or set to `false` when using MTP to avoid speculating garbage outputs and getting unrealistic acceptance rates.
+
+## Advanced Examples
+
+Below we provide a selected list of advanced examples. Please open up an issue if you'd like to see a specific example!
+
+### Multinode Deployment
+
+For comprehensive instructions on multinode serving, see the [multinode-examples.md](multinode/multinode-examples.md) guide. It provides step-by-step deployment examples and configuration tips for running Dynamo with TensorRT-LLM across multiple nodes. While the walkthrough uses DeepSeek-R1 as the model, you can easily adapt the process for any supported model by updating the relevant configuration files. You can see [Llama4+eagle](llama4-plus-eagle.md) guide to learn how to use these scripts when a single worker fits on the single node.
+
+### Speculative Decoding
+- **[Llama 4 Maverick Instruct + Eagle Speculative Decoding](llama4-plus-eagle.md)**
+
+### Kubernetes Deployment
+
+For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [TensorRT-LLM Kubernetes Deployment Guide](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/deploy/README.md).
+
+### Client
+
+See [client](../sglang/README.md#testing-the-deployment) section to learn how to send request to the deployment.
+
+NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `.
+
+### Benchmarking
+
+To benchmark your deployment with AIPerf, see this utility script, configuring the
+`model` name and `host` based on your deployment: [perf.sh](https://github.com/ai-dynamo/dynamo/tree/main/benchmarks/llm/perf.sh)
+
+## KV Cache Transfer in Disaggregated Serving
+
+Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disaggregated serving: UCX (default) and NIXL (experimental). For detailed information and configuration instructions for each method, see the [KV cache transfer guide](kv-cache-transfer.md).
+
+
+## Request Migration
+
+You can enable [request migration](../../fault-tolerance/request-migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:
+
+```bash
+# For decode and aggregated workers
+python3 -m dynamo.trtllm ... --migration-limit=3
+```
+
+
+**Prefill workers do not support request migration** and must use `--migration-limit=0` (the default). Prefill workers only process prompts and return KV cache state - they don't maintain long-running generation requests that would benefit from migration.
+
+
+See the [Request Migration Architecture](../../fault-tolerance/request-migration.md) documentation for details on how this works.
+
+## Request Cancellation
+
+When a user cancels a request (e.g., by disconnecting from the frontend), the request is automatically cancelled across all workers, freeing compute resources for other requests.
+
+### Cancellation Support Matrix
+
+| | Prefill | Decode |
+|-|---------|--------|
+| **Aggregated** | ✅ | ✅ |
+| **Disaggregated** | ✅ | ✅ |
+
+For more details, see the [Request Cancellation Architecture](../../fault-tolerance/request-cancellation.md) documentation.
+
+## Client
+
+See [client](../sglang/README.md#testing-the-deployment) section to learn how to send request to the deployment.
+
+NOTE: To send a request to a multi-node deployment, target the node which is running `python3 -m dynamo.frontend `.
+
+## Benchmarking
+
+To benchmark your deployment with AIPerf, see this utility script, configuring the
+`model` name and `host` based on your deployment: [perf.sh](https://github.com/ai-dynamo/dynamo/tree/main/benchmarks/llm/perf.sh)
+
+## Multimodal support
+
+Dynamo with the TensorRT-LLM backend supports multimodal models, enabling you to process both text and images (or pre-computed embeddings) in a single request. For detailed setup instructions, example requests, and best practices, see the [TensorRT-LLM Multimodal Guide](../../multimodal/trtllm.md).
+
+## Logits Processing
+
+Logits processors let you modify the next-token logits at every decoding step (e.g., to apply custom constraints or sampling transforms). Dynamo provides a backend-agnostic interface and an adapter for TensorRT-LLM so you can plug in custom processors.
+
+### How it works
+- **Interface**: Implement `dynamo.logits_processing.BaseLogitsProcessor` which defines `__call__(input_ids, logits)` and modifies `logits` in-place.
+- **TRT-LLM adapter**: Use `dynamo.trtllm.logits_processing.adapter.create_trtllm_adapters(...)` to convert Dynamo processors into TRT-LLM-compatible processors and assign them to `SamplingParams.logits_processor`.
+- **Examples**: See example processors in `lib/bindings/python/src/dynamo/logits_processing/examples/` ([temperature](https://github.com/ai-dynamo/dynamo/tree/main/lib/bindings/python/src/dynamo/logits_processing/examples/temperature.py), [hello_world](https://github.com/ai-dynamo/dynamo/tree/main/lib/bindings/python/src/dynamo/logits_processing/examples/hello_world.py)).
+
+### Quick test: HelloWorld processor
+You can enable a test-only processor that forces the model to respond with "Hello world!". This is useful to verify the wiring without modifying your model or engine code.
+
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+export DYNAMO_ENABLE_TEST_LOGITS_PROCESSOR=1
+./launch/agg.sh
+```
+
+Notes:
+- When enabled, Dynamo initializes the tokenizer so the HelloWorld processor can map text to token IDs.
+- Expected chat response contains "Hello world".
+
+### Bring your own processor
+Implement a processor by conforming to `BaseLogitsProcessor` and modify logits in-place. For example, temperature scaling:
+
+```python
+from typing import Sequence
+import torch
+from dynamo.logits_processing import BaseLogitsProcessor
+
+class TemperatureProcessor(BaseLogitsProcessor):
+ def __init__(self, temperature: float = 1.0):
+ if temperature <= 0:
+ raise ValueError("Temperature must be positive")
+ self.temperature = temperature
+
+ def __call__(self, input_ids: Sequence[int], logits: torch.Tensor):
+ if self.temperature == 1.0:
+ return
+ logits.div_(self.temperature)
+```
+
+Wire it into TRT-LLM by adapting and attaching to `SamplingParams`:
+
+```python
+from dynamo.trtllm.logits_processing.adapter import create_trtllm_adapters
+from dynamo.logits_processing.examples import TemperatureProcessor
+
+processors = [TemperatureProcessor(temperature=0.7)]
+sampling_params.logits_processor = create_trtllm_adapters(processors)
+```
+
+### Current limitations
+- Per-request processing only (batch size must be 1); beam width > 1 is not supported.
+- Processors must modify logits in-place and not return a new tensor.
+- If your processor needs tokenization, ensure the tokenizer is initialized (do not skip tokenizer init).
+
+## Performance Sweep
+
+For detailed instructions on running comprehensive performance sweeps across both aggregated and disaggregated serving configurations, see the [TensorRT-LLM Benchmark Scripts for DeepSeek R1 model](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/trtllm/performance_sweeps/README.md). This guide covers recommended benchmarking setups, usage of provided scripts, and best practices for evaluating system performance.
+
+## Dynamo KV Block Manager Integration
+
+Dynamo with TensorRT-LLM currently supports integration with the Dynamo KV Block Manager. This integration can significantly reduce time-to-first-token (TTFT) latency, particularly in usage patterns such as multi-turn conversations and repeated long-context requests.
+
+For setup instructions, see [Running KVBM in TensorRT-LLM](../../kvbm/trtllm-setup.md).
diff --git a/fern/pages/backends/trtllm/gemma3-sliding-window-attention.md b/fern/pages/backends/trtllm/gemma3-sliding-window-attention.md
new file mode 100644
index 00000000000..5adde1b9b46
--- /dev/null
+++ b/fern/pages/backends/trtllm/gemma3-sliding-window-attention.md
@@ -0,0 +1,52 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Gemma 3 with Variable Sliding Window Attention"
+---
+
+This guide demonstrates how to deploy google/gemma-3-1b-it with Variable Sliding Window Attention (VSWA) using Dynamo. Since google/gemma-3-1b-it is a small model, each aggregated, decode, or prefill worker only requires one H100 GPU or one GB200 GPU.
+VSWA is a mechanism in which a model’s layers alternate between multiple sliding window sizes. An example of this is Gemma 3, which incorporates both global attention layers and sliding window layers.
+
+
+- Ensure that required services such as `nats` and `etcd` are running before starting.
+- Request access to `google/gemma-3-1b-it` on Hugging Face and set your `HF_TOKEN` environment variable for authentication.
+- It's recommended to continue using the VSWA feature with the Dynamo 0.5.0 release and the TensorRT-LLM dynamo runtime image nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.0. The 0.5.1 release bundles TensorRT-LLM v1.1.0rc5, which has a regression that breaks VSWA.
+
+
+## Aggregated Serving
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+export MODEL_PATH=google/gemma-3-1b-it
+export SERVED_MODEL_NAME=$MODEL_PATH
+export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
+./launch/agg.sh
+```
+
+## Aggregated Serving with KV Routing
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+export MODEL_PATH=google/gemma-3-1b-it
+export SERVED_MODEL_NAME=$MODEL_PATH
+export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
+./launch/agg_router.sh
+```
+
+## Disaggregated Serving
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+export MODEL_PATH=google/gemma-3-1b-it
+export SERVED_MODEL_NAME=$MODEL_PATH
+export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
+export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
+./launch/disagg.sh
+```
+
+## Disaggregated Serving with KV Routing
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+export MODEL_PATH=google/gemma-3-1b-it
+export SERVED_MODEL_NAME=$MODEL_PATH
+export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
+export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
+./launch/disagg_router.sh
+```
diff --git a/fern/pages/backends/trtllm/gpt-oss.md b/fern/pages/backends/trtllm/gpt-oss.md
new file mode 100644
index 00000000000..4b5d4c7ebb2
--- /dev/null
+++ b/fern/pages/backends/trtllm/gpt-oss.md
@@ -0,0 +1,515 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Running gpt-oss-120b Disaggregated with TensorRT-LLM"
+---
+
+Dynamo supports disaggregated serving of gpt-oss-120b with TensorRT-LLM. This guide demonstrates how to deploy gpt-oss-120b using disaggregated prefill/decode serving on a single B200 node with 8 GPUs, running 1 prefill worker on 4 GPUs and 1 decode worker on 4 GPUs.
+
+## Overview
+
+This deployment uses disaggregated serving in TensorRT-LLM where:
+- **Prefill Worker**: Processes input prompts efficiently using 4 GPUs with tensor parallelism
+- **Decode Worker**: Generates output tokens using 4 GPUs, optimized for token generation throughput
+- **Frontend**: Provides OpenAI-compatible API endpoint with round-robin routing
+
+The disaggregated approach optimizes for both low-latency (maximizing tokens per second per user) and high-throughput (maximizing total tokens per GPU per second) use cases by separating the compute-intensive prefill phase from the memory-bound decode phase.
+
+## Prerequisites
+
+- 1x NVIDIA B200 node with 8 GPUs (this guide focuses on single-node B200 deployment)
+- CUDA Toolkit 12.8 or later
+- Docker with [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed
+- Fast SSD storage for model weights (~240GB required)
+- HuggingFace account and [access token](https://huggingface.co/settings/tokens)
+- [HuggingFace CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli)
+
+
+Ensure that the `etcd` and `nats` services are running with the following command:
+
+```bash
+docker compose -f deploy/docker-compose.yml up
+```
+
+## Instructions
+
+### 1. Download the Model
+
+```bash
+export MODEL_PATH=
+export HF_TOKEN=
+
+pip install -U "huggingface_hub[cli]"
+
+huggingface-cli download openai/gpt-oss-120b --exclude "original/*" --exclude "metal/*" --local-dir $MODEL_PATH
+```
+
+### 2. Run the Container
+
+Set the container image:
+```bash
+export DYNAMO_CONTAINER_IMAGE=nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
+```
+
+Launch the Dynamo TensorRT-LLM container with the necessary configurations:
+
+```bash
+docker run \
+ --gpus all \
+ -it \
+ --rm \
+ --network host \
+ --volume $MODEL_PATH:/model \
+ --volume $PWD:/workspace \
+ --shm-size=10G \
+ --ulimit memlock=-1 \
+ --ulimit stack=67108864 \
+ --ulimit nofile=65536:65536 \
+ --cap-add CAP_SYS_PTRACE \
+ --ipc host \
+ -e HF_TOKEN=$HF_TOKEN \
+ -e TRTLLM_ENABLE_PDL=1 \
+ -e TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True \
+ $DYNAMO_CONTAINER_IMAGE
+```
+
+This command:
+- Automatically removes the container when stopped (`--rm`)
+- Allows container to interact with host's IPC resources for optimal performance (`--ipc=host`)
+- Runs the container in interactive mode (`-it`)
+- Sets up shared memory and stack limits for optimal performance
+- Mounts your model directory into the container at `/model`
+- Mounts the current working directory into the container at `/workspace`
+- Enables [PDL](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization) and disables parallel weight loading
+- Sets HuggingFace token as environment variable in the container
+
+### 3. Understanding the Configuration
+
+The deployment uses configuration files and command-line arguments to control behavior:
+
+#### Configuration Files
+
+**Prefill Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml`)**:
+- `enable_attention_dp: false` - Attention data parallelism disabled for prefill
+- `enable_chunked_prefill: true` - Enables efficient chunked prefill processing
+- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
+- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer
+- `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs
+
+**Decode Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml`)**:
+- `enable_attention_dp: true` - Attention data parallelism enabled for decode
+- `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency
+- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
+- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer
+- `cuda_graph_config.max_batch_size: 128` - Maximum batch size for CUDA graphs
+
+#### Command-Line Arguments
+
+Both workers receive these key arguments:
+- `--tensor-parallel-size 4` - Uses 4 GPUs for tensor parallelism
+- `--expert-parallel-size 4` - Expert parallelism across 4 GPUs
+- `--free-gpu-memory-fraction 0.9` - Allocates 90% of GPU memory
+
+Prefill-specific arguments:
+- `--max-num-tokens 20000` - Maximum tokens for prefill processing
+- `--max-batch-size 32` - Maximum batch size for prefill
+
+Decode-specific arguments:
+- `--max-num-tokens 16384` - Maximum tokens for decode processing
+- `--max-batch-size 128` - Maximum batch size for decode
+
+### 4. Launch the Deployment
+
+Note that GPT-OSS is a reasoning model with tool calling support. To ensure the response is processed correctly, launch the worker with the appropriate `--dyn-reasoning-parser` and `--dyn-tool-call-parser` flags.
+
+You can use the provided launch script or run the components manually:
+
+#### Option A: Using the Launch Script
+
+```bash
+cd /workspace/examples/backends/trtllm
+./launch/gpt_oss_disagg.sh
+```
+
+#### Option B: Manual Launch
+
+1. **Start frontend**:
+```bash
+# Start frontend with round-robin routing
+python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
+```
+
+2. **Launch prefill worker**:
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
+ --model-path /model \
+ --served-model-name openai/gpt-oss-120b \
+ --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony \
+ --disaggregation-mode prefill \
+ --max-num-tokens 20000 \
+ --max-batch-size 32 \
+ --free-gpu-memory-fraction 0.9 \
+ --tensor-parallel-size 4 \
+ --expert-parallel-size 4 &
+```
+
+3. **Launch decode worker**:
+```bash
+CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
+ --model-path /model \
+ --served-model-name openai/gpt-oss-120b \
+ --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony \
+ --disaggregation-mode decode \
+ --max-num-tokens 16384 \
+ --free-gpu-memory-fraction 0.9 \
+ --tensor-parallel-size 4 \
+ --expert-parallel-size 4
+```
+
+### 6. Verify the Deployment is Ready
+
+Poll the `/health` endpoint to verify that both the prefill and decode worker endpoints have started:
+```
+curl http://localhost:8000/health
+```
+
+Make sure that both of the endpoints are available before sending an inference request:
+```
+{
+ "endpoints": [
+ "dyn://dynamo.tensorrt_llm.generate",
+ "dyn://dynamo.prefill.generate"
+ ],
+ "status": "healthy"
+}
+```
+
+If only one worker endpoint is listed, the other may still be starting up. Monitor the worker logs to track startup progress.
+
+### 7. Test the Deployment
+
+Send a test request to verify the deployment:
+
+```bash
+curl -X POST http://localhost:8000/v1/responses \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "openai/gpt-oss-120b",
+ "input": "Explain the concept of disaggregated serving in LLM inference in 3 sentences.",
+ "max_output_tokens": 200,
+ "stream": false
+ }'
+```
+
+The server exposes a standard OpenAI-compatible API endpoint that accepts JSON requests. You can adjust parameters like `max_output_tokens`, `temperature`, and others according to your needs.
+
+### 8. Reasoning and Tool Calling
+
+Dynamo supports reasoning and tool calling in the OpenAI Chat Completion endpoint. A typical workflow for an application built on top of Dynamo
+is that the application has a set of tools to help the assistant provide accurate answers, and it is usually
+multi-turn, as it involves tool selection and generation based on the tool result.
+
+In addition, the reasoning effort can be configured through `chat_template_args`. Increasing the reasoning effort makes the model more accurate but also slower. It supports three levels: `low`, `medium`, and `high`.
+
+Below is an example of sending multi-round requests to complete a user query with reasoning and tool calling:
+**Application setup (pseudocode)**
+```Python
+# The tool defined by the application
+def get_system_health():
+ for component in system.components:
+ if not component.health():
+ return False
+ return True
+
+# The JSON representation of the declaration in ChatCompletion tool style
+tool_choice = '{
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
+ "parameters": {
+ "type": "object",
+ "properties": {}
+ }
+ }
+}'
+
+# On user query, perform below workflow.
+def user_query(app_request):
+ # first round
+ # create chat completion with prompt and tool choice
+ request = ...
+ response = send(request)
+
+ if response["finish_reason"] == "tool_calls":
+ # second round
+ function, params = parse_tool_call(response)
+ function_result = function(params)
+ # create request with prompt, assistant response, and function result
+ request = ...
+ response = send(request)
+ return app_response(response)
+```
+
+
+**First request with tools**
+
+
+```bash
+curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '
+{
+ "model": "openai/gpt-oss-120b",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Hey, quick check: is everything up and running?"
+ }
+ ],
+ "chat_template_args": {
+ "reasoning_effort": "low"
+ },
+ "tools": [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
+ "parameters": {
+ "type": "object",
+ "properties": {}
+ }
+ }
+ }
+ ],
+ "response_format": {
+ "type": "text"
+ },
+ "stream": false,
+ "max_tokens": 300
+}'
+```
+**First response with tool choice**
+```JSON
+{
+ "id": "chatcmpl-d1c12219-6298-4c83-a6e3-4e7cef16e1a9",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "tool_calls": [
+ {
+ "id": "call-1",
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "arguments": "{}"
+ }
+ }
+ ],
+ "role": "assistant",
+ "reasoning_content": "We need to check system health. Use function."
+ },
+ "finish_reason": "tool_calls"
+ }
+ ],
+ "created": 1758758741,
+ "model": "openai/gpt-oss-120b",
+ "object": "chat.completion",
+ "usage": null
+}
+```
+**Second request with tool calling result**
+```bash
+curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '
+{
+ "model": "openai/gpt-oss-120b",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Hey, quick check: is everything up and running?"
+ },
+ {
+ "role": "assistant",
+ "tool_calls": [
+ {
+ "id": "call-1",
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "arguments": "{}"
+ }
+ }
+ ]
+ },
+ {
+ "role": "tool",
+ "tool_call_id": "call-1",
+ "content": "{\"status\":\"ok\",\"uptime_seconds\":372045}"
+ }
+ ],
+ "chat_template_args": {
+ "reasoning_effort": "low"
+ },
+ "tools": [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
+ "parameters": {
+ "type": "object",
+ "properties": {}
+ }
+ }
+ }
+ ],
+ "response_format": {
+ "type": "text"
+ },
+ "stream": false,
+ "max_tokens": 300
+}'
+```
+**Second response with final message**
+```JSON
+{
+ "id": "chatcmpl-9ebfe64a-68b9-4c1d-9742-644cf770ad0e",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "All systems are green—everything’s up and running smoothly! 🚀 Let me know if you need anything else.",
+ "role": "assistant",
+ "reasoning_content": "The user asks: \"Hey, quick check: is everything up and running?\" We have just checked system health, it's ok. Provide friendly response confirming everything's up."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "created": 1758758853,
+ "model": "openai/gpt-oss-120b",
+ "object": "chat.completion",
+ "usage": null
+}
+```
+## Benchmarking
+
+### Performance Testing with AIPerf
+
+The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
+
+**Run the following benchmark from inside the container** (after completing the deployment steps above):
+
+```bash
+# Create a directory for benchmark results
+mkdir -p /tmp/benchmark-results
+
+# Run the benchmark - this command tests the deployment with high-concurrency synthetic workload
+aiperf profile \
+ --model openai/gpt-oss-120b \
+ --tokenizer /model \
+ --endpoint-type chat \
+ --endpoint /v1/chat/completions \
+ --streaming \
+ --url localhost:8000 \
+ --synthetic-input-tokens-mean 32000 \
+ --synthetic-input-tokens-stddev 0 \
+ --output-tokens-mean 256 \
+ --output-tokens-stddev 0 \
+ --extra-inputs max_tokens:256 \
+ --extra-inputs min_tokens:256 \
+ --extra-inputs ignore_eos:true \
+ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
+ --concurrency 256 \
+ --request-count 6144 \
+ --warmup-request-count 1000 \
+ --num-dataset-entries 8000 \
+ --random-seed 100 \
+ --artifact-dir /tmp/benchmark-results \
+ -H 'Authorization: Bearer NOT USED' \
+ -H 'Accept: text/event-stream'
+```
+
+### What This Benchmark Does
+
+This command:
+- **Tests chat completions** with streaming responses against the disaggregated deployment
+- **Simulates high load** with 256 concurrent requests and 6144 total requests
+- **Uses long context inputs** (32K tokens) to test prefill performance
+- **Generates consistent outputs** (256 tokens) to measure decode throughput
+- **Includes warmup period** (1000 requests) to stabilize performance metrics
+- **Saves detailed results** to `/tmp/benchmark-results` for analysis
+
+Key parameters you can adjust:
+- `--concurrency`: Number of simultaneous requests (impacts GPU utilization)
+- `--synthetic-input-tokens-mean`: Average input length (tests prefill capacity)
+- `--output-tokens-mean`: Average output length (tests decode throughput)
+- `--request-count`: Total number of requests for the benchmark
+
+### Installing AIPerf Outside the Container
+
+If you prefer to run benchmarks from outside the container:
+
+```bash
+# Install AIPerf
+pip install aiperf
+
+# Then run the same benchmark command, adjusting the tokenizer path if needed
+```
+
+## Architecture Overview
+
+The disaggregated architecture separates prefill and decode phases:
+
+```mermaid
+flowchart TD
+ Client["Users/Clients (HTTP)"] --> Frontend["Frontend Round-Robin Router"]
+ Frontend --> Prefill["Prefill Worker (GPUs 0-3)"]
+ Frontend --> Decode["Decode Worker (GPUs 4-7)"]
+
+ Prefill -.->|KV Cache Transfer via UCX| Decode
+```
+
+## Key Features
+
+1. **Disaggregated Serving**: Separates compute-intensive prefill from memory-bound decode operations
+2. **Optimized Resource Usage**: Different parallelism strategies for prefill vs decode
+3. **Scalable Architecture**: Easy to adjust worker counts based on workload
+4. **TensorRT-LLM Optimizations**: Leverages TensorRT-LLM's efficient kernels and memory management
+
+## Troubleshooting
+
+### Common Issues
+
+1. **CUDA Out-of-Memory Errors**
+ - Reduce `--max-num-tokens` in the launch commands (currently 20000 for prefill, 16384 for decode)
+ - Lower `--free-gpu-memory-fraction` from 0.9 to 0.8 or 0.7
+ - Ensure model checkpoints are compatible with the expected format
+
+2. **Workers Not Connecting**
+ - Ensure etcd and NATS services are running: `docker ps | grep -E "(etcd|nats)"`
+ - Check network connectivity between containers
+ - Verify CUDA_VISIBLE_DEVICES settings match your GPU configuration
+ - Check that no other processes are using the assigned GPUs
+
+3. **Performance Issues**
+ - Monitor GPU utilization with `nvidia-smi` while the deployment is running
+ - Check worker logs for bottlenecks or errors
+ - Ensure that batch sizes in manual commands match those in configuration files
+ - Adjust chunked prefill settings based on your workload
+ - For connection issues, ensure port 8000 is not being used by another application
+
+4. **Container Startup Issues**
+ - Verify that the NVIDIA Container Toolkit is properly installed
+ - Check Docker daemon is running with GPU support
+ - Ensure sufficient disk space for model weights and container images
+
+## Next Steps
+
+- **Production Deployment**: For multi-node deployments, see the [Multi-node Guide](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/README.md)
+- **Advanced Configuration**: Explore TensorRT-LLM engine building options for further optimization
+- **Monitoring**: Set up Prometheus and Grafana for production monitoring
+- **Performance Benchmarking**: Use AIPerf to measure and optimize your deployment performance
diff --git a/fern/pages/backends/trtllm/kv-cache-transfer.md b/fern/pages/backends/trtllm/kv-cache-transfer.md
new file mode 100644
index 00000000000..40fddc6f478
--- /dev/null
+++ b/fern/pages/backends/trtllm/kv-cache-transfer.md
@@ -0,0 +1,23 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "KV Cache Transfer in Disaggregated Serving"
+---
+
+In disaggregated serving architectures, KV cache must be transferred between prefill and decode workers. TensorRT-LLM supports two methods for this transfer:
+
+## Default Method: NIXL
+By default, TensorRT-LLM uses **NIXL** (NVIDIA Inference Xfer Library) with UCX (Unified Communication X) as backend for KV cache transfer between prefill and decode workers. [NIXL](https://github.com/ai-dynamo/nixl) is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments.
+
+### Specify Backends for NIXL
+
+TODO: Add instructions for how to specify different backends for NIXL.
+
+## Alternative Method: UCX
+
+TensorRT-LLM can also leverage **UCX** (Unified Communication X) directly for KV cache transfer between prefill and decode workers. There are two ways to enable UCX as the KV cache transfer backend:
+
+1. **Recommended:** Set `cache_transceiver_config.backend: UCX` in your engine configuration YAML file.
+2. Alternatively, set the environment variable `TRTLLM_USE_UCX_KV_CACHE=1` and configure `cache_transceiver_config.backend: DEFAULT` in the engine configuration YAML.
+
+This flexibility allows users to choose the most suitable method for their deployment and compatibility requirements.
diff --git a/fern/pages/backends/trtllm/llama4-plus-eagle.md b/fern/pages/backends/trtllm/llama4-plus-eagle.md
new file mode 100644
index 00000000000..e2ae9ec7efa
--- /dev/null
+++ b/fern/pages/backends/trtllm/llama4-plus-eagle.md
@@ -0,0 +1,72 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Llama 4 Maverick Instruct with Eagle Speculative Decoding on SLURM"
+---
+
+This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Speculative Decoding on GB200x4 nodes. We will be following the [multi-node deployment instructions](multinode/multinode-examples.md) to set up the environment for the following scenarios:
+
+- **Aggregated Serving:**
+ Deploy the entire Llama 4 model on a single GB200x4 node for end-to-end serving.
+
+- **Disaggregated Serving:**
+ Distribute the workload across two GB200x4 nodes:
+ - One node runs the decode worker.
+ - The other node runs the prefill worker.
+
+## Notes
+* Make sure `eagle3_one_model: true` is set in the LLM API config inside the `examples/backends/trtllm/engine_configs/llama4/eagle` folder.
+
+## Setup
+
+Assuming you have already allocated your nodes via `salloc`, and are
+inside an interactive shell on one of the allocated nodes, set the
+following environment variables based on your environment:
+
+```bash
+cd $DYNAMO_HOME/examples/backends/trtllm
+
+export IMAGE=""
+# export MOUNTS="${PWD}/:/mnt,/lustre:/lustre"
+export MOUNTS="${PWD}/:/mnt"
+export MODEL_PATH="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
+export SERVED_MODEL_NAME="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8"
+```
+
+See [this](multinode/multinode-examples.md#setup) section from multinode guide to learn more about the above options.
+
+
+## Aggregated Serving
+```bash
+export NUM_NODES=1
+export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml"
+./multinode/srun_aggregated.sh
+```
+
+## Disaggregated Serving
+
+```bash
+export NUM_PREFILL_NODES=1
+export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yml"
+export NUM_DECODE_NODES=1
+export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yml"
+./multinode/srun_disaggregated.sh
+```
+
+## Example Request
+
+See [here](multinode/multinode-examples.md#example-request) to learn how to send a request to the deployment.
+
+```
+curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+ "model": "nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
+ "messages": [{"role": "user", "content": "Why is NVIDIA a great company?"}],
+ "max_tokens": 1024
+ }' -w "\n"
+
+
+# output:
+{"id":"cmpl-3e87ea5c-010e-4dd2-bcc4-3298ebd845a8","choices":[{"text":"NVIDIA is considered a great company for several reasons:\n\n1. **Technological Innovation**: NVIDIA is a leader in the field of graphics processing units (GPUs) and has been at the forefront of technological innovation.
+...
+and the broader tech industry.\n\nThese factors combined have contributed to NVIDIA's status as a great company in the technology sector.","index":0,"logprobs":null,"finish_reason":"stop"}],"created":1753329671,"model":"nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8","system_fingerprint":null,"object":"text_completion","usage":{"prompt_tokens":16,"completion_tokens":562,"total_tokens":578,"prompt_tokens_details":null,"completion_tokens_details":null}}
+```
diff --git a/fern/pages/backends/trtllm/multinode/multinode-examples.md b/fern/pages/backends/trtllm/multinode/multinode-examples.md
new file mode 100644
index 00000000000..99e0919b379
--- /dev/null
+++ b/fern/pages/backends/trtllm/multinode/multinode-examples.md
@@ -0,0 +1,280 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Example: Multi-node TRTLLM Workers with Dynamo on Slurm"
+---
+
+> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/).
+
+To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16),
+the set of nodes need to be launched together in the same MPI world, such as
+via `mpirun` or `srun`. This is true regardless of whether the worker is
+aggregated, prefill-only, or decode-only.
+
+In this document we will demonstrate two examples launching multinode workers
+on a slurm cluster with `srun`:
+1. Deploying an aggregated nvidia/DeepSeek-R1 model as a multi-node TP16/EP16
+ worker across 4 GB200 nodes
+2. Deploying a disaggregated nvidia/DeepSeek-R1 model with a multi-node
+ TP16/EP16 prefill worker (4 nodes) and a multi-node TP16/EP16 decode
+ worker (4 nodes) across a total of 8 GB200 nodes.
+
+NOTE: Some of the scripts used in this example like `start_frontend_services.sh` and
+`start_trtllm_worker.sh` should be translatable to other environments like Kubernetes, or
+using `mpirun` directly, with relative ease.
+
+## Setup
+
+For simplicity of the example, we will make some assumptions about your slurm cluster:
+1. First, we assume you have access to a slurm cluster with multiple GPU nodes
+ available. For functional testing, most setups should be fine. For performance
+ testing, you should aim to allocate groups of nodes that are performantly
+ inter-connected, such as those in an NVL72 setup.
+2. Second, we assume this slurm cluster has the [Pyxis](https://github.com/NVIDIA/pyxis)
+ SPANK plugin setup. In particular, the `srun_aggregated.sh` script in this
+ example will use `srun` arguments like `--container-image`,
+ `--container-mounts`, and `--container-env` that are added to `srun` by Pyxis.
+ If your cluster supports similar container based plugins, you may be able to
+ modify the script to use that instead.
+3. Third, we assume you have already built a recent Dynamo+TRTLLM container image as
+ described [here](https://github.com/ai-dynamo/dynamo/tree/main/docs/backends/trtllm/README.md#build-container).
+ This is the image that can be set to the `IMAGE` environment variable in later steps.
+4. Fourth, we assume you pre-allocate a group of nodes using `salloc`. We
+ will allocate 8 nodes below as a reference command to have enough capacity
+ to run both examples. If you plan to only run the aggregated example, you
+ will only need 4 nodes. If you customize the configurations to require a
+ different number of nodes, you can adjust the number of allocated nodes
+ accordingly. Pre-allocating nodes is technically not a requirement,
+ but it makes iterations of testing/experimenting easier.
+
+ Make sure to set your `PARTITION` and `ACCOUNT` according to your slurm cluster setup:
+ ```bash
+ # Set partition manually based on your slurm cluster's partition names
+ PARTITION=""
+ # Set account manually if this command doesn't work on your cluster
+ ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)"
+ salloc \
+ --partition="${PARTITION}" \
+ --account="${ACCOUNT}" \
+ --job-name="${ACCOUNT}-dynamo.trtllm" \
+ -t 05:00:00 \
+ --nodes 8
+ ```
+5. Lastly, we will assume you are inside an interactive shell on one of your allocated
+ nodes, which may be the default behavior after executing the `salloc` command above
+ depending on the cluster setup. If not, then you should SSH into one of the allocated nodes.
+
+### Environment Variable Setup
+
+This example aims to automate as much of the environment setup as possible,
+but all slurm clusters and environments are different, and you may need to
+dive into the scripts to make modifications based on your specific environment.
+
+Assuming you have already allocated your nodes via `salloc`, and are
+inside an interactive shell on one of the allocated nodes, set the
+following environment variables based on your environment:
+```bash
+# NOTE: IMAGE must be set manually for now
+# To build an image, see the steps here:
+# https://github.com/ai-dynamo/dynamo/tree/main/docs/backends/trtllm/README.md#build-container
+export IMAGE=""
+
+# MOUNTS are the host:container path pairs that are mounted into the containers
+# launched by each `srun` command.
+#
+# If you want to reference files, such as $MODEL_PATH below, in a
+# different location, you can customize MOUNTS or specify additional
+# comma-separated mount pairs here.
+#
+# NOTE: Currently, this example assumes that the local bash scripts and configs
+# referenced are mounted into into /mnt inside the container. If you want to
+# customize the location of the scripts, make sure to modify `srun_aggregated.sh`
+# accordingly for the new locations of `start_frontend_services.sh` and
+# `start_trtllm_worker.sh`.
+#
+# For example, assuming your cluster had a `/lustre` directory on the host, you
+# could add that as a mount like so:
+#
+# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre"
+export MOUNTS="${PWD}/../../../../:/mnt"
+
+# NOTE: In general, Deepseek R1 is very large, so it is recommended to
+# pre-download the model weights and save them in some shared location,
+# NFS storage, HF_HOME, etc. and modify the `--model-path` below
+# to reuse the pre-downloaded weights instead.
+#
+# On Blackwell systems (ex: GB200), it is recommended to use the FP4 weights:
+# https://huggingface.co/nvidia/DeepSeek-R1-FP4
+#
+# On Hopper systems, FP4 isn't supported so you'll need to use the default weights:
+# https://huggingface.co/deepseek-ai/DeepSeek-R1
+export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
+
+# The name the model will be served/queried under, matching what's
+# returned by the /v1/models endpoint.
+#
+# By default this is inferred from MODEL_PATH, but when using locally downloaded
+# model weights, it can be nice to have explicit control over the name.
+export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
+```
+
+## Aggregated WideEP
+
+Assuming you have at least 4 nodes allocated following the setup steps above,
+follow these steps below to launch an **aggregated** deployment across 4 nodes:
+
+```bash
+# Default set in srun_aggregated.sh, but can customize here.
+# export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml"
+
+# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
+# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
+# total GPUs necessary to satisfy the requested parallelism. For example,
+# 4 nodes x 4 gpus/node = 16 gpus total for TP16/EP16.
+# export NUM_NODES=4
+
+# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this.
+# export NUM_GPUS_PER_NODE=4
+
+# Launches:
+# - frontend + etcd/nats on current (head) node
+# - one large aggregated trtllm worker across multiple nodes via MPI tasks
+./srun_aggregated.sh
+```
+
+## Disaggregated WideEP
+
+Assuming you have at least 8 nodes allocated (4 for prefill, 4 for decode)
+following the setup above, follow these steps below to launch a **disaggregated**
+deployment across 8 nodes:
+
+
+Make sure you have a fresh environment and don't still have the aggregated
+example above still deployed on the same set of nodes.
+
+
+```bash
+# Defaults set in srun_disaggregated.sh, but can customize here.
+# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml"
+
+# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
+# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
+# The products of NUM_PREFILL_NODES*NUM_GPUS_PER_NODE and
+# NUM_DECODE_NODES*NUM_GPUS_PER_NODE should match the respective number of
+# GPUs necessary to satisfy the requested parallelism in each config.
+# export NUM_PREFILL_NODES=4
+# export NUM_DECODE_NODES=4
+
+# GB200 nodes have 4 gpus per node, but for other types of nodes you can configure this.
+# export NUM_GPUS_PER_NODE=4
+
+# Launches:
+# - frontend + etcd/nats on current (head) node.
+# - one large prefill trtllm worker across multiple nodes via MPI tasks
+# - one large decode trtllm worker across multiple nodes via MPI tasks
+./srun_disaggregated.sh
+```
+
+
+To launch multiple replicas of the configured prefill/decode workers, you can set
+NUM_PREFILL_WORKERS and NUM_DECODE_WORKERS respectively (default: 1).
+
+
+## Understanding the Output
+
+1. The `srun_aggregated.sh` launches two `srun` jobs. The first launches
+ etcd, NATS, and the OpenAI frontend on the head node only
+ called "node1" in the example output below. The second launches
+   a single TP16 Dynamo+TRTLLM worker spread across 4 nodes,
+   using 4 GPUs on each node.
+ ```
+ # Frontend/etcd/nats services
+ srun: launching StepId=453374.17 on host node1, 1 tasks: 0
+ ...
+ # TP16 TRTLLM worker split across 4 nodes with 4 gpus each
+ srun: launching StepId=453374.18 on host node1, 4 tasks: [0-3]
+ srun: launching StepId=453374.18 on host node2, 4 tasks: [4-7]
+ srun: launching StepId=453374.18 on host node3, 4 tasks: [8-11]
+ srun: launching StepId=453374.18 on host node4, 4 tasks: [12-15]
+ ```
+2. The OpenAI frontend will listen for and dynamically discover workers as
+ they register themselves with Dynamo's distributed runtime:
+ ```
+ 0: 2025-06-13T02:36:48.160Z INFO dynamo_run::input::http: Watching for remote model at models
+ 0: 2025-06-13T02:36:48.161Z INFO dynamo_llm::http::service::service_v2: Starting HTTP service on: 0.0.0.0:8000 address="0.0.0.0:8000"
+ ```
+3. The TRTLLM worker will consist of N (N=16 for TP16) MPI ranks, 1 rank on each
+ GPU on each node, which will each output their progress while loading the model.
+ You can see each rank's output prefixed with the rank at the start of each log line
+   until the model successfully finishes loading:
+ ```
+ 8: rank8 run mgmn worker node with mpi_world_size: 16 ...
+ 10: rank10 run mgmn worker node with mpi_world_size: 16 ...
+ 9: rank9 run mgmn worker node with mpi_world_size: 16 ...
+ 11: rank11 run mgmn worker node with mpi_world_size: 16 ...
+ ...
+ 15: Model init total -- 55.42s
+ 11: Model init total -- 55.91s
+ 12: Model init total -- 55.24s
+ ```
+4. After the model fully finishes loading on all ranks, the worker will register itself,
+ and the OpenAI frontend will detect it, signaled by this output:
+ ```
+ 0: 2025-06-13T02:46:35.040Z INFO dynamo_llm::discovery::watcher: added model model_name="nvidia/DeepSeek-R1-FP4"
+ ```
+5. At this point, with the worker fully initialized and detected by the frontend,
+ it is now ready for inference.
+6. For `srun_disaggregated.sh`, it follows a very similar flow, but instead launches
+ three srun jobs instead of two. One for frontend, one for prefill worker,
+ and one for decode worker.
+
+## Example Request
+
+To verify the deployed model is working, send a `curl` request:
+```bash
+# NOTE: $HOST assumes running on head node, but can be changed to $HEAD_NODE_IP instead.
+HOST=localhost
+PORT=8000
+# "model" here should match the model name returned by the /v1/models endpoint
+curl -w "%{http_code}" ${HOST}:${PORT}/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "'${SERVED_MODEL_NAME}'",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Tell me a story as if we were playing dungeons and dragons."
+ }
+ ],
+ "stream": true,
+ "max_tokens": 30
+}'
+```
+
+## Cleanup
+
+To cleanup background `srun` processes launched by `srun_aggregated.sh` or
+`srun_disaggregated.sh`, you can run:
+```bash
+pkill srun
+```
+
+## Known Issues
+
+- This example has only been tested on a 4xGB200 node setup with 16 GPUs using
+ FP4 weights. In theory, the example should work on alternative setups such as
+ H100 nodes with FP8 weights, but this hasn't been tested yet.
+- WideEP configs in this directory are still being tested. A WideEP specific
+ example with documentation will be added once ready.
+- There are known issues where WideEP workers may not cleanly shut down:
+ - This may lead to leftover shared memory files in `/dev/shm/moe_*`. For
+ now, you must manually clean these up before deploying again on the
+ same set of nodes.
+ - Similarly, there may be GPU memory left in-use after killing the `srun`
+ jobs. After cleaning up any leftover shared memory files as described
+ above, the GPU memory may slowly come back. You can run `watch nvidia-smi`
+ to check on this behavior. If you don't free the GPU memory before the
+ next deployment, you may get a CUDA OOM error while loading the model.
+ - There is mention of this issue in the relevant TRT-LLM blog
+ [here](https://github.com/NVIDIA/TensorRT-LLM/blob/6021a439ab9c29f4c46f721eeb59f6b992c425ea/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#miscellaneous).
diff --git a/fern/pages/backends/trtllm/prometheus.md b/fern/pages/backends/trtllm/prometheus.md
new file mode 100644
index 00000000000..a55f920cc6a
--- /dev/null
+++ b/fern/pages/backends/trtllm/prometheus.md
@@ -0,0 +1,192 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "TensorRT-LLM Prometheus Metrics"
+---
+
+## Overview
+
+When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm_`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
+
+Additional performance metrics are available via non-Prometheus APIs (see [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) below).
+
+As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm_` prefix is added by Dynamo.
+
+**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).
+
+**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).
+
+## Environment Variables
+
+| Variable | Description | Default | Example |
+|----------|-------------|---------|---------|
+| `DYN_SYSTEM_PORT` | System metrics/health port | `-1` (disabled) | `8081` |
+
+## Getting Started Quickly
+
+This is a single machine example.
+
+### Start Observability Stack
+
+For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](../../observability/README.md#getting-started-quickly) for instructions.
+
+### Launch Dynamo Components
+
+Launch a frontend and TensorRT-LLM backend to test metrics:
+
+```bash
+# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var)
+$ python -m dynamo.frontend
+
+# Enable system metrics server on port 8081 and enable metrics collection
+$ DYN_SYSTEM_PORT=8081 python -m dynamo.trtllm --model --publish-events-and-metrics
+```
+
+**Note:** The `backend` must be set to `"pytorch"` for metrics collection (enforced in `components/src/dynamo/trtllm/main.py`). TensorRT-LLM's `MetricsCollector` integration has only been tested/validated with the PyTorch backend.
+
+Wait for the TensorRT-LLM worker to start, then send requests and check metrics:
+
+```bash
+# Send a request
+curl -H 'Content-Type: application/json' \
+-d '{
+ "model": "",
+ "max_completion_tokens": 100,
+ "messages": [{"role": "user", "content": "Hello"}]
+}' \
+http://localhost:8000/v1/chat/completions
+
+# Check metrics from the worker
+curl -s localhost:8081/metrics | grep "^trtllm_"
+```
+
+## Exposed Metrics
+
+TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm_` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source.
+
+**Note:** TensorRT-LLM uses `model_name` instead of Dynamo's standard `model` label convention.
+
+**Example Prometheus Exposition Format text:**
+
+```
+# HELP trtllm_request_success_total Count of successfully processed requests.
+# TYPE trtllm_request_success_total counter
+trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
+trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
+
+# HELP trtllm_time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE trtllm_time_to_first_token_seconds histogram
+trtllm_time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
+trtllm_time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
+trtllm_time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
+
+# HELP trtllm_e2e_request_latency_seconds Histogram of end to end request latency in seconds.
+# TYPE trtllm_e2e_request_latency_seconds histogram
+trtllm_e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
+trtllm_e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
+
+# HELP trtllm_time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE trtllm_time_per_output_token_seconds histogram
+trtllm_time_per_output_token_seconds_bucket{le="0.1",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 120.0
+trtllm_time_per_output_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_time_per_output_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.5
+
+# HELP trtllm_request_queue_time_seconds Histogram of time spent in WAITING phase for request.
+# TYPE trtllm_request_queue_time_seconds histogram
+trtllm_request_queue_time_seconds_bucket{le="1.0",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 140.0
+trtllm_request_queue_time_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_request_queue_time_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 32.1
+```
+
+**Note:** The specific metrics shown above are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list.
+
+### Metric Categories
+
+TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm_`):
+
+- **Request metrics** - Request success tracking and latency measurements
+- **Performance metrics** - Time to first token (TTFT), time per output token (TPOT), and queue time
+
+**Note:** Metrics may change between TensorRT-LLM versions. Always inspect the `/metrics` endpoint for your version.
+
+## Available Metrics
+
+The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm_` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5:
+
+- `trtllm_request_success_total` (Counter) — Count of successfully processed requests by finish reason
+ - Labels: `model_name`, `engine_type`, `finished_reason`
+- `trtllm_e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
+ - Labels: `model_name`, `engine_type`
+- `trtllm_time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
+ - Labels: `model_name`, `engine_type`
+- `trtllm_time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
+ - Labels: `model_name`, `engine_type`
+- `trtllm_request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
+ - Labels: `model_name`, `engine_type`
+
+These metric names and availability are subject to change with TensorRT-LLM version updates.
+
+TensorRT-LLM provides Prometheus metrics through the `MetricsCollector` class (see [tensorrt_llm/metrics/collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)).
+
+## Non-Prometheus Performance Metrics
+
+TensorRT-LLM provides extensive performance data beyond the basic Prometheus metrics. These are not currently exposed to Prometheus.
+
+### Available via Code References
+
+- **RequestPerfMetrics Structure**: [tensorrt_llm/executor/result.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/executor/result.py) - KV cache, timing, speculative decoding metrics
+- **Engine Statistics**: `engine.llm.get_stats_async()` - System-wide aggregate statistics
+- **KV Cache Events**: `engine.llm.get_kv_cache_events_async()` - Real-time cache operations
+
+### Example RequestPerfMetrics JSON Structure
+
+```json
+{
+ "timing_metrics": {
+ "arrival_time": 1234567890.123,
+ "first_scheduled_time": 1234567890.135,
+ "first_token_time": 1234567890.150,
+ "last_token_time": 1234567890.300,
+ "kv_cache_size": 2048576,
+ "kv_cache_transfer_start": 1234567890.140,
+ "kv_cache_transfer_end": 1234567890.145
+ },
+ "kv_cache_metrics": {
+ "num_total_allocated_blocks": 100,
+ "num_new_allocated_blocks": 10,
+ "num_reused_blocks": 90,
+ "num_missed_blocks": 5
+ },
+ "speculative_decoding": {
+ "acceptance_rate": 0.85,
+ "total_accepted_draft_tokens": 42,
+ "total_draft_tokens": 50
+ }
+}
+```
+
+**Note:** These structures are valid as of the date of this documentation but are subject to change with TensorRT-LLM version updates.
+
+## Implementation Details
+
+- **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py))
+- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm_"`
+- **Engine Configuration**: `return_perf_metrics` set to `True` when `--publish-events-and-metrics` is enabled
+- **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes
+- **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type)
+
+## Related Documentation
+
+### TensorRT-LLM Metrics
+- See the [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) section above for detailed performance data and source code references
+- [TensorRT-LLM Metrics Collector](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py) - Source code reference
+
+### Dynamo Metrics
+- [Dynamo Metrics Guide](../../observability/metrics.md) - Complete documentation on Dynamo runtime metrics
+- [Prometheus and Grafana Setup](../../observability/prometheus-grafana.md) - Visualization setup instructions
+- Dynamo runtime metrics (prefixed with `dynamo_*`) are available at the same `/metrics` endpoint alongside TensorRT-LLM metrics
+ - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics)
+ - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants)
+ - Integration code: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration
diff --git a/fern/pages/backends/vllm/LMCache-Integration.md b/fern/pages/backends/vllm/LMCache-Integration.md
new file mode 100644
index 00000000000..fa202e201a1
--- /dev/null
+++ b/fern/pages/backends/vllm/LMCache-Integration.md
@@ -0,0 +1,211 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "LMCache Integration in Dynamo"
+---
+
+## Introduction
+
+LMCache is a high-performance KV cache layer that supercharges LLM serving by enabling **prefill-once, reuse-everywhere** semantics. As described in the [official documentation](https://docs.lmcache.ai/index.html), LMCache lets LLMs prefill each text only once by storing the KV caches of all reusable texts, allowing reuse of KV caches for any reused text (not necessarily prefix) across any serving engine instance.
+
+This document describes how LMCache is integrated into Dynamo's vLLM backend to provide enhanced performance and memory efficiency.
+
+### Key Benefits
+- **Reduced Time to First Token (TTFT)**: Eliminates redundant prefill computations
+- **Memory Offloading**: Intelligent KV cache placement across CPU/GPU/storage tiers
+- **Improved Throughput**: Reduced GPU memory pressure enables higher batch sizes
+
+## Platform Support
+
+**Important Note**: LMCache integration currently only supports x86 architecture. ARM64 is not supported at this time.
+
+## Aggregated Serving
+
+
+### Configuration
+
+LMCache is enabled using the `--connector lmcache` flag:
+
+```bash
+python -m dynamo.vllm --model --connector lmcache
+```
+
+### Customization
+
+LMCache configuration can be customized via environment variables listed [here](https://docs.lmcache.ai/api_reference/configurations.html).
+
+For advanced configurations, LMCache supports multiple [storage backends](https://docs.lmcache.ai/index.html):
+- **CPU RAM**: Fast local memory offloading
+- **Local Storage**: Disk-based persistence
+- **Redis**: Distributed cache sharing
+- **GDS Backend**: GPU Direct Storage for high throughput
+- **InfiniStore/Mooncake**: Cloud-native storage solutions
+
+### Deployment
+
+Use the provided launch script for quick setup:
+
+```bash
+./examples/backends/vllm/launch/agg_lmcache.sh
+```
+
+This will:
+1. Start the dynamo frontend
+2. Launch a single vLLM worker with LMCache enabled
+
+### Architecture for Aggregated Mode
+
+In aggregated mode, the system uses:
+- **KV Connector**: `LMCacheConnectorV1`
+- **KV Role**: `kv_both` (handles both reading and writing)
+
+## Disaggregated Serving
+
+Disaggregated serving separates prefill and decode operations into dedicated workers. This provides better resource utilization and scalability for production deployments.
+
+### Deployment
+
+Use the provided disaggregated launch script (the script requires at least 2 GPUs):
+
+```bash
+./examples/backends/vllm/launch/disagg_lmcache.sh
+```
+
+This will:
+1. Start the dynamo frontend
+2. Launch a decode worker on GPU 0
+3. Wait for initialization
+4. Launch a prefill worker on GPU 1 with LMCache enabled
+
+### Worker Roles
+
+#### Decode Worker
+- **Purpose**: Handles token generation (decode phase)
+- **GPU Assignment**: CUDA_VISIBLE_DEVICES=0
+- **LMCache Config**: Uses `NixlConnector` only for kv transfer between prefill and decode workers
+
+#### Prefill Worker
+- **Purpose**: Handles prompt processing (prefill phase)
+- **GPU Assignment**: CUDA_VISIBLE_DEVICES=1
+- **LMCache Config**: Uses `MultiConnector` with both LMCache and NIXL connectors. This enables prefill worker to use LMCache for kv offloading and use NIXL for kv transfer between prefill and decode workers.
+- **Flag**: `--is-prefill-worker`
+
+## Architecture
+
+### KV Transfer Configuration
+
+The system automatically configures KV transfer based on the deployment mode and worker type:
+
+#### Prefill Worker (Disaggregated Mode)
+```python
+kv_transfer_config = KVTransferConfig(
+ kv_connector="PdConnector",
+ kv_role="kv_both",
+ kv_connector_extra_config={
+ "connectors": [
+ {"kv_connector": "LMCacheConnectorV1", "kv_role": "kv_both"},
+ {"kv_connector": "NixlConnector", "kv_role": "kv_both"}
+ ]
+ }
+)
+```
+
+#### Decode Worker or Aggregated Mode
+```python
+kv_transfer_config = KVTransferConfig(
+ kv_connector="LMCacheConnectorV1",
+ kv_role="kv_both"
+)
+```
+
+#### Fallback (No LMCache)
+```python
+kv_transfer_config = KVTransferConfig(
+ kv_connector="NixlConnector",
+ kv_role="kv_both"
+)
+```
+
+### Integration Points
+
+1. **Argument Parsing** (`args.py`):
+ - Configures appropriate KV transfer settings
+ - Sets up connector configurations based on worker type
+
+2. **Engine Setup** (`main.py`):
+ - Initializes LMCache environment variables
+ - Creates vLLM engine with proper KV transfer config
+ - Handles both aggregated and disaggregated modes
+
+
+### Best Practices
+
+1. **Chunk Size Tuning**: Adjust `LMCACHE_CHUNK_SIZE` based on your use case:
+ - Smaller chunks (128-256): Better reuse granularity for varied content
+ - Larger chunks (512-1024): More efficient for repetitive content patterns
+
+2. **Memory Allocation**: Set `LMCACHE_MAX_LOCAL_CPU_SIZE` conservatively:
+ - Leave sufficient RAM for other system processes
+ - Monitor memory usage during peak loads
+
+3. **Workload Optimization**: LMCache performs best with:
+ - Repeated prompt patterns (RAG, multi-turn conversations)
+ - Shared context across sessions
+ - Long-running services with warm caches
+
+## Metrics and Monitoring
+
+When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
+
+**Requirements to access LMCache metrics:**
+- `--connector lmcache` - Enables LMCache
+- `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint
+- `PROMETHEUS_MULTIPROC_DIR` (optional) - If not set, Dynamo manages it internally. Only set explicitly if you need control over the metrics directory.
+
+For detailed information on LMCache metrics, including the complete list of available metrics and how to access them, see the **[LMCache Metrics section](prometheus.md#lmcache-metrics)** in the vLLM Prometheus Metrics Guide.
+
+### Troubleshooting
+
+#### LMCache log: `PrometheusLogger instance already created with different metadata`
+
+You may see an error like:
+
+```text
+LMCache ERROR: PrometheusLogger instance already created with different metadata. This should not happen except in test
+```
+
+**Version note**: We reproduced this behavior with **vLLM v0.12.0**. We have not reproduced it with **vLLM v0.11.0**, so it may be specific to (or introduced in) v0.12.0.
+
+This is emitted by LMCache when the LMCache connector is initialized more than once in the same process (for example, once for a `WORKER` role and later for a `SCHEDULER` role). LMCache uses a process-global singleton for its Prometheus logger, so the second initialization can log this warning if its metadata differs.
+
+- **Impact**: This is a log-only error; in our testing it does not prevent vLLM/Dynamo from serving requests. If you care about LMCache metric labels, be aware the logger singleton uses the first-seen metadata.
+- **Repro without Dynamo** (vLLM v0.12.0):
+
+```bash
+vllm serve Qwen/Qwen3-0.6B \
+ --host 127.0.0.1 --port 18000 \
+ --gpu-memory-utilization 0.24 \
+ --enforce-eager \
+ --no-enable-prefix-caching \
+ --max-num-seqs 2 \
+ --kv-offloading-backend lmcache \
+ --kv-offloading-size 1 \
+ --disable-hybrid-kv-cache-manager
+```
+
+- **Mitigation (silence)**: set `LMCACHE_LOG_LEVEL=CRITICAL`.
+- **Upstream issue**: [vLLM issue #30996](https://github.com/vllm-project/vllm/issues/30996).
+
+#### vLLM log: `Found PROMETHEUS_MULTIPROC_DIR was set by user`
+
+vLLM v1 uses `prometheus_client.multiprocess` and stores intermediate metric values in `PROMETHEUS_MULTIPROC_DIR`.
+
+- If you **set `PROMETHEUS_MULTIPROC_DIR` yourself**, vLLM warns that the directory must be wiped between runs to avoid stale/incorrect metrics.
+- When running via Dynamo, the vLLM wrapper may set `PROMETHEUS_MULTIPROC_DIR` internally to a temporary directory to avoid vLLM cleanup issues. If you still see the warning, confirm you are not exporting `PROMETHEUS_MULTIPROC_DIR` in your shell or container environment.
+
+## References and Additional Resources
+
+- [LMCache Documentation](https://docs.lmcache.ai/index.html) - Comprehensive guide and API reference
+- [Configuration Reference](https://docs.lmcache.ai/api_reference/configurations.html) - Detailed configuration options
+- [LMCache Observability Guide](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Metrics and monitoring details
+
diff --git a/fern/pages/backends/vllm/README.md b/fern/pages/backends/vllm/README.md
new file mode 100644
index 00000000000..d713b424583
--- /dev/null
+++ b/fern/pages/backends/vllm/README.md
@@ -0,0 +1,199 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "LLM Deployment using vLLM"
+---
+
+This directory contains reference implementations for deploying Large Language Models (LLMs) in various configurations using vLLM. For Dynamo integration, we leverage vLLM's native KV cache events, NIXL based transfer mechanisms, and metric reporting to enable KV-aware routing and P/D disaggregation.
+
+## Use the Latest Release
+
+We recommend using the latest stable release of Dynamo to avoid breaking changes:
+
+[](https://github.com/ai-dynamo/dynamo/releases/latest)
+
+You can find the latest release [here](https://github.com/ai-dynamo/dynamo/releases/latest) and check out the corresponding branch with:
+
+```bash
+git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+```
+
+---
+
+## Table of Contents
+- [Feature Support Matrix](#feature-support-matrix)
+- [Quick Start](#vllm-quick-start)
+- [Single Node Examples](#run-single-node-examples)
+- [Advanced Examples](#advanced-examples)
+- [Deploy on Kubernetes](#kubernetes-deployment)
+- [Configuration](#configuration)
+
+## Feature Support Matrix
+
+### Core Dynamo Features
+
+| Feature | vLLM | Notes |
+|---------|------|-------|
+| [**Disaggregated Serving**](../../design-docs/disagg-serving.md) | ✅ | |
+| [**Conditional Disaggregation**](../../design-docs/disagg-serving.md#conditional-disaggregation) | 🚧 | WIP |
+| [**KV-Aware Routing**](../../router/kv-cache-routing.md) | ✅ | |
+| [**SLA-Based Planner**](../../planner/sla-planner.md) | ✅ | |
+| [**Load Based Planner**](../../planner/load-planner.md) | 🚧 | WIP |
+| [**KVBM**](../../kvbm/kvbm-architecture.md) | ✅ | |
+| [**LMCache**](LMCache-Integration.md) | ✅ | |
+| [**Prompt Embeddings**](prompt-embeddings.md) | ✅ | Requires `--enable-prompt-embeds` flag |
+
+### Large Scale P/D and WideEP Features
+
+| Feature | vLLM | Notes |
+|--------------------|------|-----------------------------------------------------------------------|
+| **WideEP** | ✅ | Support for PPLX / DeepEP not verified |
+| **DP Rank Routing**| ✅ | Supported via external control of DP ranks |
+| **GB200 Support** | 🚧 | Container functional on main |
+
+## vLLM Quick Start
+
+Below we provide a guide that lets you run all of the common deployment patterns on a single node.
+
+### Start NATS and ETCD in the background
+
+Start using [Docker Compose](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-compose.yml)
+
+```bash
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+### Pull or build container
+
+We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts). If you'd like to build your own container from source:
+
+```bash
+./container/build.sh --framework VLLM
+```
+
+### Run container
+
+```bash
+./container/run.sh -it --framework VLLM [--mount-workspace]
+```
+
+This includes the specific commit [vllm-project/vllm#19790](https://github.com/vllm-project/vllm/pull/19790) which enables support for external control of the DP ranks.
+
+## Run Single Node Examples
+
+
+Below we provide simple shell scripts that run the components for each configuration. Each shell script runs `python3 -m dynamo.frontend` to start the ingress and uses `python3 -m dynamo.vllm` to start the vLLM workers. You can also run each command in separate terminals for better log visibility.
+
+
+### Aggregated Serving
+
+```bash
+# requires one gpu
+cd examples/backends/vllm
+bash launch/agg.sh
+```
+
+### Aggregated Serving with KV Routing
+
+```bash
+# requires two gpus
+cd examples/backends/vllm
+bash launch/agg_router.sh
+```
+
+### Disaggregated Serving
+
+```bash
+# requires two gpus
+cd examples/backends/vllm
+bash launch/disagg.sh
+```
+
+### Disaggregated Serving with KV Routing
+
+```bash
+# requires three gpus
+cd examples/backends/vllm
+bash launch/disagg_router.sh
+```
+
+### Single Node Data Parallel Attention / Expert Parallelism
+
+This example is not meant to be performant but showcases Dynamo routing to data parallel workers
+
+```bash
+# requires four gpus
+cd examples/backends/vllm
+bash launch/dep.sh
+```
+
+
+Run a disaggregated example and try adding another prefill worker once the setup is running! The system will automatically discover and utilize the new worker.
+
+
+## Advanced Examples
+
+Below we provide a selected list of advanced deployments. Please open up an issue if you'd like to see a specific example!
+
+### Speculative Decoding with Aggregated Serving (Meta-Llama-3.1-8B-Instruct + Eagle3)
+
+Run **Meta-Llama-3.1-8B-Instruct** with **Eagle3** as a draft model using **aggregated speculative decoding** on a single node.
+This setup demonstrates how to use Dynamo to create an instance using Eagle-based speculative decoding under the **VLLM aggregated serving framework** for faster inference while maintaining accuracy.
+
+**Guide:** [Speculative Decoding Quickstart](speculative-decoding.md)
+
+### Kubernetes Deployment
+
+For complete Kubernetes deployment instructions, configurations, and troubleshooting, see [vLLM Kubernetes Deployment Guide](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm/deploy/README.md)
+
+## Configuration
+
+vLLM workers are configured through command-line arguments. Key parameters include:
+
+- `--model`: Model to serve (e.g., `Qwen/Qwen3-0.6B`)
+- `--is-prefill-worker`: Enable prefill-only mode for disaggregated serving
+- `--metrics-endpoint-port`: Port for publishing KV metrics to Dynamo
+- `--connector`: Specify which kv_transfer_config you want vLLM to use `[nixl, lmcache, kvbm, none]`. This is a helper flag which overwrites the engine's KVTransferConfig.
+- `--enable-prompt-embeds`: **Enable prompt embeddings feature** (opt-in, default: disabled)
+ - **Required for:** Accepting pre-computed prompt embeddings via API
+ - **Default behavior:** Prompt embeddings DISABLED - requests with `prompt_embeds` will fail
+ - **Error without flag:** `ValueError: You must set --enable-prompt-embeds to input prompt_embeds`
+
+See `args.py` for the full list of configuration options and their defaults.
+
+The [documentation](https://docs.vllm.ai/en/v0.9.2/configuration/serve_args.html?h=serve+arg) for the vLLM CLI args points to running 'vllm serve --help' to see what CLI args can be added. We use the same argument parser as vLLM.
+
+### Hashing Consistency for KV Events
+
+When using KV-aware routing, ensure deterministic hashing across processes to avoid radix tree mismatches. Choose one of the following:
+
+- Set `PYTHONHASHSEED=0` for all vLLM processes when relying on Python's builtin hashing for prefix caching.
+- If your vLLM version supports it, configure a deterministic prefix caching algorithm, for example:
+
+```bash
+vllm serve ... --enable-prefix-caching --prefix-caching-algo sha256
+```
+See the high-level notes in [KV Cache Routing](../../router/kv-cache-routing.md) on deterministic event IDs.
+
+## Request Migration
+
+You can enable [request migration](../../fault-tolerance/request-migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:
+
+```bash
+python3 -m dynamo.vllm ... --migration-limit=3
+```
+
+This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../fault-tolerance/request-migration.md) documentation for details on how this works.
+
+## Request Cancellation
+
+When a user cancels a request (e.g., by disconnecting from the frontend), the request is automatically cancelled across all workers, freeing compute resources for other requests.
+
+### Cancellation Support Matrix
+
+| | Prefill | Decode |
+|-|---------|--------|
+| **Aggregated** | ✅ | ✅ |
+| **Disaggregated** | ✅ | ✅ |
+
+For more details, see the [Request Cancellation Architecture](../../fault-tolerance/request-cancellation.md) documentation.
diff --git a/fern/pages/backends/vllm/deepseek-r1.md b/fern/pages/backends/vllm/deepseek-r1.md
new file mode 100644
index 00000000000..d502b2b161b
--- /dev/null
+++ b/fern/pages/backends/vllm/deepseek-r1.md
@@ -0,0 +1,41 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Running Deepseek R1 with Wide EP"
+---
+
+Dynamo supports running Deepseek R1 with data parallel attention and wide expert parallelism. Each data parallel attention rank is a separate dynamo component that will emit its own KV Events and Metrics. vLLM controls the expert parallelism using the flag `--enable-expert-parallel`.
+
+## Instructions
+
+The following script can be adapted to run Deepseek R1 with a variety of different configurations. The current configuration uses 2 nodes, 16 GPUs, and a dp of 16. Follow the [vLLM Backend](README.md) Getting Started section on each node, and then run these two commands.
+
+node 0
+```bash
+./launch/dsr1_dep.sh --num-nodes 2 --node-rank 0 --gpus-per-node 8 --master-addr <head-node-ip>
+```
+
+node 1
+```bash
+./launch/dsr1_dep.sh --num-nodes 2 --node-rank 1 --gpus-per-node 8 --master-addr <head-node-ip>
+```
+
+### Testing the Deployment
+
+On node 0 (where the frontend was started) send a test request to verify your deployment:
+
+```bash
+curl localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "deepseek-ai/DeepSeek-R1",
+ "messages": [
+ {
+ "role": "user",
+ "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
+ }
+ ],
+ "stream": false,
+ "max_tokens": 30
+ }'
+```
diff --git a/fern/pages/backends/vllm/gpt-oss.md b/fern/pages/backends/vllm/gpt-oss.md
new file mode 100644
index 00000000000..8cc89f98993
--- /dev/null
+++ b/fern/pages/backends/vllm/gpt-oss.md
@@ -0,0 +1,286 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Running gpt-oss-120b Disaggregated with vLLM"
+---
+
+Dynamo supports disaggregated serving of gpt-oss-120b with vLLM. This guide demonstrates how to deploy gpt-oss-120b using disaggregated prefill/decode serving on a single H100 node with 8 GPUs, running 1 prefill worker on 4 GPUs and 1 decode worker on 4 GPUs.
+
+## Overview
+
+This deployment uses disaggregated serving in vLLM where:
+- **Prefill Worker**: Processes input prompts efficiently using 4 GPUs with tensor parallelism
+- **Decode Worker**: Generates output tokens using 4 GPUs, optimized for token generation throughput
+- **Frontend**: Provides OpenAI-compatible API endpoint with round-robin routing
+
+## Prerequisites
+
+This guide assumes readers already know how to deploy Dynamo disaggregated serving with vLLM as illustrated in [README.md](README.md).
+
+## Instructions
+
+### 1. Launch the Deployment
+
+Note that GPT-OSS is a reasoning model with tool calling support. To
+ensure the response is being processed correctly, the worker should be
+launched with proper `--dyn-reasoning-parser` and `--dyn-tool-call-parser`.
+
+**Start frontend**
+```bash
+python3 -m dynamo.frontend --http-port 8000 &
+```
+
+**Run decode worker**
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m dynamo.vllm \
+ --model openai/gpt-oss-120b \
+ --tensor-parallel-size 4 \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony
+```
+
+**Run prefill workers**
+```bash
+CUDA_VISIBLE_DEVICES=4,5,6,7 python -m dynamo.vllm \
+ --model openai/gpt-oss-120b \
+ --tensor-parallel-size 4 \
+ --is-prefill-worker \
+ --dyn-reasoning-parser gpt_oss \
+ --dyn-tool-call-parser harmony
+```
+
+### 2. Verify the Deployment is Ready
+
+Poll the `/health` endpoint to verify that both the prefill and decode worker endpoints have started:
+```
+curl http://localhost:8000/health
+```
+
+Make sure that both of the `generate` endpoints are available before sending an inference request:
+```
+{
+ "status": "healthy",
+ "endpoints": [
+ "dyn://dynamo.backend.generate"
+ ],
+ "instances": [
+ {
+ "component": "backend",
+ "endpoint": "generate",
+ "namespace": "dynamo",
+ "instance_id": 7587889712474989333,
+ "transport": {
+ "nats_tcp": "dynamo_backend.generate-694d997dbae9a315"
+ }
+ },
+ {
+ "component": "prefill",
+ "endpoint": "generate",
+ "namespace": "dynamo",
+ "instance_id": 7587889712474989350,
+ "transport": {
+ "nats_tcp": "dynamo_prefill.generate-694d997dbae9a326"
+ }
+ },
+ ...
+ ]
+}
+```
+
+If only one worker endpoint is listed, the other may still be starting up. Monitor the worker logs to track startup progress.
+
+### 3. Test the Deployment
+
+Send a test request to verify the deployment:
+
+```bash
+curl -X POST http://localhost:8000/v1/responses \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "openai/gpt-oss-120b",
+ "input": "Explain the concept of disaggregated serving in LLM inference in 3 sentences.",
+ "max_output_tokens": 200,
+ "stream": false
+ }'
+```
+
+The server exposes a standard OpenAI-compatible API endpoint that accepts JSON requests. You can adjust parameters like `max_tokens`, `temperature`, and others according to your needs.
+
+### 4. Reasoning and Tool Calling
+
+Dynamo supports reasoning and tool calling in the OpenAI Chat Completion endpoint. A typical workflow for an application built on top of Dynamo
+is that the application has a set of tools to help the assistant provide accurate answers, and it is usually
+multi-turn as it involves tool selection and generation based on the tool result. Below is an example
+of sending multi-round requests to complete a user query with reasoning and tool calling:
+
+**Application setup (pseudocode)**
+```Python
+# The tool defined by the application
+def get_system_health():
+ for component in system.components:
+ if not component.health():
+ return False
+ return True
+
+# The JSON representation of the declaration in ChatCompletion tool style
+tool_choice = '{
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
+ "parameters": {
+ "type": "object",
+ "properties": {}
+ }
+ }
+}'
+
+# On user query, perform below workflow.
+def user_query(app_request):
+ # first round
+ # create chat completion with prompt and tool choice
+ request = ...
+ response = send(request)
+
+ if response["finish_reason"] == "tool_calls":
+ # second round
+ function, params = parse_tool_call(response)
+ function_result = function(params)
+ # create request with prompt, assistant response, and function result
+ request = ...
+ response = send(request)
+ return app_response(response)
+```
+
+
+**First request with tools**
+```bash
+curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '
+{
+ "model": "openai/gpt-oss-120b",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Hey, quick check: is everything up and running?"
+ }
+ ],
+ "tools": [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
+ "parameters": {
+ "type": "object",
+ "properties": {}
+ }
+ }
+ }
+ ],
+ "response_format": {
+ "type": "text"
+ },
+ "stream": false,
+ "max_tokens": 300
+}'
+```
+**First response with tool choice**
+```JSON
+{
+ "id": "chatcmpl-d1c12219-6298-4c83-a6e3-4e7cef16e1a9",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "tool_calls": [
+ {
+ "id": "call-1",
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "arguments": "{}"
+ }
+ }
+ ],
+ "role": "assistant",
+ "reasoning_content": "We need to check system health. Use function."
+ },
+ "finish_reason": "tool_calls"
+ }
+ ],
+ "created": 1758758741,
+ "model": "openai/gpt-oss-120b",
+ "object": "chat.completion",
+ "usage": null
+}
+```
+**Second request with tool calling result**
+```bash
+curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '
+{
+ "model": "openai/gpt-oss-120b",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Hey, quick check: is everything up and running?"
+ },
+ {
+ "role": "assistant",
+ "tool_calls": [
+ {
+ "id": "call-1",
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "arguments": "{}"
+ }
+ }
+ ]
+ },
+ {
+ "role": "tool",
+ "tool_call_id": "call-1",
+ "content": "{\"status\":\"ok\",\"uptime_seconds\":372045}"
+ }
+ ],
+ "tools": [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_system_health",
+ "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
+ "parameters": {
+ "type": "object",
+ "properties": {}
+ }
+ }
+ }
+ ],
+ "response_format": {
+ "type": "text"
+ },
+ "stream": false,
+ "max_tokens": 300
+}'
+```
+**Second response with final message**
+```JSON
+{
+ "id": "chatcmpl-9ebfe64a-68b9-4c1d-9742-644cf770ad0e",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "All systems are green—everything’s up and running smoothly! 🚀 Let me know if you need anything else.",
+ "role": "assistant",
+ "reasoning_content": "The user asks: \"Hey, quick check: is everything up and running?\" We have just checked system health, it's ok. Provide friendly response confirming everything's up."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "created": 1758758853,
+ "model": "openai/gpt-oss-120b",
+ "object": "chat.completion",
+ "usage": null
+}
+```
\ No newline at end of file
diff --git a/fern/pages/backends/vllm/multi-node.md b/fern/pages/backends/vllm/multi-node.md
new file mode 100644
index 00000000000..22d1981ed75
--- /dev/null
+++ b/fern/pages/backends/vllm/multi-node.md
@@ -0,0 +1,95 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Multi-node Examples"
+---
+
+This guide covers deploying vLLM across multiple nodes using Dynamo's distributed capabilities.
+
+## Prerequisites
+
+Multi-node deployments require:
+- Multiple nodes with GPU resources
+- Network connectivity between nodes (the faster the better)
+- Firewall rules allowing NATS/ETCD communication
+
+## Infrastructure Setup
+
+### Step 1: Start NATS/ETCD on Head Node
+
+Start the required services on your head node. These endpoints must be accessible from all worker nodes:
+
+```bash
+# On head node (node-1)
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+Default ports:
+- NATS: 4222
+- ETCD: 2379
+
+### Step 2: Configure Environment Variables
+
+Set the head node IP address and service endpoints. **Set this on all nodes** for easy copy-paste:
+
+```bash
+# Set this on ALL nodes - replace with your actual head node IP
+export HEAD_NODE_IP=""
+
+# Service endpoints (set on all nodes)
+export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
+export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
+```
+
+## Deployment Patterns
+
+### Multi-node Aggregated Serving
+
+Deploy vLLM workers across multiple nodes for horizontal scaling:
+
+**Node 1 (Head Node)**: Run ingress and first worker
+```bash
+# Start ingress
+python -m dynamo.frontend --router-mode kv
+
+# Start vLLM worker
+python -m dynamo.vllm \
+ --model meta-llama/Llama-3.3-70B-Instruct \
+ --tensor-parallel-size 8 \
+ --enforce-eager
+```
+
+**Node 2**: Run additional worker
+```bash
+# Start vLLM worker
+python -m dynamo.vllm \
+ --model meta-llama/Llama-3.3-70B-Instruct \
+ --tensor-parallel-size 8 \
+ --enforce-eager
+```
+
+### Multi-node Disaggregated Serving
+
+Deploy prefill and decode workers on separate nodes for optimized resource utilization:
+
+**Node 1**: Run ingress and decode worker
+```bash
+# Start ingress
+python -m dynamo.frontend --router-mode kv &
+
+# Start decode worker
+python -m dynamo.vllm \
+ --model meta-llama/Llama-3.3-70B-Instruct \
+ --tensor-parallel-size 8 \
+ --enforce-eager
+```
+
+**Node 2**: Run prefill worker
+```bash
+# Start prefill worker
+python -m dynamo.vllm \
+ --model meta-llama/Llama-3.3-70B-Instruct \
+ --tensor-parallel-size 8 \
+ --enforce-eager \
+ --is-prefill-worker
+```
diff --git a/fern/pages/backends/vllm/prometheus.md b/fern/pages/backends/vllm/prometheus.md
new file mode 100644
index 00000000000..f15c920ad69
--- /dev/null
+++ b/fern/pages/backends/vllm/prometheus.md
@@ -0,0 +1,164 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "vLLM Prometheus Metrics"
+---
+
+## Overview
+
+When running vLLM through Dynamo, vLLM engine metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both vLLM engine metrics (prefixed with `vllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
+
+**For the complete and authoritative list of all vLLM metrics**, always refer to the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html).
+
+**For LMCache metrics and integration**, see the [LMCache Integration Guide](LMCache-Integration.md).
+
+**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).
+
+**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).
+
+## Environment Variables and Flags
+
+| Variable/Flag | Description | Default | Example |
+|---------------|-------------|---------|---------|
+| `DYN_SYSTEM_PORT` | System metrics/health port. Required to expose `/metrics` endpoint. | `-1` (disabled) | `8081` |
+| `--connector` | KV connector to use. Use `lmcache` to enable LMCache metrics. | `nixl` | `--connector lmcache` |
+
+## Getting Started Quickly
+
+This is a single machine example.
+
+### Start Observability Stack
+
+For visualizing metrics with Prometheus and Grafana, start the observability stack. See [Observability Getting Started](../../observability/README.md#getting-started-quickly) for instructions.
+
+### Launch Dynamo Components
+
+Launch a frontend and vLLM backend to test metrics:
+
+```bash
+# Start frontend (default port 8000, override with --http-port or DYN_HTTP_PORT env var)
+$ python -m dynamo.frontend
+
+# Enable system metrics server on port 8081
+$ DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B \
+    --enforce-eager --no-enable-prefix-caching --max-num-seqs 3
+```
+
+Wait for the vLLM worker to start, then send requests and check metrics:
+
+```bash
+# Send a request
+curl -H 'Content-Type: application/json' \
+-d '{
+ "model": "",
+ "max_completion_tokens": 100,
+ "messages": [{"role": "user", "content": "Hello"}]
+}' \
+http://localhost:8000/v1/chat/completions
+
+# Check metrics from the worker
+curl -s localhost:8081/metrics | grep "^vllm:"
+```
+
+## Exposed Metrics
+
+vLLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All vLLM engine metrics use the `vllm:` prefix and include labels (e.g., `model_name`, `finished_reason`, `scheduling_event`) to identify the source.
+
+**Example Prometheus Exposition Format text:**
+
+```
+# HELP vllm:request_success_total Number of successfully finished requests.
+# TYPE vllm:request_success_total counter
+vllm:request_success_total{finished_reason="length",model_name="meta-llama/Llama-3.1-8B"} 15.0
+vllm:request_success_total{finished_reason="stop",model_name="meta-llama/Llama-3.1-8B"} 150.0
+
+# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE vllm:time_to_first_token_seconds histogram
+vllm:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B"} 0.0
+vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B"} 5.0
+vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0
+vllm:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B"} 89.38
+```
+
+**Note:** The specific metrics shown above are examples and may vary depending on your vLLM version. Always inspect your actual `/metrics` endpoint or refer to the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) for the current list.
+
+### Metric Categories
+
+vLLM provides metrics in the following categories (all prefixed with `vllm:`):
+
+- **Request metrics** - Request success, failure, and completion tracking
+- **Performance metrics** - Latency, throughput, and timing measurements
+- **Resource usage** - System resource consumption
+- **Scheduler metrics** - Scheduling and queue management
+- **Disaggregation metrics** - Metrics specific to disaggregated deployments (when enabled)
+
+**Note:** Specific metrics are subject to change between vLLM versions. Always refer to the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) or inspect the `/metrics` endpoint for your vLLM version.
+
+## Available Metrics
+
+The official vLLM documentation includes complete metric definitions with:
+- Detailed explanations and design rationale
+- Counter, Gauge, and Histogram metric types
+- Metric labels (e.g., `model_name`, `finished_reason`, `scheduling_event`)
+- Information about v1 metrics migration
+- Future work and deprecated metrics
+
+For the complete and authoritative list of all vLLM metrics, see the [official vLLM Metrics Design documentation](https://docs.vllm.ai/en/latest/design/metrics.html).
+
+## LMCache Metrics
+
+When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set, LMCache metrics (prefixed with `lmcache:`) are automatically exposed via Dynamo's `/metrics` endpoint alongside vLLM and Dynamo metrics.
+
+### Minimum Requirements
+
+To access LMCache metrics, both of these are required:
+1. `--connector lmcache` - Enables LMCache in vLLM
+2. `DYN_SYSTEM_PORT=8081` - Enables Dynamo's metrics HTTP endpoint
+
+**Example:**
+```bash
+DYN_SYSTEM_PORT=8081 \
+python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
+```
+
+### Viewing LMCache Metrics
+
+```bash
+# View all LMCache metrics
+curl -s localhost:8081/metrics | grep "^lmcache:"
+```
+
+### Troubleshooting
+
+Troubleshooting LMCache-related metrics and logs (including `PrometheusLogger instance already created with different metadata` and `PROMETHEUS_MULTIPROC_DIR` warnings) is documented in:
+
+- [LMCache Integration Guide](LMCache-Integration.md#troubleshooting)
+
+**For complete LMCache configuration and metric details**, see:
+- [LMCache Integration Guide](LMCache-Integration.md) - Setup and configuration
+- [LMCache Observability Documentation](https://docs.lmcache.ai/production/observability/vllm_endpoint.html) - Complete metrics reference
+
+## Implementation Details
+
+- vLLM v1 uses multiprocess metrics collection via `prometheus_client.multiprocess`
+- `PROMETHEUS_MULTIPROC_DIR`: (optional). By default, Dynamo automatically manages this environment variable, setting it to a temporary directory where multiprocess metrics are stored as memory-mapped files. Each worker process writes its metrics to separate files in this directory, which are aggregated when `/metrics` is scraped. Users only need to set this explicitly where complete control over the metrics directory is required.
+- Dynamo uses `MultiProcessCollector` to aggregate metrics from all worker processes
+- Metrics are filtered by the `vllm:` and `lmcache:` prefixes before being exposed (when LMCache is enabled)
+- The integration uses Dynamo's `register_engine_metrics_callback()` function with the global `REGISTRY`
+- Metrics appear after vLLM engine initialization completes
+- vLLM v1 metrics are different from v0 - see the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) for migration details
+
+## Related Documentation
+
+### vLLM Metrics
+- [Official vLLM Metrics Design Documentation](https://docs.vllm.ai/en/latest/design/metrics.html)
+- [vLLM Production Metrics User Guide](https://docs.vllm.ai/en/latest/usage/metrics.html)
+- [vLLM GitHub - Metrics Implementation](https://github.com/vllm-project/vllm/tree/main/vllm/v1/metrics)
+
+### Dynamo Metrics
+- [Dynamo Metrics Guide](../../observability/metrics.md) - Complete documentation on Dynamo runtime metrics
+- [Prometheus and Grafana Setup](../../observability/prometheus-grafana.md) - Visualization setup instructions
+- Dynamo runtime metrics (prefixed with `dynamo_*`) are available at the same `/metrics` endpoint alongside vLLM metrics
+ - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics)
+ - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants)
+ - Integration code: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration
diff --git a/fern/pages/backends/vllm/prompt-embeddings.md b/fern/pages/backends/vllm/prompt-embeddings.md
new file mode 100644
index 00000000000..09221f5498e
--- /dev/null
+++ b/fern/pages/backends/vllm/prompt-embeddings.md
@@ -0,0 +1,253 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Prompt Embeddings"
+---
+
+Dynamo supports prompt embeddings (also known as prompt embeds) as a secure alternative input method to traditional text prompts. By allowing applications to use pre-computed embeddings for inference, this feature not only offers greater flexibility in prompt engineering but also significantly enhances privacy and data security. With prompt embeddings, sensitive user data can be transformed into embeddings before ever reaching the inference server, reducing the risk of exposing confidential information during the AI workflow.
+
+
+## How It Works
+
+| Path | What Happens |
+|------|--------------|
+| **Text prompt** | Tokenize → Embedding Layer → Transformer |
+| **Prompt embeds** | Validate → Bypass Embedding → Transformer |
+
+
+## Architecture
+
+```mermaid
+flowchart LR
+ subgraph FE["Frontend (Rust)"]
+ A[Request] --> B{prompt_embeds?}
+ B -->|No| C[🔴 Tokenize text]
+ B -->|Yes| D[🟢 Validate base64+size]
+ C --> E[token_ids, ISL=N]
+ D --> F[token_ids=empty, skip ISL]
+ end
+
+ subgraph RT["Router (NATS)"]
+ G[Route PreprocessedRequest]
+ end
+
+ subgraph WK["Worker (Python)"]
+ H[TokensPrompt#40;token_ids#41;]
+ I[Decode → EmbedsPrompt#40;tensor#41;]
+ end
+
+ subgraph VLLM["vLLM Engine"]
+ J[🔴 Embedding Layer]
+ K[🟢 Bypass Embedding]
+ L[Transformer Layers]
+ M[LM Head → Response]
+ end
+
+ E --> G
+ F --> G
+ G -->|Normal| H
+ G -->|Embeds| I
+ H --> J --> L
+ I --> K --> L
+ L --> M
+```
+
+| Layer | **Normal Flow** | **Prompt Embeds** |
+|---|---|---|
+| **Frontend (Rust)** | 🔴 Tokenize text → token_ids, compute ISL | 🟢 Validate base64+size, skip tokenization |
+| **Router (NATS)** | Forward token_ids in PreprocessedRequest | Forward prompt_embeds string |
+| **Worker (Python)** | `TokensPrompt(token_ids)` | Decode base64 → `EmbedsPrompt(tensor)` |
+| **vLLM Engine** | 🔴 Embedding Layer → Transformer | 🟢 Bypass Embedding → Transformer |
+
+
+## Quick Start
+
+Send pre-computed prompt embeddings directly to vLLM, bypassing tokenization.
+
+### 1. Enable Feature
+
+```bash
+python -m dynamo.vllm --model meta-llama/Meta-Llama-3.1-8B-Instruct --enable-prompt-embeds
+```
+
+> **Required:** The `--enable-prompt-embeds` flag must be set or requests will fail.
+
+### 2. Send Request
+
+```python
+import torch
+import base64
+import io
+from openai import OpenAI
+
+# Prepare embeddings (sequence_length, hidden_dim)
+embeddings = torch.randn(10, 4096, dtype=torch.float32)
+
+# Encode
+buffer = io.BytesIO()
+torch.save(embeddings, buffer)
+buffer.seek(0)
+embeddings_base64 = base64.b64encode(buffer.read()).decode()
+
+# Send
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+response = client.completions.create(
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+ prompt="", # Can be empty or present; prompt_embeds takes precedence
+ max_tokens=100,
+ extra_body={"prompt_embeds": embeddings_base64}
+)
+```
+
+## Configuration
+
+### Docker Compose
+
+```yaml
+vllm-worker:
+ command:
+ - python
+ - -m
+ - dynamo.vllm
+ - --model
+ - meta-llama/Meta-Llama-3.1-8B-Instruct
+ - --enable-prompt-embeds # Add this
+```
+
+### Kubernetes
+
+```yaml
+extraPodSpec:
+ mainContainer:
+ args:
+ - "--model"
+ - "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ - "--enable-prompt-embeds" # Add this
+```
+
+### NATS Configuration
+
+NATS needs 15MB payload limit (already configured in default deployments):
+
+```yaml
+# Docker Compose - deploy/docker-compose.yml
+nats-server:
+ command: ["-js", "--trace", "-m", "8222", "--max_payload", "15728640"]
+
+# Kubernetes - deploy/cloud/helm/platform/values.yaml
+nats:
+ config:
+ merge:
+ max_payload: 15728640
+```
+
+## API Reference
+
+### Request
+
+```json
+{
+ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "prompt": "",
+ "prompt_embeds": "",
+ "max_tokens": 100
+}
+```
+
+**Requirements:**
+- **Format:** PyTorch tensor serialized with `torch.save()` and base64-encoded
+- **Size:** 100 bytes - 10MB (decoded)
+- **Shape:** `(seq_len, hidden_dim)` or `(batch, seq_len, hidden_dim)`
+- **Dtype:** `torch.float32` (recommended)
+
+**Field Precedence:**
+- Both `prompt` and `prompt_embeds` can be provided in the same request
+- When both are present, **`prompt_embeds` takes precedence** and `prompt` is ignored
+- The `prompt` field can be empty (`""`) when using `prompt_embeds`
+
+### Response
+
+Standard OpenAI format with accurate usage:
+
+```json
+{
+ "usage": {
+ "prompt_tokens": 10, // Extracted from embedding shape
+ "completion_tokens": 15,
+ "total_tokens": 25
+ }
+}
+```
+
+## Errors
+
+| Error | Fix |
+|-------|-----|
+| `ValueError: You must set --enable-prompt-embeds` | Add `--enable-prompt-embeds` to worker |
+| `prompt_embeds must be valid base64` | Use `.decode('utf-8')` after `base64.b64encode()` |
+| `decoded data must be at least 100 bytes` | Increase sequence length |
+| `exceeds maximum size of 10MB` | Reduce sequence length |
+| `must be a torch.Tensor` | Use `torch.save()` not NumPy |
+| `size of tensor must match` | Use correct hidden dimension for model |
+
+## Examples
+
+### Streaming
+
+```python
+stream = client.completions.create(
+ model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+ prompt="",
+ max_tokens=100,
+ stream=True,
+ extra_body={"prompt_embeds": embeddings_base64}
+)
+
+for chunk in stream:
+ if chunk.choices:
+ print(chunk.choices[0].text, end="", flush=True)
+```
+
+### Load from File
+
+```python
+embeddings = torch.load("embeddings.pt")
+
+buffer = io.BytesIO()
+torch.save(embeddings, buffer)
+buffer.seek(0)
+embeddings_base64 = base64.b64encode(buffer.read()).decode()
+
+# Use in request...
+```
+
+
+## Limitations
+
+- ❌ Requires `--enable-prompt-embeds` flag (disabled by default)
+- ❌ PyTorch format only (NumPy not supported)
+- ❌ 10MB decoded size limit
+- ❌ Cannot mix with multimodal data (images/video)
+
+## Testing
+
+Comprehensive test coverage ensures reliability:
+
+- **Unit Tests:** 31 tests (11 Rust + 20 Python)
+ - Validation, decoding, format handling, error cases, usage statistics
+- **Integration Tests:** 21 end-to-end tests
+ - Core functionality, performance, formats, concurrency, usage statistics
+
+Run integration tests:
+```bash
+# Start worker with flag
+python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enable-prompt-embeds
+
+# Run tests
+pytest tests/integration/test_prompt_embeds_integration.py -v
+```
+
+## See Also
+
+- [vLLM Backend](README.md)
+- [vLLM Configuration](README.md#configuration)
diff --git a/fern/pages/backends/vllm/speculative-decoding.md b/fern/pages/backends/vllm/speculative-decoding.md
new file mode 100644
index 00000000000..06ae5f5a90a
--- /dev/null
+++ b/fern/pages/backends/vllm/speculative-decoding.md
@@ -0,0 +1,109 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Running **Meta-Llama-3.1-8B-Instruct** with Speculative Decoding (Eagle3)"
+---
+
+This guide walks through how to deploy **Meta-Llama-3.1-8B-Instruct** using **aggregated speculative decoding** with **Eagle3** on a single node.
+Since the model is only **8B parameters**, you can run it on **any GPU with at least 16GB VRAM**.
+
+
+
+## Step 1: Set Up Your Docker Environment
+
+First, we’ll initialize a Docker container using the VLLM backend.
+You can refer to the [VLLM Quickstart Guide](README.md#vllm-quick-start) — or follow the full steps below.
+
+### 1. Launch Docker Compose
+
+```bash
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+### 2. Build the Container
+
+```bash
+./container/build.sh --framework VLLM
+```
+
+### 3. Run the Container
+
+```bash
+./container/run.sh -it --framework VLLM --mount-workspace
+```
+
+
+
+## Step 2: Get Access to the Llama-3 Model
+
+The **Meta-Llama-3.1-8B-Instruct** model is gated, so you’ll need to request access on Hugging Face.
+Go to the official [Meta-Llama-3.1-8B-Instruct repository](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and fill out the access form.
+Approval usually takes around **5 minutes**.
+
+Once you have access, generate a **Hugging Face access token** with permission for gated repositories, then set it inside your container:
+
+```bash
+export HUGGING_FACE_HUB_TOKEN="insert_your_token_here"
+export HF_TOKEN=$HUGGING_FACE_HUB_TOKEN
+```
+
+
+
+## Step 3: Run Aggregated Speculative Decoding
+
+Now that your environment is ready, start the aggregated server with **speculative decoding**.
+
+```bash
+# Requires only one GPU
+cd examples/backends/vllm
+bash launch/agg_spec_decoding.sh
+```
+
+Once the weights finish downloading and serving begins, you’ll be ready to send inference requests to your model.
+
+
+
+
+## Step 4: Example Request
+
+To verify your setup, try sending a simple prompt to your model:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "messages": [
+ {"role": "user", "content": "Write a poem about why Sakura trees are beautiful."}
+ ],
+ "max_tokens": 250
+ }'
+```
+
+### Example Output
+
+```json
+{
+ "id": "cmpl-3e87ea5c-010e-4dd2-bcc4-3298ebd845a8",
+ "choices": [
+ {
+ "text": "In cherry blossom’s gentle breeze ... A delicate balance of life and death, as petals fade, and new life breathes.",
+ "index": 0,
+ "finish_reason": "stop"
+ }
+ ],
+ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "usage": {
+ "prompt_tokens": 16,
+ "completion_tokens": 250,
+ "total_tokens": 266
+ }
+}
+```
+
+
+
+## Additional Resources
+
+* [VLLM Quickstart](README.md#vllm-quick-start)
+* [Meta-Llama-3.1-8B-Instruct on Hugging Face](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
\ No newline at end of file
diff --git a/fern/pages/benchmarks/benchmarking.md b/fern/pages/benchmarks/benchmarking.md
new file mode 100644
index 00000000000..ef9d2478696
--- /dev/null
+++ b/fern/pages/benchmarks/benchmarking.md
@@ -0,0 +1,530 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Dynamo Benchmarking Guide"
+---
+
+This benchmarking framework lets you compare performance across any combination of:
+- **DynamoGraphDeployments**
+- **External HTTP endpoints** (existing services deployed following standard documentation from vLLM, llm-d, AIBrix, etc.)
+
+## Choosing Your Benchmarking Approach
+
+Dynamo provides two benchmarking approaches to suit different use cases: **client-side** and **server-side**. Client-side refers to running benchmarks on your local machine and connecting to Kubernetes deployments via port-forwarding, while server-side refers to running benchmarks directly within the Kubernetes cluster using internal service URLs. Which method to use depends on your use case.
+
+**TLDR:**
+Need high performance/load testing? Server-side.
+Just quick testing/comparison? Client-side.
+
+### Use Client-Side Benchmarking When:
+- You want to quickly test deployments
+- You want immediate access to results on your local machine
+- You're comparing external services or deployments (not necessarily just Dynamo deployments)
+- You need to run benchmarks from your laptop/workstation
+
+→ **[Go to Client-Side Benchmarking (Local)](#client-side-benchmarking-local)**
+
+### Use Server-Side Benchmarking When:
+- You have a development environment with kubectl access
+- You're doing performance validation with high load/speed requirements
+- You're experiencing timeouts or performance issues with client-side benchmarking
+- You want optimal network performance (no port-forwarding overhead)
+- You're running automated CI/CD pipelines
+- You need isolated execution environments
+- You're doing resource-intensive benchmarking
+- You want persistent result storage in the cluster
+
+→ **[Go to Server-Side Benchmarking (In-Cluster)](#server-side-benchmarking-in-cluster)**
+
+### Quick Comparison
+
+| Feature | Client-Side | Server-Side |
+|---------|-------------|-------------|
+| **Location** | Your local machine | Kubernetes cluster |
+| **Network** | Port-forwarding required | Direct service DNS |
+| **Setup** | Quick and simple | Requires cluster resources |
+| **Performance** | Limited by local resources, may timeout under high load | Optimal cluster performance, handles high load |
+| **Isolation** | Shared environment | Isolated job execution |
+| **Results** | Local filesystem | Persistent volumes |
+| **Best for** | Light load | High load |
+
+## What This Tool Does
+
+The framework is a Python-based wrapper around `aiperf` that:
+- Benchmarks any HTTP endpoints
+- Runs concurrency sweeps across configurable load levels
+- Generates comparison plots with your custom labels
+- Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.)
+- Provides direct Python script execution for maximum flexibility
+
+**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
+
+**Important**: The `--model` parameter configures AIPerf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).
+
+---
+
+## Client-Side Benchmarking (Local)
+
+Client-side benchmarking runs on your local machine and connects to Kubernetes deployments via port-forwarding.
+
+## Prerequisites
+
+1. **Dynamo container environment** - You must be running inside a Dynamo container with the benchmarking tools pre-installed.
+
+2. **HTTP endpoints** - Ensure you have HTTP endpoints available for benchmarking. These can be:
+ - DynamoGraphDeployments exposed via HTTP endpoints
+ - External services (vLLM, llm-d, AIBrix, etc.)
+ - Any HTTP endpoint serving HuggingFace-compatible models
+
+3. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using:
+ ```bash
+ pip install -r deploy/utils/requirements.txt
+ ```
+
+## User Workflow
+
+Follow these steps to benchmark Dynamo deployments using client-side benchmarking:
+
+### Step 1: Establish Kubernetes Cluster and Install Dynamo
+Set up your Kubernetes cluster with NVIDIA GPUs and install the Dynamo Kubernetes Platform. First follow the [installation guide](../kubernetes/installation-guide.md) to install Dynamo Kubernetes Platform, then use [deploy/utils/README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/utils/README.md) to set up benchmarking resources.
+
+### Step 2: Deploy DynamoGraphDeployments
+Deploy your DynamoGraphDeployments separately using the [deployment documentation](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/). Each deployment should have a frontend service exposed.
+
+### Step 3: Port-Forward and Benchmark Deployment A
+```bash
+# Port-forward the frontend service for deployment A
+kubectl port-forward -n <namespace> svc/<frontend-service-name> 8000:8000 > /dev/null 2>&1 &
+# Note: remember to stop the port-forward process after benchmarking.
+
+# Benchmark deployment A using Python scripts
+python3 -m benchmarks.utils.benchmark \
+ --benchmark-name deployment-a \
+ --endpoint-url http://localhost:8000 \
+ --model "your-model-name" \
+ --output-dir ./benchmarks/results
+```
+
+### Step 4: [If Comparative] Teardown Deployment A and Establish Deployment B
+If comparing multiple deployments, teardown deployment A and deploy deployment B with a different configuration.
+
+### Step 5: [If Comparative] Port-Forward and Benchmark Deployment B
+```bash
+# Port-forward the frontend service for deployment B
+kubectl port-forward -n <namespace> svc/<frontend-service-name> 8001:8000 > /dev/null 2>&1 &
+
+# Benchmark deployment B using Python scripts
+python3 -m benchmarks.utils.benchmark \
+ --benchmark-name deployment-b \
+ --endpoint-url http://localhost:8001 \
+ --model "your-model-name" \
+ --output-dir ./benchmarks/results
+```
+
+### Step 6: Generate Summary and Visualization
+```bash
+# Generate plots and summary using Python plotting script
+python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results
+
+# Or plot only specific benchmark experiments
+python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results --benchmark-name experiment-a --benchmark-name experiment-b
+```
+
+## Use Cases
+
+The benchmarking framework supports various comparative analysis scenarios:
+
+- **Compare multiple DynamoGraphDeployments of a single backend** (e.g., aggregated vs disaggregated configurations)
+- **Compare different backends** (e.g., vLLM vs TensorRT-LLM vs SGLang)
+- **Compare Dynamo vs other platforms** (e.g., Dynamo vs llm-d vs AIBrix)
+- **Compare different models** (e.g., Llama-3-8B vs Llama-3-70B vs Qwen-3-0.6B)
+- **Compare different hardware configurations** (e.g., H100 vs A100 vs H200)
+- **Compare different parallelization strategies** (e.g., different GPU counts or memory configurations)
+
+## Configuration and Usage
+
+### Command Line Options
+
+```bash
+python3 -m benchmarks.utils.benchmark --benchmark-name <NAME> --endpoint-url <URL> [OPTIONS]
+
+REQUIRED:
+ --benchmark-name NAME Name/label for this benchmark (used in plots and results)
+ --endpoint-url URL HTTP endpoint URL to benchmark (e.g., http://localhost:8000)
+
+OPTIONS:
+ -h, --help Show help message and examples
+ -m, --model MODEL Model name for AIPerf configuration and logging (default: Qwen/Qwen3-0.6B)
+ NOTE: This must match the model deployed at the endpoint
+ -i, --isl LENGTH Input sequence length (default: 2000)
+ -s, --std STDDEV Input sequence standard deviation (default: 10)
+ -o, --osl LENGTH Output sequence length (default: 256)
+ -d, --output-dir DIR Output directory (default: ./benchmarks/results)
+ --verbose Enable verbose output
+```
+
+### Important Notes
+
+- **Benchmark Name**: The benchmark name becomes the label in plots and results
+- **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved.
+- **Port-Forwarding**: You must have an exposed endpoint before benchmarking
+- **Model Parameter**: The `--model` parameter configures AIPerf for testing and logging, and must match the model deployed at the endpoint
+- **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately
+
+### What Happens During Benchmarking
+
+The Python benchmarking module:
+1. **Connects** to your port-forwarded endpoint
+2. **Benchmarks** using AIPerf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
+3. **Measures** key metrics: latency, throughput, time-to-first-token
+4. **Saves** results to an output directory organized by benchmark name
+
+The Python plotting module:
+1. **Generates** comparison plots using your benchmark name in `/plots/`
+2. **Creates** summary statistics and visualizations
+
+### Plotting Options
+
+The plotting script supports several options for customizing which experiments to visualize:
+
+```bash
+# Plot all benchmark experiments in the data directory
+python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results
+
+# Plot only specific benchmark experiments
+python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results --benchmark-name experiment-a --benchmark-name experiment-b
+
+# Specify custom output directory for plots
+python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results --output-dir ./custom-plots
+```
+
+**Available Options:**
+- `--data-dir`: Directory containing benchmark results (required)
+- `--benchmark-name`: Specific benchmark experiment name to plot (can be specified multiple times). Names must match subdirectory names under the data dir.
+- `--output-dir`: Custom output directory for plots (defaults to data-dir/plots)
+
+**Note**: If `--benchmark-name` is not specified, the script will plot all subdirectories found in the data directory.
+
+### Using Your Own Models and Configuration
+
+The benchmarking framework supports any HuggingFace-compatible LLM model. Specify your model in the benchmark script's `--model` parameter. It must match the model name of the deployment. You can override the default sequence lengths (2000/256 tokens) with `--isl` and `--osl` flags if needed for your specific workload.
+
+The benchmarking framework is built around Python modules that provide direct control over the benchmark workflow. The Python benchmarking module connects to your existing endpoints, runs the benchmarks, and can generate plots. Deployment is user-managed and out of scope for this tool.
+
+### Comparison Limitations
+
+The plotting system supports up to 12 different benchmarks in a single comparison.
+
+### Concurrency Configuration
+
+You can customize the concurrency levels using the CONCURRENCIES environment variable:
+
+```bash
+# Custom concurrency levels
+CONCURRENCIES="1,5,20,50" python3 -m benchmarks.utils.benchmark \
+ --benchmark-name my-test \
+ --endpoint-url http://localhost:8000
+
+# Or set permanently
+export CONCURRENCIES="1,2,5,10,25,50,100"
+python3 -m benchmarks.utils.benchmark \
+ --benchmark-name test \
+ --endpoint-url http://localhost:8000
+```
+
+## Understanding Your Results
+
+After benchmarking completes, check `./benchmarks/results/` (or your custom output directory):
+
+### Plot Labels and Organization
+
+The plotting script uses the `--benchmark-name` as the experiment name in all generated plots. For example:
+- `--benchmark-name aggregated` → plots will show "aggregated" as the label
+- `--benchmark-name vllm-disagg` → plots will show "vllm-disagg" as the label
+
+This allows you to easily identify and compare different configurations in the visualization plots.
+
+### Summary and Plots
+
+```text
+benchmarks/results/plots
+├── SUMMARY.txt # Quick overview of all results
+├── p50_inter_token_latency_vs_concurrency.png # Token generation speed
+├── avg_time_to_first_token_vs_concurrency.png # Response time
+├── request_throughput_vs_concurrency.png # Requests per second
+├── efficiency_tok_s_gpu_vs_user.png # GPU efficiency
+└── avg_inter_token_latency_vs_concurrency.png # Average latency
+```
+
+### Data Files
+
+Raw data is organized by deployment/benchmark type and concurrency level:
+
+**For Any Benchmarking (uses your custom benchmark name):**
+```text
+results/ # Client-side: ./benchmarks/results/ or custom dir
+├── plots/ # Server-side: /data/results/
+│ ├── SUMMARY.txt # Performance visualization plots
+│ ├── p50_inter_token_latency_vs_concurrency.png
+│ ├── avg_inter_token_latency_vs_concurrency.png
+│ ├── request_throughput_vs_concurrency.png
+│ ├── efficiency_tok_s_gpu_vs_user.png
+│ └── avg_time_to_first_token_vs_concurrency.png
+├── <benchmark-name>/              # Results for your benchmark (uses your custom name)
+│ ├── c1/ # Concurrency level 1
+│ │ └── profile_export_aiperf.json
+│ ├── c2/ # Concurrency level 2
+│ ├── c5/ # Concurrency level 5
+│ └── ... # Other concurrency levels (10, 50, 100, 250)
+└── <another-benchmark-name>/      # Results for additional benchmarking runs
+ └── c*/ # Same structure as above
+```
+
+**Example with actual benchmark names:**
+```text
+results/
+├── plots/
+├── experiment-a/ # --benchmark-name experiment-a
+├── experiment-b/ # --benchmark-name experiment-b
+└── experiment-c/ # --benchmark-name experiment-c
+```
+
+Each concurrency directory contains:
+- **`profile_export_aiperf.json`** - Structured metrics from AIPerf
+- **`profile_export_aiperf.csv`** - CSV format metrics from AIPerf
+- **`profile_export.json`** - Raw AIPerf results
+- **`inputs.json`** - Generated test inputs
+
+---
+
+## Server-Side Benchmarking (In-Cluster)
+
+Server-side benchmarking runs directly within the Kubernetes cluster, eliminating the need for port forwarding and providing better resource utilization.
+
+## What Server-Side Benchmarking Does
+
+The server-side benchmarking solution:
+- Runs benchmarks directly within the Kubernetes cluster using internal service URLs
+- Uses Kubernetes service DNS for direct communication (no port forwarding required)
+- Leverages the existing benchmarking infrastructure (`benchmarks.utils.benchmark`)
+- Stores results persistently using `dynamo-pvc`
+- Provides isolated execution environment with configurable resources
+- Handles high load/speed requirements without timeout issues
+- **Note**: Each benchmark job runs within a single Kubernetes namespace, but can benchmark services across multiple namespaces using the full DNS format `svc_name.namespace.svc.cluster.local`
+
+## Prerequisites
+
+1. **Kubernetes cluster** with NVIDIA GPUs and Dynamo namespace setup (see [Dynamo Kubernetes Platform docs](../kubernetes/README.md))
+2. **Storage** PersistentVolumeClaim configured with appropriate permissions (see [deploy/utils README](https://github.com/ai-dynamo/dynamo/tree/main/deploy/utils/README.md))
+3. **Docker image** containing the Dynamo benchmarking tools
+
+## Quick Start
+
+### Step 1: Deploy Your DynamoGraphDeployment
+Deploy your DynamoGraphDeployment using the [deployment documentation](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/). Ensure it has a frontend service exposed.
+
+### Step 2: Deploy and Run Benchmark Job
+
+**Note**: The server-side benchmarking job requires a Docker image containing the Dynamo benchmarking tools. Before the 0.5.1 release, you must build your own Docker image using the [container build instructions](https://github.com/ai-dynamo/dynamo/tree/main/container/README.md), push it to your container registry, then update the `image` field in `benchmarks/incluster/benchmark_job.yaml` to use your built image tag.
+
+```bash
+export NAMESPACE=benchmarking
+
+# Deploy the benchmark job with default settings
+kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE
+
+# Monitor the job, wait for it to complete
+kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
+```
+
+#### Customize the job configuration
+
+To customize the benchmark parameters, edit the `benchmarks/incluster/benchmark_job.yaml` file and modify:
+
+- **Model name**: Change `"Qwen/Qwen3-0.6B"` in the args section
+- **Benchmark name**: Change `"qwen3-0p6b-vllm-agg"` to your desired benchmark name
+- **Service URL**: Change `"vllm-agg-frontend:8000"` so the service URL matches your deployed service
+- **Docker image**: Change the image field if needed
+
+Then deploy:
+```bash
+kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE
+```
+
+### Step 3: Retrieve Results
+```bash
+# Create access pod (skip this step if access pod is already running)
+kubectl apply -f deploy/utils/manifests/pvc-access-pod.yaml -n $NAMESPACE
+kubectl wait --for=condition=Ready pod/pvc-access-pod -n $NAMESPACE --timeout=60s
+
+# Download the results
+kubectl cp $NAMESPACE/pvc-access-pod:/data/results/ ./benchmarks/results/
+
+# Cleanup
+kubectl delete pod pvc-access-pod -n $NAMESPACE
+```
+
+### Step 4: Generate Plots
+```bash
+# Generate performance plots from the downloaded results
+python3 -m benchmarks.utils.plot \
+ --data-dir ./benchmarks/results
+```
+
+This will create visualization plots. For more details on interpreting these plots, see the [Summary and Plots](#summary-and-plots) section above.
+
+## Cross-Namespace Service Access
+
+Server-side benchmarking can benchmark services across multiple namespaces from a single job using Kubernetes DNS. When referencing services in other namespaces, use the full DNS format:
+
+```bash
+# Access service in same namespace
+SERVICE_URL=vllm-agg-frontend:8000
+
+# Access service in different namespace
+SERVICE_URL=vllm-agg-frontend.production.svc.cluster.local:8000
+```
+
+**DNS Format**: `<service-name>.<namespace>.svc.cluster.local:<port>`
+
+This allows you to:
+- Benchmark multiple services across different namespaces in a single job
+- Compare services running in different environments (dev, staging, production)
+- Test cross-namespace integrations without port-forwarding
+- Run comprehensive cross-namespace performance comparisons
+
+## Configuration
+
+The benchmark job is configured directly in the YAML file.
+
+### Default Configuration
+
+- **Model**: `Qwen/Qwen3-0.6B`
+- **Benchmark Name**: `qwen3-0p6b-vllm-agg`
+- **Service**: `vllm-agg-frontend:8000`
+- **Docker Image**: `nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag`
+
+### Customizing the Job
+
+To customize the benchmark, edit `benchmarks/incluster/benchmark_job.yaml`:
+
+1. **Change the model**: Update the `--model` argument
+2. **Change the benchmark name**: Update the `--benchmark-name` argument
+3. **Change the service URL**: Update the `--endpoint-url` argument (use `<service-name>.<namespace>.svc.cluster.local:<port>` for cross-namespace access)
+4. **Change Docker image**: Update the image field if needed
+
+### Example: Multi-Namespace Benchmarking
+
+To benchmark services across multiple namespaces, you would need to run separate benchmark jobs for each service since the format supports one benchmark per job. However, the results are stored in the same PVC and may be accessed together.
+
+```yaml
+# Job 1: Production service
+args:
+ - --model
+ - "Qwen/Qwen3-0.6B"
+ - --benchmark-name
+ - "prod-vllm"
+ - --endpoint-url
+ - "vllm-agg-frontend.production.svc.cluster.local:8000"
+ - --output-dir
+ - /data/results
+
+# Job 2: Staging service
+args:
+ - --model
+ - "Qwen/Qwen3-0.6B"
+ - --benchmark-name
+ - "staging-vllm"
+ - --endpoint-url
+ - "vllm-agg-frontend.staging.svc.cluster.local:8000"
+ - --output-dir
+ - /data/results
+```
+
+## Understanding Your Results
+
+Results are stored in `/data/results` and follow the same structure as client-side benchmarking:
+
+```text
+/data/results/
+└── <benchmark-name>/              # Results for your benchmark name
+ ├── c1/ # Concurrency level 1
+ │ └── profile_export_aiperf.json
+ ├── c2/ # Concurrency level 2
+ └── ... # Other concurrency levels
+```
+
+## Monitoring and Debugging
+
+### Check Job Status
+```bash
+kubectl describe job dynamo-benchmark -n $NAMESPACE
+```
+
+### View Logs
+```bash
+# Follow logs in real-time
+kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
+```
+
+### Debug Failed Jobs
+```bash
+# Check pod status
+kubectl get pods -n $NAMESPACE -l job-name=dynamo-benchmark
+
+# Describe failed pod
+kubectl describe pod <pod-name> -n $NAMESPACE
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Service not found**: Ensure your DynamoGraphDeployment frontend service is running
+2. **PVC access**: Check that `dynamo-pvc` is properly configured and accessible
+3. **Image pull issues**: Ensure the Docker image is accessible from the cluster
+4. **Resource constraints**: Adjust resource limits if the job is being evicted
+
+### Debug Commands
+
+```bash
+# Check PVC status
+kubectl get pvc dynamo-pvc -n $NAMESPACE
+
+# Check service endpoints
+kubectl get svc -n $NAMESPACE
+
+# Verify your service exists and has endpoints
+SVC_NAME="${SERVICE_URL%%:*}"
+kubectl get svc "$SVC_NAME" -n "$NAMESPACE"
+kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE"
+```
+
+---
+
+## Customize Benchmarking Behavior
+
+The built-in Python workflow connects to endpoints, benchmarks with aiperf, and generates plots. If you want to modify the behavior:
+
+1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection
+
+2. **Generate different plots**: Modify `benchmarks/utils/plot.py` to generate a different set of plots for whatever you wish to visualize.
+
+3. **Direct module usage**: Use individual Python modules (`benchmarks.utils.benchmark`, `benchmarks.utils.plot`) for granular control over each step of the benchmarking process.
+
+The Python benchmarking module provides a complete end-to-end benchmarking experience with full control over the workflow.
+
+---
+
+## Testing with Mocker Backend
+
+For development and testing purposes, Dynamo provides a [mocker backend](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/mocker/) that simulates LLM inference without requiring actual GPU resources. This is useful for:
+
+- **Testing deployments** without expensive GPU infrastructure
+- **Developing and debugging** router, planner, or frontend logic
+- **CI/CD pipelines** that need to validate infrastructure without model execution
+- **Benchmarking framework validation** to ensure your setup works before using real backends
+
+The mocker backend mimics the API and behavior of real backends (vLLM, SGLang, TensorRT-LLM) but generates mock responses instead of running actual inference.
+
+See the [mocker directory](https://github.com/ai-dynamo/dynamo/tree/main/components/src/dynamo/mocker/) for usage examples and configuration options.
diff --git a/fern/pages/benchmarks/kv-router-ab-testing.md b/fern/pages/benchmarks/kv-router-ab-testing.md
new file mode 100644
index 00000000000..28cd4db92b7
--- /dev/null
+++ b/fern/pages/benchmarks/kv-router-ab-testing.md
@@ -0,0 +1,801 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Dynamo KV Smart Router A/B Benchmarking Guide"
+---
+
+This guide walks you through setting up and running A/B benchmarks to compare Dynamo's KV Smart Router against standard round-robin routing on a Kubernetes cluster.
+
+## Overview
+Dynamo's KV Smart Router intelligently routes requests based on KV cache affinity, improving performance for workloads with shared prompt prefixes. This guide helps you:
+
+1. Deploy two identical Dynamo configurations:
+ a. A vllm server for Qwen3-32B with 8 workers (aggregated) **WITHOUT** KV Smart Router enabled
+ b. A vllm server for Qwen3-32B with 8 workers (aggregated) **WITH** KV Smart Router enabled
+2. Run controlled benchmarks using AIPerf
+3. Compare performance metrics to evaluate KV router effectiveness
+
+**Prerequisites:** Kubernetes cluster with GPUs, kubectl, helm
+
+---
+
+## Prerequisites
+
+### Required Tools
+
+- `kubectl` (configured with cluster access)
+- `helm` (v3+)
+- HuggingFace account and token (if model downloads are gated)
+- Kubernetes cluster with:
+ - GPU nodes (H100, H200, or similar)
+ - Sufficient GPU capacity (16+ GPUs recommended for this example)
+ - Dynamo platform installed globally OR ability to install per-namespace
+
+### Knowledge Requirements
+
+- Basic Kubernetes concepts (namespaces, pods, services)
+- Familiarity with LLM inference concepts
+- Command-line proficiency
+
+---
+
+## Architecture
+
+This guide sets up two parallel deployments, as well as a benchmarking pod that can test each deployment:
+
+```text
+┌─────────────────────────────────────┐
+│ Deployment A: Router OFF │
+│ Namespace: router-off-test │
+│ ├─ Frontend (Standard Routing) │
+│ └─ 8x Decode Workers (1 GPU each) │
+└─────────────────────────────────────┘
+
+┌─────────────────────────────────────┐
+│ Deployment B: Router ON │
+│ Namespace: router-on-test │
+│ ├─ Frontend (KV Smart Router) │
+│ └─ 8x Decode Workers (1 GPU each) │
+└─────────────────────────────────────┘
+
+┌─────────────────────────────────────┐
+│ Benchmark Pod │
+│ Namespace: benchmark │
+│ └─ AIPerf + Dataset │
+└─────────────────────────────────────┘
+```
+
+**Key Difference:** Deployment B sets `DYN_ROUTER_MODE=kv` on the frontend to enable KV cache-aware routing.
+
+---
+
+## Phase 1: Namespace and Infrastructure Setup
+
+### Step 1.1: Create Namespaces
+
+```bash
+# Create namespaces for both deployments
+kubectl create namespace router-off-test
+kubectl create namespace router-on-test
+kubectl create namespace benchmark
+```
+
+### Step 1.2: Create HuggingFace Token Secret (optional)
+
+If the model you're seeking to deploy requires HF token to download (Llama family models require this), replace `YOUR_HF_TOKEN` with your actual HuggingFace token:
+
+```bash
+# Router-OFF namespace
+kubectl create secret generic hf-token-secret \
+ --from-literal=HF_TOKEN="YOUR_HF_TOKEN" \
+ -n router-off-test
+
+# Router-ON namespace
+kubectl create secret generic hf-token-secret \
+ --from-literal=HF_TOKEN="YOUR_HF_TOKEN" \
+ -n router-on-test
+```
+
+### Step 1.3: Install Dynamo Platform (Per-Namespace)
+
+If your cluster uses namespace-restricted Dynamo operators, you'll need to install the Dynamo platform in each namespace. Follow the [Dynamo Kubernetes Installation Guide](https://github.com/ai-dynamo/dynamo/blob/main/docs/kubernetes/installation_guide.md) to install the platform in both namespaces:
+
+- `router-off-test`
+- `router-on-test`
+
+**Key Configuration Notes:**
+- If your cluster uses namespace restrictions, ensure `dynamo-operator.namespaceRestriction.enabled=true` is set during installation
+- Adjust version tags to match your cluster's available Dynamo versions
+- If you encounter operator compatibility issues (e.g., unsupported MPI arguments), consult your cluster administrator or the Dynamo troubleshooting documentation
+
+### Step 1.4: Verify Infrastructure
+
+Wait for operators and infrastructure to be ready:
+
+```bash
+# Check router-off-test
+kubectl get pods -n router-off-test
+
+# Check router-on-test
+kubectl get pods -n router-on-test
+```
+
+You should see:
+- `dynamo-platform-dynamo-operator-controller-manager` (2/2 Running)
+- `dynamo-platform-etcd-0` (1/1 Running)
+- `dynamo-platform-nats-0` (2/2 Running)
+
+---
+
+## Phase 2: Deploy Model Serving
+
+### Step 2.1: Create Deployment YAMLs
+
+Create `router-off-deployment.yaml`:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+ name: vllm-agg-no-router
+spec:
+ services:
+ Frontend:
+ dynamoNamespace: vllm-agg-no-router
+ componentType: frontend
+ replicas: 1
+ extraPodSpec:
+ mainContainer:
+ image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
+ VllmDecodeWorker:
+ envFromSecret: hf-token-secret
+ dynamoNamespace: vllm-agg-no-router
+ componentType: worker
+ replicas: 8
+ resources:
+ limits:
+ gpu: "1"
+ extraPodSpec:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: node.kubernetes.io/instance-type
+ operator: In
+ values:
+ - gpu-h200-sxm # Adjust to your GPU node type
+ mainContainer:
+ image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
+ workingDir: /workspace/examples/backends/vllm
+ command:
+ - /bin/sh
+ - -c
+ args:
+ - python3 -m dynamo.vllm --model Qwen/Qwen3-32B --quantization fp8
+ startupProbe:
+ httpGet:
+ path: /health
+ port: 9090
+ initialDelaySeconds: 120
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 60 # 32 minutes total (120s + 60*30s)
+ livenessProbe:
+ httpGet:
+ path: /live
+ port: 9090
+ initialDelaySeconds: 300
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 10
+ readinessProbe:
+ httpGet:
+ path: /live
+ port: 9090
+ initialDelaySeconds: 300
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 10
+```
+
+Create `router-on-deployment.yaml`:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+ name: vllm-agg-router
+spec:
+ services:
+ Frontend:
+ dynamoNamespace: vllm-agg-router
+ componentType: frontend
+ replicas: 1
+ extraPodSpec:
+ mainContainer:
+ image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
+ envs:
+ - name: DYN_ROUTER_MODE
+ value: kv # KEY DIFFERENCE: Enable KV Smart Router
+ VllmDecodeWorker:
+ envFromSecret: hf-token-secret
+ dynamoNamespace: vllm-agg-router
+ componentType: worker
+ replicas: 8
+ resources:
+ limits:
+ gpu: "1"
+ extraPodSpec:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: node.kubernetes.io/instance-type
+ operator: In
+ values:
+ - gpu-h200-sxm # Adjust to your GPU node type
+ mainContainer:
+ image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
+ workingDir: /workspace/examples/backends/vllm
+ command:
+ - /bin/sh
+ - -c
+ args:
+ - python3 -m dynamo.vllm --model Qwen/Qwen3-32B --quantization fp8
+ startupProbe:
+ httpGet:
+ path: /health
+ port: 9090
+ initialDelaySeconds: 120
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 60 # 32 minutes total (120s + 60*30s)
+ livenessProbe:
+ httpGet:
+ path: /live
+ port: 9090
+ initialDelaySeconds: 300
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 10
+ readinessProbe:
+ httpGet:
+ path: /live
+ port: 9090
+ initialDelaySeconds: 300
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 10
+```
+
+### Step 2.2: Deploy Both Configurations
+
+```bash
+# Deploy router-OFF
+kubectl apply -f router-off-deployment.yaml -n router-off-test
+
+# Deploy router-ON
+kubectl apply -f router-on-deployment.yaml -n router-on-test
+```
+
+**💡 Optimization Tip:** Each worker will download the model independently (~20 minutes per pod). For faster initialization, add a shared PVC with `ReadWriteMany` access mode to cache the model.
+
+First, create the PVC separately:
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: model-cache
+spec:
+ accessModes:
+ - ReadWriteMany
+ storageClassName: "your-shared-storage-class" # e.g., nfs, efs, nebius-shared-fs
+ resources:
+ requests:
+ storage: 100Gi
+```
+
+Then reference it in your DynamoGraphDeployment:
+
+```yaml
+spec:
+ pvcs:
+ - create: false
+ name: model-cache
+ size: "0"
+ services:
+ VllmDecodeWorker:
+ volumeMounts:
+ - mountPoint: /root/.cache/huggingface
+ name: model-cache
+ useAsCompilationCache: false
+```
+
+With this configuration, only the first worker downloads the model; others use the cached version, reducing startup time from 20+ minutes to ~2 minutes per pod.
+
+### Step 2.3: Monitor Deployment Progress
+
+```bash
+# Watch router-OFF pods
+kubectl get pods -n router-off-test -w
+
+# Watch router-ON pods
+kubectl get pods -n router-on-test -w
+```
+
+Wait for all pods to reach `Running` status and pass readiness probes.
+
+**Expected Timeline:**
+- **With shared PVC** (ReadWriteMany): ~5-10 minutes total (first worker downloads, others reuse cache)
+- **Without shared PVC**: 20-30 minutes per worker (workers download independently)
+ - For 8 workers: Budget **1-2 hours** for full deployment (workers start in parallel but are limited by node scheduling)
+
+The startup probe allows 32 minutes per pod (failureThreshold: 60), which accommodates model download and initialization.
+
+### Step 2.4: Verify All Workers Are Healthy
+
+> ⚠️ **CRITICAL CHECKPOINT**: Before running benchmarks, you **MUST** verify equal worker health in both deployments. Unequal worker counts will invalidate your comparison results.
+
+```bash
+# Quick health check - both should show "8/8"
+echo "Router OFF: $(kubectl get pods -n router-off-test -l nvidia.com/dynamo-component-type=worker --field-selector=status.phase=Running -o json | jq '[.items[] | select(.status.conditions[] | select(.type=="Ready" and .status=="True"))] | length')/8 ready"
+echo "Router ON: $(kubectl get pods -n router-on-test -l nvidia.com/dynamo-component-type=worker --field-selector=status.phase=Running -o json | jq '[.items[] | select(.status.conditions[] | select(.type=="Ready" and .status=="True"))] | length')/8 ready"
+
+# Detailed view
+kubectl get pods -n router-off-test -l nvidia.com/dynamo-component-type=worker
+kubectl get pods -n router-on-test -l nvidia.com/dynamo-component-type=worker
+```
+
+**Both must show 8/8 workers in Ready state (1/1 Running).** If workers are not ready:
+- Check logs: `kubectl logs <pod-name> -n <namespace>`
+- Common issues: model download in progress, startup probe timeout, insufficient GPU resources
+
+**Do not proceed with benchmarks until all 16 workers (8 per deployment) are healthy.**
+
+---
+
+## Phase 3: Prepare Benchmark Dataset
+
+### Understanding the Mooncake Trace Dataset
+
+For this A/B comparison, we use the **Mooncake Trace Dataset**, published by [Mooncake AI](https://github.com/kvcache-ai/Mooncake). This is a privacy-preserving dataset of real-world LLM inference traffic from production arxiv workloads.
+
+**What's in the dataset?** Each trace entry contains:
+- **Timestamp:** When the request arrived (for realistic request timing)
+- **Input/output lengths:** Number of tokens in prompts and responses
+- **Block hash IDs:** Cryptographic hashes representing KV cache blocks (explained below)
+
+**Sample trace entry:**
+```json
+{
+ "timestamp": 27482,
+ "input_length": 6955,
+ "output_length": 52,
+ "hash_ids": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 2353, 2354]
+}
+```
+
+### Why Mooncake Traces Matter for KV Cache Benchmarking
+
+**The Challenge:** Traditional LLM benchmarks use synthetic or random data, which are often insufficient to capture real-world optimizations like KV Smart Router. To properly evaluate this feature, we need realistic traffic patterns with **prefix repetition** - but this creates a privacy problem: how do we measure realistic KV cache hit patterns without exposing actual user conversations?
+
+**Mooncake's Solution: Privacy-Preserving Block Hashes**
+
+Instead of storing actual prompt text, the Mooncake dataset uses cryptographic hashes to represent KV cache blocks. Each hash ID represents a **512-token block**, and the hash includes both the current block and all preceding blocks. This preserves the **pattern of prefix reuse** while completely protecting user privacy.
+
+### How it works - Multi-turn conversation example
+
+```text
+Turn 1 (initial request - long document analysis):
+ Input: ~8,000 tokens (e.g., research paper + question)
+ Hash IDs: [46][47][48][49][50][51][52][53][54][55][56][57][58][59][60][61]
+ └─ 16 blocks × 512 tokens/block = ~8,192 tokens
+
+Turn 2 (follow-up question on same document):
+ Input: Same document + new question (~8,500 tokens)
+ Hash IDs: [46][47][48][49][50][51][52][53][54][55][56][57][58][59][60][61][62]
+ └──────────── Reuses first 16 blocks (~8,192 tokens) ───────────────┘
+
+ ✅ Cache hit: First 8,192 tokens don't need recomputation!
+
+Turn 3 (another follow-up):
+ Input: Same document + different question (~9,000 tokens)
+ Hash IDs: [46][47][48][49][50][51][52][53][54][55][56][57][58][59][60][61][62][63]
+ └──────────── Reuses first 16 blocks (~8,192 tokens) ───────────────┘
+```
+
+When requests share the same hash IDs (e.g., blocks 46-61), it means they share those 512-token blocks - indicating **significant prefix overlap** (in this case, 8,192 tokens). The **KV Smart Router** routes requests with matching hash IDs to the same worker, maximizing cache hits and avoiding redundant computation for those shared prefix tokens.
+
+**Key Dataset Properties:**
+- ✅ **Realistic timing:** Request arrival patterns from production workloads
+- ✅ **Real prefix patterns:** Up to 50% cache hit ratio ([Mooncake technical report](https://github.com/kvcache-ai/Mooncake))
+- ✅ **Privacy-preserving:** No actual text - only hash-based cache block identifiers
+- ✅ **Reproducible:** Public dataset enables fair comparisons across different systems
+
+**Why this matters:** With random synthetic data, the KV Smart Router would show no benefit because there's no prefix reuse to exploit. Mooncake traces provide realistic workload patterns that demonstrate the router's real-world performance gains while respecting user privacy.
+
+---
+
+### Download and Prepare the Dataset
+
+```bash
+# Download the Mooncake arxiv trace dataset
+curl -sL https://raw.githubusercontent.com/kvcache-ai/Mooncake/refs/heads/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl -o mooncake_trace.jsonl
+
+# Trim to 1000 requests for faster benchmarking
+head -n 1000 mooncake_trace.jsonl > mooncake_trace_small.jsonl
+
+# Speed up timestamps 4x (reduces benchmark time from ~12 min to ~3 min)
+python3 - <<'PY'
+import json
+
+with open("mooncake_trace_small.jsonl") as src, open("mooncake_trace_4x.jsonl", "w") as dst:
+ for line in src:
+ rec = json.loads(line)
+ rec["timestamp"] = int(rec["timestamp"] / 4)
+ dst.write(json.dumps(rec) + "\n")
+PY
+
+echo "Dataset ready: mooncake_trace_4x.jsonl (1000 requests, 4x speed)"
+```
+
+---
+
+## Phase 4: Set Up Benchmark Environment
+
+### Step 4.1: Deploy Benchmark Pod
+
+Create `benchmark-job.yaml`:
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: aiperf-benchmark
+ namespace: benchmark
+spec:
+ backoffLimit: 1
+ template:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: benchmark
+ image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
+ command: ["/bin/sh", "-c", "sleep infinity"]
+ imagePullPolicy: IfNotPresent
+ resources:
+ limits:
+ nvidia.com/gpu: 0
+```
+
+Deploy:
+
+```bash
+kubectl apply -f benchmark-job.yaml
+```
+
+Wait for pod to be ready:
+
+```bash
+kubectl get pods -n benchmark
+```
+
+### Step 4.2: Copy Dataset to Benchmark Pod
+
+```bash
+POD_NAME=$(kubectl get pods -n benchmark -l job-name=aiperf-benchmark -o jsonpath='{.items[0].metadata.name}')
+
+kubectl -n benchmark cp mooncake_trace_4x.jsonl ${POD_NAME}:/tmp/mooncake_trace_4x.jsonl
+```
+
+### Step 4.3: Install AIPerf
+
+```bash
+kubectl -n benchmark exec ${POD_NAME} -- bash -lc '. /opt/dynamo/venv/bin/activate && pip install -q aiperf'
+```
+
+---
+
+## Phase 5: Run Benchmarks
+
+### Step 5.1: Benchmark Router-OFF (Baseline)
+
+```bash
+kubectl -n benchmark exec ${POD_NAME} -- bash -lc '
+ . /opt/dynamo/venv/bin/activate
+ aiperf profile \
+ --model "Qwen/Qwen3-32B" \
+ --url "http://vllm-agg-no-router-frontend.router-off-test.svc.cluster.local:8000" \
+ --endpoint-type chat \
+ --input-file /tmp/mooncake_trace_4x.jsonl \
+ --custom-dataset-type mooncake_trace \
+ --tokenizer "Qwen/Qwen3-32B" \
+ --streaming \
+ --request-count 1000 \
+ --fixed-schedule \
+ --output-artifact-dir /tmp/router_off_results
+'
+```
+
+**Note:** This will take 3-5 minutes. The terminal output includes a summary table.
+
+### Step 5.2: Benchmark Router-ON (KV Smart Router)
+
+```bash
+kubectl -n benchmark exec ${POD_NAME} -- bash -lc '
+ . /opt/dynamo/venv/bin/activate
+ aiperf profile \
+ --model "Qwen/Qwen3-32B" \
+ --url "http://vllm-agg-router-frontend.router-on-test.svc.cluster.local:8000" \
+ --endpoint-type chat \
+ --input-file /tmp/mooncake_trace_4x.jsonl \
+ --custom-dataset-type mooncake_trace \
+ --tokenizer "Qwen/Qwen3-32B" \
+ --streaming \
+ --request-count 1000 \
+ --fixed-schedule \
+ --output-artifact-dir /tmp/router_on_results
+'
+```
+
+### Step 5.3: Collect Results
+
+```bash
+# Copy results to local machine
+kubectl -n benchmark cp ${POD_NAME}:/tmp/router_off_results/profile_export_aiperf.csv ./router_off_results.csv
+kubectl -n benchmark cp ${POD_NAME}:/tmp/router_on_results/profile_export_aiperf.csv ./router_on_results.csv
+```
+
+---
+
+## Phase 6: Analyze Results
+
+### Key Metrics to Compare
+
+| Metric | Description | What to Look For |
+|--------|-------------|------------------|
+| **Time to First Token (TTFT)** | Latency until first token arrives | Lower is better; KV router may reduce with prefix reuse |
+| **Inter Token Latency (ITL)** | Average time between tokens | Lower is better; indicates generation speed |
+| **Request Latency** | Total end-to-end latency | Lower is better; overall user experience |
+| **Output Token Throughput** | Tokens generated per second (system-wide) | Higher is better; system efficiency |
+| **Request Throughput** | Requests completed per second | Higher is better; capacity |
+
+### Interpreting Results
+
+**Your Results May Vary**: The improvement from KV Smart Router depends heavily on your workload characteristics:
+
+**Factors that increase KV router benefit:**
+- **High prefix overlap** (shared system prompts, templates, document contexts)
+- **Long prompts** (>2000 tokens) where caching saves significant compute
+- **Multi-turn conversations** with context carryover
+- **Batch workloads** with similar queries
+
+**Factors that reduce KV router benefit:**
+- **Unique prompts** with no prefix reuse
+- **Short prompts** (\<1000 tokens) where routing overhead exceeds benefit
+- **Evenly distributed load** where round-robin is already optimal
+- **Low request rate** where cache eviction negates benefits
+
+**Expected Performance:**
+- **High prefix overlap workloads**: 20-50% TTFT improvement
+- **Moderate prefix overlap**: 10-20% improvement
+- **Low prefix overlap**: \<5% improvement (may not be worth enabling)
+
+**KV Smart Router is beneficial when:**
+- TTFT improvements > 20%
+- No significant degradation in other metrics
+- Workload demonstrates measurable prefix reuse patterns
+
+**Standard routing is better when:**
+- KV router shows \<10% improvement
+- Increased latency variance is observed
+- Load distribution across workers is more important than cache affinity
+
+### Example Comparison
+
+From the terminal output, compare the summary tables:
+
+```
+Router-OFF (Baseline):
+ TTFT avg: 12,764 ms p99: 45,898 ms
+ Request Latency avg: 32,978 ms
+ Output Token Throughput: 1,614 tokens/sec
+ Request Throughput: 8.61 req/sec
+
+Router-ON (KV Router):
+ TTFT avg: 8,012 ms p99: 28,644 ms (37% faster ✅)
+ Request Latency avg: 28,972 ms (12% faster ✅)
+ Output Token Throughput: 1,746 tokens/sec (8% higher ✅)
+ Request Throughput: 9.33 req/sec (8% higher ✅)
+```
+
+In this example with all 8 workers healthy, the **KV router significantly outperformed** the baseline:
+- **37% faster TTFT** - Users see first token much sooner
+- **8% higher throughput** - System processes more requests per second
+- **12% lower latency** - Faster end-to-end completion
+
+The Mooncake arxiv dataset has sufficient prefix overlap (long input sequences with similar patterns) to benefit from KV cache-aware routing. Workloads with explicit shared prefixes (system prompts, templates) may see even greater improvements.
+
+---
+
+## Phase 7: Cleanup
+
+```bash
+# Delete deployments
+kubectl delete dynamographdeployment vllm-agg-no-router -n router-off-test
+kubectl delete dynamographdeployment vllm-agg-router -n router-on-test
+
+# Delete namespaces (removes all resources)
+kubectl delete namespace router-off-test
+kubectl delete namespace router-on-test
+kubectl delete namespace benchmark
+```
+
+---
+
+## Troubleshooting
+
+### Issue: Pods Stuck in Pending
+
+**Cause:** Insufficient GPU resources
+
+**Solution:**
+```bash
+# Check GPU availability
+kubectl describe nodes | grep -A 10 "Allocated resources"
+
+# Reduce worker replicas if needed
+kubectl edit dynamographdeployment <deployment-name> -n <namespace>
+```
+
+### Issue: ImagePullBackOff Errors
+
+**Cause:** Version mismatch or missing credentials
+
+**Solution:**
+```bash
+# Check available versions
+kubectl get pods -n dynamo-system -o yaml | grep image:
+
+# Update deployment YAML to match cluster version
+```
+
+### Issue: Operator Not Processing Deployment
+
+**Cause:** Namespace restrictions
+
+**Solution:**
+- Ensure Dynamo platform is Helm-installed in the namespace
+- Verify operator has `--restrictedNamespace=<namespace>` argument
+- Check operator logs: `kubectl logs -n <namespace> deployment/dynamo-platform-dynamo-operator-controller-manager`
+
+### Issue: Workers Not Becoming Ready
+
+**Cause:** Model download failures or probe configuration
+
+**Solution:**
+```bash
+# Check worker logs
+kubectl logs <pod-name> -n <namespace>
+
+# Common issues:
+# - Invalid HuggingFace token
+# - Network connectivity
+# - Insufficient disk space for model
+```
+
+### Issue: Workers Restarting in CrashLoopBackOff
+
+**Cause:** Startup probe timeout - workers killed before finishing initialization
+
+**Symptoms:**
+- Pods show "Container main failed startup probe, will be restarted"
+- Logs show model still downloading or loading when pod is killed
+- Large models (>30GB) take longer than default 22-minute timeout
+
+**Solution:**
+Increase the startup probe `failureThreshold`:
+
+```bash
+# Patch the deployment to allow 32 minutes instead of 22
+kubectl patch dynamographdeployment <deployment-name> -n <namespace> --type='json' \
+ -p='[{"op": "replace", "path": "/spec/services/VllmDecodeWorker/extraPodSpec/mainContainer/startupProbe/failureThreshold", "value": 60}]'
+```
+
+Or update your YAML before deploying:
+```yaml
+startupProbe:
+ httpGet:
+ path: /health
+ port: 9090
+ initialDelaySeconds: 120
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 60 # 32 minutes total (120s + 60*30s)
+```
+
+**Model Loading Times (approximate):**
+- Qwen3-32B: ~20-25 minutes (first download)
+- Llama-70B: ~25-30 minutes (first download)
+- With cached model on node: ~2-5 minutes
+
+### Issue: Unequal Worker Health
+
+**Cause:** Resource constraints, image pull issues, or configuration errors
+
+**Solution:**
+```bash
+# Check all worker status
+kubectl get pods -n <namespace> -l nvidia.com/dynamo-component-type=worker
+
+# Describe problematic pods
+kubectl describe pod <pod-name> -n <namespace>
+
+# Fix issues before benchmarking or results will be skewed
+```
+
+---
+
+## Advanced Configuration
+
+### Testing Different Models
+
+Replace `Qwen/Qwen3-32B` with your model in:
+- Deployment YAML `args` section
+- AIPerf `--model` and `--tokenizer` parameters
+
+### Adjusting Worker Count
+
+Change `replicas: 8` in the deployment YAMLs. Ensure both deployments use the same count for fair comparison.
+
+### Using Custom Datasets
+
+Replace mooncake dataset with your own JSONL file:
+- Format: One request per line with `timestamp` field
+- AIPerf supports various formats via `--custom-dataset-type`
+
+### Disaggregated Prefill/Decode
+
+For advanced testing, add separate prefill workers:
+
+```yaml
+VllmPrefillWorker:
+ componentType: worker
+ replicas: 2
+ # ... configuration
+```
+
+---
+
+## Best Practices
+
+1. **Equal Conditions:** Ensure both deployments have identical worker counts and health before benchmarking
+2. **Warm-Up:** Run a small test (100 requests) before the full benchmark to warm up caches
+3. **Multiple Runs:** Run benchmarks 3+ times and average results for statistical significance
+4. **Monitor Workers:** Watch for any pod restarts or issues during benchmark runs
+5. **Document Conditions:** Record cluster state, worker health, and any anomalies
+6. **Test Relevant Workloads:** Use datasets that match your actual use case for meaningful results
+
+---
+
+## Conclusion
+
+This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit.
+
+For questions or issues, consult the [Dynamo documentation](https://github.com/ai-dynamo/dynamo) or open an issue on GitHub.
+
+---
+
+## Appendix: Files Reference
+
+- `router-off-deployment.yaml`: Standard routing deployment
+- `router-on-deployment.yaml`: KV router enabled deployment
+- `benchmark-job.yaml`: AIPerf benchmark pod
+- `prepare-dataset.sh`: Dataset preparation script
+- Results CSVs: Detailed metrics from AIPerf
+
+**Repository:** [https://github.com/ai-dynamo/dynamo](https://github.com/ai-dynamo/dynamo)
+
diff --git a/fern/pages/benchmarks/sla-driven-profiling.md b/fern/pages/benchmarks/sla-driven-profiling.md
new file mode 100644
index 00000000000..9e23fa84940
--- /dev/null
+++ b/fern/pages/benchmarks/sla-driven-profiling.md
@@ -0,0 +1,628 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "SLA-Driven Profiling with DynamoGraphDeploymentRequest"
+---
+
+
+**New to DGDR and SLA-Driven Profiling?** Start with the [SLA-Driven Profiling and Planner Deployment Quick Start Guide](../planner/sla-planner-quickstart.md) for step-by-step instructions. This document provides deeper technical details about the profiling process.
+
+
+## Overview
+
+Dynamo provides automated SLA-driven profiling through **DynamoGraphDeploymentRequests (DGDR)**. Instead of manually running profiling scripts, you declare your performance requirements and let the Dynamo Operator handle profiling and deployment automatically.
+
+**Key Benefits:**
+- **Declarative**: Specify SLAs, not implementation details
+- **Automated**: No manual job setup or result processing
+- **Integrated**: Seamlessly works with Dynamo Operator
+- **Production-Ready**: Generates optimized configurations with SLA planner
+
+This document covers:
+- Technical details of online vs offline profiling
+- Profiling process internals (GPU usage, measurements, interpolation)
+- Direct script usage for advanced scenarios
+- Comprehensive troubleshooting
+
+## Support Matrix
+
+| Backend | Dense Models | MoE Models |
+|---------|-------------|------------|
+| vLLM | ✅ | 🚧 |
+| SGLang | ✅ | ✅ |
+| TensorRT-LLM | ✅ | 🚧 |
+
+Specifically, the profiler sweeps over the following parallelization mappings for prefill and decode:
+
+| Model Architecture | Prefill Parallelization Mapping | Decode Parallelization Mapping |
+|---------|-------------|------------|
+| MLA+MoE (DeepseekV3ForCausalLM, DeepseekV32ForCausalLM) | TEP, DEP | TEP, DEP |
+| GQA+MoE (Qwen3MoeForCausalLM) | TP, TEP, DEP | TP, TEP, DEP |
+| Other Models | TP | TP |
+
+
+- Exact model x parallelization mapping support depends on the backend. The profiler does not guarantee that the recommended P/D engine configuration is supported by the backend and bug-free.
+
+
+## Using DGDR for Profiling (Recommended)
+
+The recommended way to profile models is through DGDRs. Sample configurations are provided in `deploy/`:
+
+**Available Samples:**
+- **`profile_sla_dgdr.yaml`**: Standard profiling with AIPerf on real engines
+- **`profile_sla_aic_dgdr.yaml`**: Fast profiling with AI Configurator simulation
+- **`profile_sla_moe_dgdr.yaml`**: MoE model profiling
+
+The Dynamo Operator automatically:
+1. Discovers GPU resources (cluster-scoped operators only)
+2. Runs profiling (AIPerf on real engines or AI Configurator simulation)
+3. Generates optimal DGD configuration with SLA planner
+4. Deploys the DGD to your cluster
+
+See the [Quick Start Guide](../planner/sla-planner-quickstart.md) for prerequisites and detailed instructions.
+
+## Hardware Configuration
+
+Hardware parameters have sensible defaults and are **optional** - you can override them if needed:
+
+```yaml
+profilingConfig:
+ config:
+ # Override hardware defaults if needed
+ hardware:
+ min_num_gpus_per_engine: 1
+ max_num_gpus_per_engine: 8
+ num_gpus_per_node: 8
+
+ # Only needed when using AI Configurator (sweep.use_ai_configurator: true)
+ sweep:
+ aic_system: h200_sxm # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
+```
+
+### Automatic GPU Discovery (Optional Feature)
+
+Cluster-scoped operators can optionally enable automatic GPU discovery to detect hardware from cluster nodes. When enabled, hardware config is auto-detected and overrides any manually specified values.
+
+```yaml
+spec:
+ enableGpuDiscovery: true
+```
+
+This feature is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions. It is not available for namespace-restricted operators.
+
+## Profiling Method
+
+1. **Hardware Setup**: Uses defaults or user-specified hardware configuration. Optionally, cluster-scoped operators can enable automatic GPU discovery to detect specifications from cluster nodes.
+2. **Identify Sweep Ranges**: Automatically determine the minimum and maximum number of GPUs per engine. The minimum is determined by the model size and GPU VRAM. The maximum is set to one node for dense models and 4 nodes for MoE models.
+3. **Parallelization Mapping Sweep**: Use the input ISL and OSL, test the performance of the engines with different parallelization mappings.
+ - For dense models, we test different TP sizes for both prefill and decode.
+ - For MoE models (SGLang), we evaluate both TEP and DEP as candidates for prefill and decode.
+ - **Prefill**:
+ - TP/TEP: We measure TTFT with batch size = 1 (assuming ISL is long enough to saturate compute) without KV reuse.
+ - DEP: Attention uses data parallelism. We send a single burst with total concurrency `attention_dp_size × attn_dp_num_req_ratio` (defaults to 4) and compute the reported TTFT as `time_to_first_token.max / attn_dp_num_req_ratio` from the AIPerf summary of that burst. This stabilizes measurements when the first batch may launch before all requests arrive.
+ 
+  - **Decode**: Since the ITL (or iteration time) depends on how many requests are in-flight, we measure the ITL under different numbers of in-flight requests. The range of the number of in-flight requests is from 1 to the maximum number of requests that the kv cache of the engine can hold. To measure the ITL without being affected by piggy-backed prefill requests, the script will enable kv-reuse and warm up the engine by issuing the same prompts before measuring the ITL. Since the kv cache is sufficient for all the requests, it can hold the kv cache of the pre-computed prompts and skip the prefill phase when measuring the ITL. However, for MoE models, this is not guaranteed because the kv cache in different attention DP ranks is different. We are working on a framework-side change to fix this issue. For example, the below plot shows the decode parallelization mapping sweep results for H100 for deepseek-ai/DeepSeek-R1-Distill-Llama-8B.
+ 
+4. **Recommendation**: Selects the optimal parallelization mapping for prefill and decode that achieves the highest per-GPU throughput while adhering to the SLA on TTFT and ITL. Specifically, the profiler will choose the point (or a point on the curve for decode) that is to the left of the vertical red dashed line representing the SLAs while having the highest y coordinate (throughput per GPU).
+5. **In-Depth Profiling on the Recommended P/D Engine**: After finding the best TP size for prefill and decode, the script will then interpolate the TTFT with ISL and the ITL with active KV cache and decode context length. This provides a more accurate estimation of the performance when ISL and OSL change and will be used in the sla-planner.
+
+ - **Prefill**: Measures TTFT and throughput per GPU across different input lengths with batch size=1.
+  - **Decode**: Measures ITL and throughput per GPU under various KV cache loads and decode context lengths. The active kv usage determines the complexity of the memory-bound attention kernel, while the active kv usage divided by the average context length determines the complexity of the computation-bound MLP kernel. For example, the below figure shows the ITL of the DS-Distilled Llama 8B model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length. And the slope increases as the context length decreases.
+
+
+To run the parallelization mapping sweep and the in-depth profiling on the recommended P/D engine, the profiler needs to know the engine's forward pass time under different loads. There are two ways to achieve this: run AIPerf on real engines or use AI Configurator to run simulations.
+
+### AIPerf on Real Engines
+
+Profiles your model by creating real test deployments in Kubernetes and measuring their performance.
+
+**Characteristics:**
+- **Duration**: 2-4 hours
+- **Accuracy**: Highest (real measurements)
+- **GPU Requirements**: Full access to test different parallelization mappings
+- **Backends**: vLLM, SGLang, TensorRT-LLM
+
+**DGDR Configuration:**
+```yaml
+profilingConfig:
+ config:
+ sweep:
+ use_ai_configurator: false # Default
+```
+
+### AI Configurator Simulation
+
+Uses performance simulation to rapidly estimate optimal configurations without running real deployments.
+
+**Characteristics:**
+- **Duration**: 20-30 seconds
+- **Accuracy**: Estimated (may have errors for unusual configurations)
+- **GPU Requirements**: None
+- **Backends**: TensorRT-LLM only (vLLM/SGLang coming soon)
+
+**DGDR Configuration:**
+```yaml
+profilingConfig:
+ config:
+ sweep:
+ use_ai_configurator: true
+ aic:
+ system: h200_sxm # GPU system type
+ model_name: QWEN3_32B # AIC model identifier
+ backend_version: "0.20.0"
+```
+
+**Supported Configurations:**
+
+For the current list of supported models, systems, and backend versions, see the [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features).
+
+To check from the command line: `aiconfigurator cli --help`
+
+**Currently supports:**
+- **Backends**: TensorRT-LLM (versions 0.20.0, 1.0.0rc3, 1.0.0rc6)
+- **Systems**: H100 SXM, H200 SXM, B200 SXM, GB200 SXM, A100 SXM
+- **Models**: Wide range including GPT, Llama, Mixtral, DeepSeek, Qwen, and more
+
+### Output Format
+
+After profiling, the DGDR status contains:
+
+1. **Recommended Configuration**: Optimal TP for prefill and decode
+2. **Performance Data**: Interpolation models for SLA planner
+3. **Generated DGD**: Complete deployment manifest
+
+**Example Recommendations:**
+```
+Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
+Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
+```
+
+#### Interactive Configuration Selection WebUI
+
+When running the profiler with `--pick-with-webui`, an interactive web interface is launched that allows you to visually explore profiling results and manually select configurations.
+
+**Features:**
+- **Interactive Charts**: Visualize prefill TTFT, decode ITL, and GPU hours analysis with hover-to-highlight synchronization between charts and tables
+- **Pareto-Optimal Analysis**: The GPU Hours table shows pareto-optimal configurations balancing latency and throughput
+- **DGD Config Preview**: Click "Show Config" on any row to view the corresponding DynamoGraphDeployment YAML
+- **GPU Cost Estimation**: Toggle GPU cost display to convert GPU hours to cost ($/1000 requests)
+- **SLA Visualization**: Red dashed lines indicate your TTFT and ITL targets
+
+**Selection Methods:**
+1. **GPU Hours Table** (recommended): Click any row to select both prefill and decode configurations at once based on the pareto-optimal combination
+2. **Individual Selection**: Click one row in the Prefill table AND one row in the Decode table to manually choose each
+
+**Example DGD Config Output:**
+
+When you click "Show Config", you'll see a DynamoGraphDeployment configuration like:
+
+```yaml
+# DynamoGraphDeployment Configuration
+# Prefill: 1 GPU(s), TP=1
+# Decode: 4 GPU(s), TP=4
+# Model: Qwen/Qwen3-32B-FP8
+# Backend: trtllm
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+spec:
+ services:
+ PrefillWorker:
+ subComponentType: prefill
+ replicas: 1
+ extraPodSpec:
+ mainContainer:
+ args:
+ - --tensor-parallel-size=1
+ DecodeWorker:
+ subComponentType: decode
+ replicas: 1
+ extraPodSpec:
+ mainContainer:
+ args:
+ - --tensor-parallel-size=4
+```
+
+**Usage:**
+```bash
+python -m benchmarks.profiler.profile_sla \
+ --backend trtllm \
+ --config path/to/disagg.yaml \
+ --pick-with-webui \
+ --use-ai-configurator \
+ --model Qwen/Qwen3-32B-FP8 \
+ --aic-system h200_sxm \
+ --ttft 200 --itl 15
+```
+
+Once you have selected a configuration, the full DynamoGraphDeployment CRD will be saved in your output folder as `config_with_planner.yaml`.
+
+The WebUI launches on port 8000 by default (configurable with `--webui-port`).
+
+#### Output Performance Plots
+
+The profiler will generate the following plots to better visualize the performance data:
+
+**Parallelization Mapping Sweep Plots:**
+- `prefill_performance.png`: TTFT vs Parallelization Mapping size
+- `decode_performance.png`: ITL vs Parallelization Mapping size and in-flight requests
+
+Note that these two plots are based on the input ISL and OSL.
+
+**In-Depth Profiling for the Recommended P/D Engine Plots:**
+- `selected_prefill_interpolation/prefill_ttft_interpolation.png`: TTFT vs ISL for the recommended prefill engine
+- `selected_prefill_interpolation/prefill_throughput_interpolation.png`: Throughput vs ISL for the recommended prefill engine
+- `selected_decode_interpolation/decode_itl_interplation.png`: ITL vs KV usage and context length for the recommended decode engine
+- `selected_decode_interpolation/decode_throughput_interpolation.png`: Throughput vs KV usage and context length for the recommended decode engine
+
+
+### Output Interpolation Data
+
+The profiler generates `.npz` files to store the performance data for the recommended P/D engine:
+
+**Prefill Interpolation** (`selected_prefill_interpolation/raw_data.npz`):
+- `prefill_isl`: 1D array of input sequence lengths tested
+- `prefill_ttft`: 1D array of TTFTs (ms) at each ISL
+- `prefill_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each ISL
+
+**Decode Interpolation** (`selected_decode_interpolation/raw_data.npz`):
+- `max_kv_tokens`: Total KV tokens capacity in decode engine
+- `x_kv_usage`: 1D array of active KV usage percentages [0, 1]
+- `y_context_length`: 1D array of average context lengths tested
+- `z_itl`: 1D array of ITLs (ms) at each (KV usage, context length) point
+- `z_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each point
+
+## DGDR Configuration Reference
+
+This section provides detailed explanations of all DGDR `profilingConfig` options. The DGDR controller passes this configuration to the profiler script, which is defined in `benchmarks/profiler/utils/profiler_argparse.py`.
+
+### Configuration Structure
+
+All profiler configuration goes under `spec.profilingConfig.config`:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeploymentRequest
+metadata:
+ name: my-deployment
+spec:
+ model: "Qwen/Qwen3-0.6B" # High-level: model to deploy
+ backend: vllm # High-level: inference backend
+
+ profilingConfig:
+ profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Required
+ configMapRef: # Optional: base DGD config
+ name: my-config
+ key: disagg.yaml
+
+ config: # Profiler configuration
+ sla: { ... }
+ hardware: { ... }
+ sweep: { ... }
+ aic: { ... }
+ planner: { ... }
+
+ deploymentOverrides: # Optional
+ workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
+```
+
+### SLA Configuration (Required)
+
+Define your performance requirements and workload characteristics:
+
+```yaml
+profilingConfig:
+ config:
+ sla:
+ isl: 3000 # Average input sequence length (tokens)
+ osl: 150 # Average output sequence length (tokens)
+ ttft: 200.0 # Target Time To First Token (milliseconds)
+ itl: 20.0 # Target Inter-Token Latency (milliseconds)
+```
+
+**What these control:**
+- **ISL/OSL**: Based on your expected traffic patterns
+- **TTFT**: First token latency target (lower = more GPUs needed, affects prefill engine)
+- **ITL**: Token generation latency target (lower = more GPUs needed, affects decode engine)
+- **Trade-offs**: Tighter SLAs require more GPU resources
+
+### Hardware Configuration (Optional)
+
+Control GPU search space and constraints:
+
+```yaml
+profilingConfig:
+ config:
+ hardware:
+ min_num_gpus_per_engine: 2 # if not provided, will automatically determine based on model and VRAM size
+ max_num_gpus_per_engine: 8 # Maximum GPUs to test
+ num_gpus_per_node: 8 # GPUs per node (for multi-node MoE)
+ gpu_type: h200_sxm # GPU type hint
+```
+
+**When to use:**
+- **min_num_gpus_per_engine**: Skip small TP sizes if your model is large
+- **max_num_gpus_per_engine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
+- **num_gpus_per_node**: Determine the upper bound of number of GPUs per node for dense models and configure Grove for multi-node MoE engines.
+- **gpu_type**: Informational, auto-detected by controller
+
+
+If you don't specify hardware constraints, the controller auto-detects based on your model size and available cluster resources.
+
+
+### Sweep Configuration (Optional)
+
+Control profiling behavior:
+
+```yaml
+profilingConfig:
+ config:
+ sweep:
+ use_ai_configurator: false # Use offline profiling (default: false)
+ prefill_interpolation_granularity: 16 # Samples for prefill TTFT curve
+ decode_interpolation_granularity: 6 # Samples for decode ITL curve
+```
+
+**Use cases:**
+- **use_ai_configurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
+- **prefill_interpolation_granularity**: How many samples to benchmark for prefill TTFT curve (lower = faster but may be less accurate)
+- **decode_interpolation_granularity**: How many samples to benchmark for decode ITL curve (lower = faster but may be less accurate). Since ITL interpolation is a 3d plot and takes longer to run, we default to a smaller number of samples. Increasing this value might quadratically increase the profiling time.
+
+### AI Configurator Configuration (Required if `use_ai_configurator: true`)
+
+Configure AI Configurator profiling mode:
+
+```yaml
+profilingConfig:
+ config:
+ sweep:
+ use_ai_configurator: true
+ aic_system: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
+ aic_hf_id: Qwen/Qwen3-32B # Huggingface model id
+ aic_backend_version: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3
+```
+
+**Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features)
+
+### Planner Configuration (Optional)
+
+Pass arguments to the SLA planner:
+
+```yaml
+profilingConfig:
+ config:
+ planner:
+ planner_min_endpoint: 2 # Minimum endpoints to maintain
+ planner_adjustment_interval: 60 # Adjustment interval (seconds)
+ planner_load_predictor: linear # Load prediction method
+```
+
+
+Planner arguments use `planner_` prefix. See planner documentation for full list.
+
+
+### Engine Configuration (Auto-configured)
+
+The controller automatically sets these from high-level fields:
+
+```yaml
+# You specify:
+spec:
+ model: "Qwen/Qwen3-0.6B"
+ backend: vllm
+
+# Controller auto-injects into config:
+profilingConfig:
+ config:
+ deployment:
+ model: "Qwen/Qwen3-0.6B" # From spec.model
+ engine:
+ backend: vllm # From spec.backend
+ config: /path/to/configmap # From spec.profilingConfig.configMapRef (if provided)
+```
+
+**You should not manually set** `deployment.model` or `engine.backend` in `profilingConfig.config` - they are automatically injected from the high-level fields.
+
+### Complete Example: AIPerf on Real Engines
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeploymentRequest
+metadata:
+ name: vllm-dense-online
+spec:
+ model: "Qwen/Qwen3-0.6B"
+ backend: vllm
+
+ profilingConfig:
+ profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
+ config:
+ sla:
+ isl: 3000
+ osl: 150
+ ttft: 200.0
+ itl: 20.0
+
+ hardware:
+ min_num_gpus_per_engine: 1
+ max_num_gpus_per_engine: 8
+
+ sweep:
+ use_ai_configurator: false
+
+ deploymentOverrides:
+ workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
+
+ autoApply: true
+```
+
+### Complete Example: AI Configurator Simulation
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeploymentRequest
+metadata:
+ name: trtllm-aic-offline
+spec:
+ model: "Qwen/Qwen3-32B"
+ backend: trtllm
+
+ profilingConfig:
+ profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"
+ config:
+ sla:
+ isl: 4000
+ osl: 500
+ ttft: 300.0
+ itl: 10.0
+
+ sweep:
+ use_ai_configurator: true
+
+ aic:
+ system: h200_sxm
+ model_name: QWEN3_32B
+ backend_version: "0.20.0"
+
+ deploymentOverrides:
+ workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"
+
+ autoApply: true
+```
+
+### Complete Example: MoE Model
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeploymentRequest
+metadata:
+ name: sglang-moe
+spec:
+ model: "deepseek-ai/DeepSeek-R1"
+ backend: sglang
+
+ profilingConfig:
+ profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
+ config:
+ sla:
+ isl: 2048
+ osl: 512
+ ttft: 300.0
+ itl: 25.0
+
+ hardware:
+ num_gpus_per_node: 8
+ max_num_gpus_per_engine: 32
+
+ engine:
+ is_moe_model: true # Enable MoE profiling mode
+
+ deploymentOverrides:
+ workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
+
+ autoApply: true
+```
+
+## Troubleshooting
+
+### Profiling Takes Too Long
+
+**Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only):
+```yaml
+sweep:
+ use_ai_configurator: true
+```
+
+**Solution 2**: Reduce search space:
+```yaml
+config:
+  hardware:
+    min_num_gpus_per_engine: 4 # Skip TP1, TP2
+    max_num_gpus_per_engine: 8 # Don't test beyond TP8
+```
+
+### SLA Cannot Be Met
+
+**Symptoms**: Profiler reports no configuration meets targets
+
+**Solutions:**
+1. Relax SLA targets (increase TTFT/ITL)
+2. Add more GPU resources
+3. Try a different backend
+4. Use a smaller model
+
+### AI Configurator: Attention Head Constraint Error
+
+**Symptoms**: Profiling fails with error:
+```
+AssertionError: num_heads should be divisible by tp_size and the division result should be >= 4
+```
+
+**Cause**: AI Configurator requires **≥4 attention heads per GPU**. Small models with few heads cannot use high TP sizes.
+
+**Affected Models:**
+- **Qwen3-0.6B** (16 heads): Max TP = 4 ❌ Fails at TP=8
+- **GPT-2** (12 heads): Max TP = 3
+- Most models **\<1B parameters**: May hit this constraint
+
+**Solution**: Limit `max_num_gpus_per_engine` in your DGDR:
+
+```yaml
+profilingConfig:
+ profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"
+ config:
+ hardware:
+ max_num_gpus_per_engine: 4 # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
+ sweep:
+ use_ai_configurator: true
+ aic:
+ system: h200_sxm
+ model_name: QWEN3_0_6B
+```
+
+**Calculate Max TP**: `max_tp = num_attention_heads / 4`
+
+> **Note**: This is an AI Configurator limitation. Online profiling doesn't have this constraint.
+
+### Image Pull Errors
+
+**Symptoms**: `ErrImagePull` or `ImagePullBackOff`
+
+**Solution**: Ensure image pull secrets are configured:
+```bash
+kubectl create secret docker-registry nvcr-imagepullsecret \
+ --docker-server=nvcr.io \
+ --docker-username='$oauthtoken' \
+  --docker-password=<NGC_API_KEY> \
+  --namespace <NAMESPACE>
+```
+
+### Out of Memory During Profiling
+
+**Symptoms**: OOM errors in profiling jobs
+
+**Solutions:**
+1. Reduce `gpu_memory_utilization` in engine config
+2. Reduce `--max-context-length`
+3. Skip larger TP configurations
+4. Use fewer GPUs per test
+
+### Unsupported Parallelization Mapping in Backend
+
+**Symptoms**: Start-time or runtime error in the backend. For example, a prime number of attention heads restricts TP size to 1 (i.e., falcon-7b with 71 attention heads). Or some backends do not support different TP sizes for prefill and decode.
+
+**Solutions:**
+1. Contact the backend to add support for the use cases and bump backend version in dynamo.
+2. Restrict the max and min number of GPUs per engine to the supported range.
+
+## Next Steps
+
+- **Deploy with DGDR**: See [Quick Start Guide](../planner/sla-planner-quickstart.md)
+- **Understand SLA Planner**: Read [SLA Planner Deep Dive](../planner/sla-planner.md)
+- **Monitor Deployments**: Set up [Observability](../kubernetes/observability/metrics.md)
+- **Optimize Performance**: See [Performance Tuning](../performance/tuning.md)
+
+## Related Documentation
+
+- [DGDR API Reference](../kubernetes/api-reference.md)
+- [SLA Planner Quick Start](../planner/sla-planner-quickstart.md)
+- [SLA Planner Architecture](../planner/sla-planner.md)
+- [Profiler Arguments Reference](https://github.com/ai-dynamo/dynamo/tree/main/benchmarks/profiler/utils/profiler_argparse.py)
diff --git a/fern/pages/design-docs/architecture.md b/fern/pages/design-docs/architecture.md
new file mode 100644
index 00000000000..e43e41ead89
--- /dev/null
+++ b/fern/pages/design-docs/architecture.md
@@ -0,0 +1,99 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "High Level Architecture"
+---
+
+Dynamo is NVIDIA's high-throughput, low-latency inference framework that's designed to serve generative AI and reasoning models in multi-node distributed environments. It's inference engine agnostic, supporting TRT-LLM, vLLM, SGLang and others, while capturing essential LLM capabilities:
+
+- **Disaggregated prefill & decode inference**: Maximizes GPU throughput and helps you balance throughput and latency
+- **Dynamic GPU scheduling**: Optimizes performance based on real-time demand
+- **LLM-aware request routing**: Eliminates unnecessary KV cache recomputation
+- **Accelerated data transfer**: Reduces inference response time using NIXL
+- **KV cache offloading**: Uses multiple memory hierarchies for higher system throughput and lower latency
+
+Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, Open Source Software (OSS)-first development approach.
+
+## Motivation behind Dynamo
+
+Scaling inference for generative AI and reasoning models presents complex challenges in three key areas: performance, correctness, and efficiency. Here's what we're solving:
+
+There are multi-faceted challenges:
+
+- *Difficult UX*: User experience is critical for distributed inference runtimes because managing large-scale inference systems is already complex, and poor usability further complicates matters. Developers need a clear, intuitive way to define, optimize, and update inference execution without wrestling with low-level infrastructure details. Without simple UX, inference runtimes remain inaccessible, prone to errors, and inefficient, hindering model deployment and innovation. A modern distributed inference stack must consider usability at its core—empowering developers to scale AI effortlessly for agentic workflows while ensuring correctness and performance.
+
+- *GPU underutilization*: Traditional monolithic inference pipelines often leave GPUs idle due to the imbalance between prefill and decode stages. Prefill (which generates large prompt embeddings) is highly compute-intensive, while decode (which generates tokens) is latency-sensitive. A disaggregated approach that separates prefill and decode ensures optimal GPU utilization and increases overall throughput ([DistServe](https://arxiv.org/abs/2401.09670)).
+
+- *Expensive KV cache re-computation*: When requests aren't efficiently routed, KV caches (intermediate states of transformer model) often get flushed and recomputed, leading to wasted computation cycles and increased latency. KV-aware request routing eliminates redundant KV cache regeneration, significantly boosting efficiency ([DeepSeek](https://arxiv.org/abs/2501.12948)).
+
+- *Memory bottlenecks*: Large-scale inference workloads demand extensive KV cache storage, which can quickly overwhelm GPU memory capacity. KV cache offloading across memory hierarchies (HBM, DDR, NVMe or remote storage) enables models to scale beyond GPU memory limits and speeds up latency. ([Mooncake](https://kvcache-ai.github.io/Mooncake/design/mooncake-store.html), [AIBrix](https://blog.vllm.ai/2025/02/21/aibrix-release.html), [LMCache](https://lmcache.ai/))
+
+- *Fluctuating demand and inefficient GPU allocation*: Inference workloads are use-case specific and dynamic—demand surges are inherently unpredictable, yet traditional serving stacks allocate GPUs statically. Dynamic GPU scheduling ensures that resources are allocated based on real-time demand, preventing over-provisioning and improving utilization ([AzureTrace](https://github.com/Azure/AzurePublicDataset))
+
+- *Inefficient data transfer*: Distributed inference workloads introduce unique and highly dynamic communication patterns that differ fundamentally from training. Unlike training, where worker roles remain largely static, inference requires real-time worker scaling, dynamic load balancing, and adaptive memory management—necessitating a communication layer that can efficiently handle these evolving requirements. Contemporary libraries are built for static, synchronous operations and lack the dynamicity needed for inference serving. While UCX provides high-performance networking, it requires deep networking expertise to configure correctly, making it impractical for broad inference use cases. Developers need a library optimized for inference workloads that can abstract heterogeneous memory (remote memory or storage) and dynamically select the best transport mechanism via a unified API.
+
+To address the growing demands of distributed inference serving, NVIDIA introduces Dynamo. This innovative product tackles key challenges in scheduling, memory management, and data transfer. Dynamo employs KV-aware routing for optimized decoding, leveraging existing KV caches. For efficient global memory management at scale, it strategically stores and evicts KV caches across multiple memory tiers—GPU, CPU, SSD, and object storage—enhancing both time-to-first-token and overall throughput. Dynamo features NIXL (NVIDIA Inference tranXfer Library), a new data transfer engine designed for dynamic scaling and low-latency storage access.
+
+## Key benefits
+
+The following diagram outlines Dynamo's high-level architecture. To enable large-scale distributed and disaggregated inference serving, Dynamo includes five key features:
+
+- [Dynamo Disaggregated Serving](disagg-serving.md)
+- [Dynamo Smart Router](../router/kv-cache-routing.md)
+- [Dynamo KV Cache Block Manager](../kvbm/kvbm-intro.md)
+- [Planner](../planner/planner-intro.md)
+- [NVIDIA Inference Transfer Library (NIXL)](https://github.com/ai-dynamo/nixl/blob/main/docs/nixl.md)
+
+Every component in the Dynamo architecture is independently scalable and portable. The API server can adapt to task-specific deployment. A smart router processes user requests to route them to the optimal worker for performance. Specifically, for Large Language Models (LLMs), Dynamo employs KV cache-aware routing, which directs requests to the worker with the highest cache hit rate while maintaining load balance, expediting decoding. This routing strategy leverages a KV cache manager that maintains a global radix tree registry for hit rate calculation. The KV cache manager also oversees a multi-tiered memory system, enabling rapid KV cache storage and eviction. This design results in substantial TTFT reductions, increased throughput, and the ability to process extensive context lengths.
+
+
+
+Dynamo enables dynamic worker scaling, responding to real-time deployment signals. These signals, captured and communicated through an event plane, empower the Planner to make intelligent, zero-downtime adjustments. For instance, if Dynamo detects an increase in requests with long input sequences, the Planner automatically scales up prefill workers to meet the heightened demand.
+
+Beyond efficient event communication, data transfer across multi-node deployments is crucial at scale. To address this, Dynamo utilizes NIXL, a technology designed to expedite transfers through reduced synchronization and intelligent batching. This acceleration is particularly vital for disaggregated serving, ensuring minimal latency when prefill workers pass KV cache data to decode workers.
+
+Dynamo prioritizes seamless integration. Its modular design enables it to work harmoniously with your existing infrastructure and preferred open-source components. To achieve optimal performance and extensibility, Dynamo leverages the strengths of both Rust and Python. We built critical performance-sensitive modules with Rust for speed, memory safety, and robust concurrency. Meanwhile, we used Python for its flexibility, enabling rapid prototyping and effortless customization.
+
+## Performance benefits of key features
+
+### Disaggregated serving
+
+Disaggregating prefill and decode boosts performance, gaining efficiency when more GPUs are involved in inference. For example, for Llama 70B, single-node tests show a 30% throughput/GPU improvement, while two-node setups achieve over 2X gains due to better parallelization.
+
+
+
+* Tested on H100s with R1 Distilled Llama 70B model FP8 using vLLM. 3K ISL/ 150 OSL
+
+
+The disaggregation of prefill and decode phases offers valuable flexibility. Since these phases directly correlate with time-to-first-token (TTFT) and inter-token latency (ITL) respectively, adjusting worker allocation can provide tailored performance. This enables optimization for specific service level agreements (SLAs), whether prioritizing faster TTFT, lower ITL, or higher throughput.
+
+### KV aware routing
+
+
+
+* Tested with 100K requests to R1 using R1 Distilled Llama 70B FP8 on 2 nodes of H100s. Avg 4K ISL / 800 OSL
+
+
+Existing routing methods, including load-based routing, overlook the specific properties of LLMs that could improve performance. Addressing this, routing user queries to workers with the highest KV cache hit rate (rather than simply the least busy node) allows for immediate processing, even under heavy load. The preceding figures illustrate the effectiveness of KV aware routing on 100,000 real R1 user queries, achieving a 3x improvement in TTFT and a 2x reduction in average request latency. Depending on traffic, this approach can also enhance throughput.
+
+### KV cache manager
+
+The Dynamo KV Block Manager (KVBM) enables KV cache offloading to system CPU memory, local SSDs, and network-attached storage, allowing more KV blocks to be reused instead of recomputed. In many cases, KV transfer is faster than recomputation, so KVBM helps reduce time-to-first-token (TTFT). The following plot highlights the performance gains achieved through CPU memory offloading. In a scenario involving 20 multi-turn conversations with 15 users, KVBM with CPU memory offloading achieved a 2.2×–12× improvement in TTFT (depending on QPS), demonstrating benefits that extend beyond basic prefix caching.
+
+
+* Tested with different QPS using Qwen3-8B on H100. Avg 20K ISL / 100 OSL.
+
+### NVIDIA Inference Transfer Library (NIXL)
+
+NIXL streamlines data transfer through simplified synchronization and batching and simplified source and destination abstractions. NIXL can abstract data movement across different types of memory and fast storage, whereas other data transfer libraries typically support a single tier of memory. These enhancements yield significant performance gains, accelerating both time-to-first-token (TTFT) and throughput.
+
+## Acknowledgements
+
+We'd like to acknowledge several open source software stacks that motivated our creation of Dynamo.
+
+- vLLM and vLLM-project
+- SGLang
+- DistServe
+- Mooncake
+- AIBrix
+- BentoML
diff --git a/fern/pages/design-docs/disagg-serving.md b/fern/pages/design-docs/disagg-serving.md
new file mode 100644
index 00000000000..5e16c1cec78
--- /dev/null
+++ b/fern/pages/design-docs/disagg-serving.md
@@ -0,0 +1,105 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Dynamo Disaggregation: Separating Prefill and Decode for Enhanced Performance"
+---
+
+The prefill and decode phases of LLM requests have different computation characteristics and memory footprints. Disaggregating these phases into specialized LLM engines allows for better hardware allocation, improved scalability, and overall enhanced performance. For example, using a larger TP for the memory-bound decoding phase while a smaller TP for the computation-bound prefill phase allows both phases to be computed efficiently. In addition, for requests with long context, separating their prefill phase into dedicated prefill engines allows the ongoing decoding requests to be efficiently processed without being blocked by these long prefills.
+
+Disaggregated execution of a request has three main steps:
+1. Prefill engine computes prefill phase and generates KV cache
+2. Prefill engine transfers the KV cache to decode engine, and
+3. Decode engine computes decode phase.
+
+However, not all requests’ prefill phases need to be computed in the remote prefill engine. If the prefill is short or the decode engine has a high prefix cache hit, often it is more efficient to prefill locally in the decode engine. The disaggregation design in Dynamo accounts for all these scenarios and features a flexible framework that delivers strong performance across various conditions.
+
+
+## Design
+
+```mermaid
+sequenceDiagram
+ participant D as Worker
+ participant Q as PrefillQueue
+ participant P as PrefillWorker
+
+ Note over D: Request is routed to decode
+ D->>D: Decide if prefill should be done locally or remotely
+
+ D->>D: Allocate KV blocks
+ D->>Q: Put RemotePrefillRequest on the queue
+
+ P->>Q: Pull request from the queue
+ P-->>D: Read cached KVs from Decode
+
+ D->>D: Decode other requests
+ P->>P: Run prefill
+ P-->>D: Write prefilled KVs into allocated blocks
+ P->>D: Send completion notification
+ Note over D: Notification received when prefill is done
+ D->>D: Schedule decoding
+```
+
+There are four main components in Dynamo disaggregation:
+- Worker: execute prefill and decode requests
+- Prefill worker: execute prefill requests only
+- Disaggregated router: decide whether to prefill locally or remotely
+- Prefill queue: cache and load balance the remote prefill requests
+
+When a worker receives a request, it first decides if the prefill should be done locally or remotely using the disaggregated router and allocates the KV blocks. If prefilling remotely, it then pushes a remote prefill request to the prefill queue. After that, the prefill worker pulls from the prefill queue, reads KV blocks with prefix cache hit from the worker, computes the prefill, and writes the computed KV blocks back to the worker. Finally, the worker completes the remaining decoding.
+
+## Conditional Disaggregation
+
+Not all requests’ prefill phases need to be computed in the remote prefill engine. The disaggregated router decides whether the prefill phase of a request should be computed locally or remotely at runtime based on the prefill length and prefill queue status. Specifically, a request is sent to the remote prefill engine if the following two conditions are met:
+1. The absolute prefill length without prefix cache hit is greater than a preset threshold. On the one hand, if the prefill length of a request is short, it can be efficiently computed in the decode engine by piggybacking chunked prefill requests with ongoing decode requests. On the other hand, if the prefix cache hit is long, the prefill becomes memory bound and hence can be more efficiently computed in the decode engine.
+2. The number of remote prefill requests in the prefill queue is less than a preset threshold. When the prefill queue has a large number of prefill requests, it indicates that the prefill workers are lagging behind, and it is better to prefill locally until more prefill workers join.
+
+Conditional disaggregation allows Dynamo to achieve high performance for dynamic workloads
+
+## Prefill Queue
+
+Prefill requests are computation bound (except for very short prefills) and should be executed in their dedicated iterations without any other requests to ensure fast TTFT. To balance the load across multiple prefill engines, Dynamo adopts a global prefill queue where workers push remote prefill requests and prefill workers pull and complete the requests one by one. The global prefill queue is implemented based on NATS stream to ensure high performance and availability.
+
+## Efficient KV Transfer
+
+```mermaid
+sequenceDiagram
+ participant D as Worker
+ participant SD as WorkerScheduler
+ participant SP as PrefillWorkerScheduler
+ participant P as PrefillWorker
+
+ Note over SD: KV blocks allocated
+ SD->>SP: Issue remote prefill request with KV block descriptors via prefill queue
+ SP->>P: Add to in-flight batch
+
+    P-->>D: Remote NIXL read for prefix hit KV blocks (non-blocking)
+ P->>P: Execute prefill
+    P-->>D: Remote NIXL write for computed KV blocks (non-blocking)
+
+ P->>SP: Notify finish
+ SP->>SD: Notify finish
+ SD->>D: Add to in-flight batch
+
+ D->>D: Execute decode
+```
+
+The key to high-performance disaggregation is efficient KV transfer. Dynamo leverages NIXL to transfer KV cache directly from the VRAM of the prefill engine to the VRAM of the decode engine. In addition, the KV transfer is non-blocking, allowing the GPU forward pass to serve other requests in addition to the KV transfer.
+
+After the KV blocks are allocated, the worker scheduler sends the remote prefill requests, which contain the memory descriptors for the allocated KV blocks, to the prefill worker scheduler via the prefill queue. This allows the prefill worker to read and write from the remote KV blocks without explicit handling in the remote worker engine, thanks to the RDMA read and write NIXL operations. Once the remote prefill is done, the worker scheduler simply adds the decode request to the worker's in-flight batch. This allows workers to execute forward passes of ongoing decode/prefill requests while waiting for the remote prefill to finish.
+
+To reduce the size of memory descriptors, Dynamo applies two optimizations:
+1. After each worker finishes its initialization and allocates all the KV cache pool, it stores the memory descriptor of all blocks (which is also referred to as the NIXL metadata) in ETCD, a distributed key-value store. Prefill workers load and cache the memory descriptors in one worker at the first time that it serves a remote prefill request issued by this worker. Thus, only the KV block ID instead of the full memory descriptor is needed when issuing the remote prefill request.
+
+2. Dynamo prompts the memory allocator in the prefill engine to allocate contiguous blocks and merges contiguous blocks into larger blocks to reduce the total number of KV blocks.
+
+For decode and prefill with different KV layouts (i.e., due to different TP), Dynamo applies a high-performance kernel that transposes the KV blocks into their matching layout in the KV receiver after the NIXL reads and before the NIXL writes.
+
+## Runtime-Reconfigurable xPyD
+
+The prefill queue and NIXL-based KV transfer design in Dynamo naturally allows runtime-reconfigurable xPyD. Workers and prefill workers can be added and removed at runtime without any system-level synchronization or overheads. New and existing prefill workers both just simply pull remote prefill requests from NATS prefill queue. The NIXL metadata of the new or existing workers (for new prefill workers) are lazily loaded and cached when necessary. Specifically, adding and removing workers and prefill workers is as easy as:
+
+- Add worker: add NIXL metadata in ETCD.
+- Remove worker: flush engine and delete NIXL metadata in ETCD.
+- Add prefill worker: no explicit action needed.
+- Delete prefill worker: flush engine.
+
diff --git a/fern/pages/design-docs/distributed-runtime.md b/fern/pages/design-docs/distributed-runtime.md
new file mode 100644
index 00000000000..e3c45bb70c6
--- /dev/null
+++ b/fern/pages/design-docs/distributed-runtime.md
@@ -0,0 +1,67 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Dynamo Distributed Runtime"
+---
+
+## Overview
+
+Dynamo's `DistributedRuntime` is the core infrastructure in the framework that enables distributed communication and coordination between different Dynamo components. It is implemented in Rust (`/lib/runtime`) and exposed to other programming languages via bindings (e.g., Python bindings can be found in `/lib/bindings/python`). `DistributedRuntime` follows a hierarchical structure:
+
+- `DistributedRuntime`: This is the highest level object that exposes the distributed runtime interface. It maintains connection to external services (e.g., etcd for service discovery and NATS for messaging) and manages lifecycle with cancellation tokens.
+- `Namespace`: A `Namespace` is a logical grouping of components that isolate between different model deployments.
+- `Component`: A `Component` is a discoverable object within a `Namespace` that represents a logical unit of workers.
+- `Endpoint`: An `Endpoint` is a network-accessible service that provides a specific service or function.
+
+While theoretically each `DistributedRuntime` can have multiple `Namespace`s as long as their names are unique (similar logic also applies to `Component/Namespace` and `Endpoint/Component`), in practice, each Dynamo component is typically deployed in its own process and thus has its own `DistributedRuntime` object. However, the components all share the same namespace so that they can discover each other.
+
+For example, a typical deployment configuration (like `examples/backends/vllm/deploy/agg.yaml` or `examples/backends/sglang/deploy/agg.yaml`) has multiple workers:
+
+- `Frontend`: Starts an HTTP server and handles incoming requests. The HTTP server routes all requests to the `Processor`.
+- `Processor`: When a new request arrives, `Processor` applies the chat template and performs the tokenization.
+Then, it routes the request to the `Worker`.
+- `Worker` components (e.g., `VllmDecodeWorker`, `SGLangDecodeWorker`, `TrtllmWorker`): Perform the actual computation using their respective engines (vLLM, SGLang, TensorRT-LLM).
+
+Since the workers are deployed in different processes, each of them has its own `DistributedRuntime`. Within their own `DistributedRuntime`, they all share the same `Namespace` (e.g., `vllm-agg`, `sglang-agg`). Then, under their namespace, they have their own `Component`s: `Frontend` uses the `make_engine` function which handles HTTP serving and routing automatically, while worker components create components with names like `worker`, `decode`, or `prefill` and register endpoints like `generate`, `flush_cache`, or `clear_kv_blocks`. The `Frontend` component doesn't explicitly create endpoints - instead, the `make_engine` function handles the HTTP server and worker discovery. Worker components create their endpoints programmatically using the `component.endpoint()` method. Their `DistributedRuntime`s are initialized in their respective main functions, their `Namespace`s are configured in the deployment YAML, their `Component`s are created programmatically (e.g., `runtime.namespace("dynamo").component("worker")`), and their `Endpoint`s are created using the `component.endpoint()` method.
+
+## Initialization
+
+In this section, we explain what happens under the hood when `DistributedRuntime/Namespace/Component/Endpoint` objects are created. There are two modes for `DistributedRuntime` initialization: dynamic and static. In static mode, components and endpoints are defined using known addresses and do not change during runtime. In dynamic mode, components and endpoints are discovered through the network and can change during runtime. We focus on the dynamic mode in the rest of this document. Static mode is essentially dynamic mode without registration and discovery, and hence does not rely on etcd.
+
+:::caution
+The hierarchy and naming in etcd and NATS may change over time, and this document might not reflect the latest changes. Regardless of such changes, the main concepts would remain the same.
+:::
+
+- `DistributedRuntime`: When a `DistributedRuntime` object is created, it establishes connections to the following two services:
+ - etcd (dynamic mode only): for service discovery. In static mode, `DistributedRuntime` can operate without etcd.
+ - NATS (both static and dynamic mode): for messaging.
+
+ where etcd and NATS are two global services (there could be multiple etcd and NATS services for high availability).
+
+ For etcd, it also creates a primary lease and spins up a background task to keep the lease alive. All objects registered under this `DistributedRuntime` use this lease_id to maintain their life cycle. There is also a cancellation token tied to the primary lease. When the cancellation token is triggered or the background task fails, the primary lease is revoked or expires, and the key-value pairs stored with this lease_id are removed.
+- `Namespace`: `Namespace`s are primarily a logical grouping mechanism and are not registered in etcd. A `Namespace` provides the root path for all components under it.
+- `Component`: When a `Component` object is created, similar to `Namespace`, it isn't registered in etcd. When `create_service` is called, it creates a NATS service group using `{namespace_name}.{service_name}` as the service identifier and registers a service in the registry of the `Component`, where the registry is an internal data structure that tracks all services and endpoints within the `DistributedRuntime`.
+- `Endpoint`: When an Endpoint object is created and started, it performs two key registrations:
+ - NATS Registration: The endpoint is registered with the NATS service group created during service creation. The endpoint is assigned a unique subject following the naming: `{namespace_name}.{service_name}.{endpoint_name}-{lease_id_hex}`.
+ - etcd Registration: The endpoint information is stored in etcd at a path following the naming: `/services/{namespace}/{component}/{endpoint}-{lease_id}`. Note that the endpoints of different workers of the same type (i.e., two `VllmPrefillWorker`s in one deployment) share the same `Namespace`, `Component`, and `Endpoint` name. They are distinguished by their different primary `lease_id` of their `DistributedRuntime`.
+
+## Calling Endpoints
+
+Dynamo uses a `Client` object to call an endpoint. When a `Client` object is created, it is given the names of the `Namespace`, `Component`, and `Endpoint`. It then sets up an etcd watcher to monitor the prefix `/services/{namespace}/{component}/{endpoint}`. The etcd watcher continuously updates the `Client` with the information, including the `lease_id` and NATS subject, of the available `Endpoint`s.
+
+The user can decide which load balancing strategy to use when calling the `Endpoint` from the `Client`, which is done in [push_router.rs](https://github.com/ai-dynamo/dynamo/tree/main/lib/runtime/src/pipeline/network/egress/push_router.rs). Dynamo supports three load balancing strategies:
+
+- `random`: randomly select an endpoint to hit
+- `round_robin`: select endpoints in round-robin order
+- `direct`: direct the request to a specific endpoint by specifying the `lease_id` of the endpoint
+
+After selecting which endpoint to hit, the `Client` sends the serialized request to the NATS subject of the selected `Endpoint`. The `Endpoint` receives the request and creates a TCP response stream using the connection information from the request, which establishes a direct TCP connection to the `Client`. Then, as the worker generates the response, it serializes each response chunk and sends the serialized data over the TCP connection.
+
+## Examples
+
+We provide native Rust and Python (through bindings) examples for basic usage of `DistributedRuntime`:
+
+- Rust: `/lib/runtime/examples/`
+- Python: We also provide complete examples of using `DistributedRuntime`. Please refer to the engines in `components/src/dynamo` for full implementation details.
+
+
diff --git a/fern/pages/design-docs/dynamo-flow.md b/fern/pages/design-docs/dynamo-flow.md
new file mode 100644
index 00000000000..df79a2cfa5f
--- /dev/null
+++ b/fern/pages/design-docs/dynamo-flow.md
@@ -0,0 +1,252 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Dynamo Architecture Flow"
+---
+
+This diagram shows the NVIDIA Dynamo disaggregated inference system as implemented in [examples/backends/vllm](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/vllm). Color-coded flows indicate different types of operations:
+
+## 🔵 Main Request Flow (Blue)
+The primary user journey through the system:
+
+1. **Discovery (S1)**: Client discovers the service endpoint
+2. **Request (S2)**: HTTP client sends API request to Frontend (OpenAI-compatible server on port 8000)
+3. **Validate (S3)**: Frontend forwards request to Processor for validation and routing
+4. **Route (S3)**: Processor routes the validated request to appropriate Decode Worker
+
+## 🟠 Decision and Allocation Flow (Orange)
+The system's intelligent routing and resource allocation:
+
+4. **Query (S4)**: Decode Worker queries for prefix cache hits to optimize processing
+5. **Disagg Decision (S5)**: Based on prefill length and queue size, the system decides whether it needs remote prefill
+5a. **Allocate (S5a)**: Decode Worker pre-allocates KV cache blocks in its local GPU memory
+6. **Queue (S6)**: If remote prefill is required, the system puts the RemotePrefillRequest with block IDs into the PrefillQueue
+
+## 🟢 Prefill Worker Flow (Green)
+The dedicated prefill processing pipeline:
+
+7. **NATS Pull (S7)**: PrefillQueue uses a NATS consumer group to distribute work to available PrefillWorkers
+8. **Load Metadata (S8)**: PrefillWorker loads NIXL metadata from ETCD to establish GPU communication
+9. **Prefill (S9)**: Worker executes the prefill computation on the input tokens
+10. **NIXL Transfer (S10)**: Direct GPU-to-GPU transfer writes the prefilled KV cache to the Decode Worker's pre-allocated blocks
+
+## 🟣 Completion Flow (Purple)
+The response generation and delivery:
+
+11. **Notify (S11)**: PrefillWorker sends completion notification to Decode Worker
+12. **Decode (S12)**: Decode Worker decodes from its local KV cache containing prefilled data
+13. **Response (S13)**: The system sends the generated response to the Processor for post-processing, then through the Frontend to the Client
+
+## 🔗 Infrastructure Connections (Dotted lines)
+Coordination and messaging support:
+
+### ETCD Connections (Gray, dotted)
+- **Frontend, Processor, Planner**: Service discovery and registration
+- **Decode Worker, PrefillWorker**: NIXL metadata storage for GPU communication setup
+
+### NATS Connections (Teal, dotted)
+- **PrefillQueue**: JetStream consumer group for reliable work distribution
+- **Processor**: Load balancing across workers
+
+### Planning Connections (Gold, dotted)
+- **Frontend → Planner**: Metrics collection for auto-scaling decisions
+- **Planner → Workers**: Resource scaling commands for both Decode Worker and PrefillWorker
+
+## Technical Implementation Details
+
+### NIXL (NVIDIA Interchange Library):
+- Enables high-speed GPU-to-GPU data transfers using NVLink/PCIe
+- Decode Worker publishes GPU metadata to ETCD for coordination
+- PrefillWorker loads metadata to establish direct communication channels
+- Block-based transfers (64–128 tokens per block) for efficient batching
+
+### Disaggregated KV Cache:
+- Each Decode Worker maintains local KV cache in its GPU memory
+- No shared storage bottlenecks—all transfers are direct worker-to-worker
+- Pre-allocated blocks ensure deterministic memory layout and performance
+
+```mermaid
+%%{init: {'theme':'dark', 'themeVariables': {'primaryColor': '#f4f4f4', 'primaryTextColor': '#333333', 'primaryBorderColor': '#888888', 'lineColor': '#4A90E2', 'sectionBkgColor': '#f9f9f9', 'altSectionBkgColor': '#eeeeee', 'tertiaryColor': '#f0f0f0', 'background': '#ffffff', 'mainBkg': '#f8f8f8', 'secondaryColor': '#f4f4f4', 'nodeTextColor': '#333333'}, 'flowchart': {'htmlLabels': true, 'curve': 'basis'}, 'fontFamily': 'Inter, system-ui, -apple-system, "Segoe UI", Roboto, sans-serif', 'fontSize': '18px'}%%
+graph TD
+ %% Top Layer - Client & Frontend
+ Client["HTTP Client"]
+ S1[["1 DISCOVERY"]]
+ Frontend["Frontend OpenAI Compatible Server Port 8000"]
+ S2[["2 REQUEST"]]
+
+ %% Processing Layer
+ Processor["Processor Request Handler & Router"]
+ S3[["3 VALIDATE"]]
+
+ %% Infrastructure - Positioned strategically to minimize crossings
+ subgraph INF["Infrastructure Layer"]
+ ETCD[("ETCD Service Discovery & NIXL Metadata")]
+ NATS[("NATS Message Broker")]
+ Planner["Planner Resource Management Auto-scaling"]
+ end
+
+ %% Worker Layer - Main processing
+ subgraph WL["Worker Layer"]
+ %% VllmWorker section
+ VllmWorker["Decode Worker Handles Decoding & Disagg Decisions"]
+ S4[["4 QUERY"]]
+ S5[["5 DISAGG DECISION"]]
+ S5a[["5a ALLOCATE"]]
+ S12[["12 DECODE"]]
+ S6[["6 QUEUE"]]
+ S13[["13 RESPONSE"]]
+
+ %% Storage positioned near workers
+ LocalKVCache[("Local KV Cache Pre-allocated Blocks")]
+
+ %% Prefill System - Right side to minimize crossings
+ subgraph PS["Prefill System"]
+ PrefillQueue["Prefill Queue NATS JetStream Consumer Group"]
+ PrefillWorker["Prefill Worker Dedicated Prefill Processing (Multiple Instances)"]
+ S7[["7 NATS PULL"]]
+ S8[["8 LOAD METADATA"]]
+ S9[["9 PREFILL"]]
+ S10[["10 NIXL TRANSFER"]]
+ S11[["11 NOTIFY"]]
+ end
+ end
+
+ %% Main Request Flow (Blue) - Clean vertical flow
+ Client -.-> S1
+ S1 -->|HTTP API Call| Frontend
+ Frontend -.-> S2
+ S2 -->|Process & Validate| Processor
+ Processor -.-> S3
+ S3 -->|Route to Worker| VllmWorker
+
+ %% VllmWorker Internal Flow (Orange)
+ VllmWorker -.-> S4
+ S4 -->|Query Prefix Cache Hit| S5
+ S5 -->|Prefill Length & Queue Check| S5a
+ S5a -->|Continue to Decode| S12
+
+ %% Allocation & Queuing (Orange) - Minimize crossings
+ S5a -->|Allocate KV Cache Blocks| LocalKVCache
+ VllmWorker --> S6
+ S6 -->|Put RemotePrefillRequest| PrefillQueue
+
+ %% Prefill Worker Flow (Green) - Self-contained within PS
+ PrefillQueue -.-> S7
+ S7 -->|Consumer Group Pull| PrefillWorker
+ PrefillWorker -.-> S8
+ PrefillWorker -.-> S9
+ S9 -->|Execute Prefill| S10
+ S10 -->|Direct GPU Transfer| LocalKVCache
+ PrefillWorker --> S11
+
+ %% Return Flow (Purple) - Clean return path
+ S11 -->|Completion Notification| S12
+ S12 -->|Decode from KV Cache| S13
+ S13 -->|Post-process Response| Processor
+ Processor -->|HTTP Response| Frontend
+ Frontend -->|Final Response| Client
+
+ %% Infrastructure Connections - Organized to avoid crossings
+ %% ETCD Connections - Grouped by proximity
+ Frontend -.->|Service Discovery| ETCD
+ Processor -.->|Service Discovery| ETCD
+ VllmWorker -.->|NIXL Metadata| ETCD
+ PrefillWorker -.->|NIXL Metadata| ETCD
+ S8 -.->|Load NIXL Metadata| ETCD
+ Planner -.->|Service Discovery| ETCD
+
+ %% NATS Connections - Direct to queue system
+ PrefillQueue -.->|JetStream| NATS
+ Processor -.->|Load Balancing| NATS
+
+ %% Planning Connections - Strategic positioning
+ Frontend -.->|Metrics| Planner
+ Planner -.->|Auto-scaling| VllmWorker
+ Planner -.->|Auto-scaling| PrefillWorker
+
+ %% Styling - Each component with unique colors
+ classDef client fill:#e8f5e8,stroke:#2E7D32,stroke-width:3px
+ classDef frontend fill:#fff3e0,stroke:#F57C00,stroke-width:3px
+ classDef processor fill:#f3e5f5,stroke:#7B1FA2,stroke-width:3px
+ classDef worker fill:#e3f2fd,stroke:#1565C0,stroke-width:3px
+ classDef prefillQueue fill:#fff8e1,stroke:#E65100,stroke-width:3px
+ classDef prefillWorker fill:#fce4ec,stroke:#C2185B,stroke-width:3px
+ classDef prefillBox fill:#eceff1,stroke:#455A64,stroke-width:3px
+ classDef planner fill:#f1f8e9,stroke:#558B2F,stroke-width:3px
+ classDef storage fill:#e0f2f1,stroke:#00695C,stroke-width:3px
+ classDef etcd fill:#fff9c4,stroke:#F9A825,stroke-width:3px
+ classDef nats fill:#ede7f6,stroke:#5E35B1,stroke-width:3px
+ classDef infraLayer fill:#fff9c4,stroke:#FFC107,stroke-width:3px
+ classDef workerLayer fill:#e3f2fd,stroke:#2196F3,stroke-width:3px
+
+
+ class Client client
+ class Frontend frontend
+ class Processor processor
+ class VllmWorker worker
+ class PrefillQueue prefillQueue
+ class PrefillWorker prefillWorker
+ class Planner planner
+ class LocalKVCache storage
+ class ETCD etcd
+ class NATS nats
+ class PS prefillBox
+ class INF infraLayer
+ class WL workerLayer
+
+
+
+ %% Flow Colors - Different line styles to reduce visual clutter
+ %% Main Request Flow - Blue (solid)
+ linkStyle 0 stroke:#1565C0,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 1 stroke:#1565C0,stroke-width:4px
+ linkStyle 2 stroke:#1565C0,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 3 stroke:#1565C0,stroke-width:4px
+ linkStyle 4 stroke:#1565C0,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 5 stroke:#1565C0,stroke-width:4px
+
+ %% Decision & Allocation Flow - Orange (mixed)
+ linkStyle 6 stroke:#E65100,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 7 stroke:#E65100,stroke-width:4px
+ linkStyle 8 stroke:#E65100,stroke-width:4px
+ linkStyle 9 stroke:#E65100,stroke-width:3px,stroke-dasharray: 3 3
+
+ %% KV Cache & Queue - Orange (solid)
+ linkStyle 10 stroke:#E65100,stroke-width:4px
+ linkStyle 11 stroke:#E65100,stroke-width:4px
+ linkStyle 12 stroke:#E65100,stroke-width:4px
+
+ %% Prefill Worker Flow - Green (mixed)
+ linkStyle 13 stroke:#2E7D32,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 14 stroke:#2E7D32,stroke-width:4px
+ linkStyle 15 stroke:#2E7D32,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 16 stroke:#2E7D32,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 17 stroke:#2E7D32,stroke-width:4px
+ linkStyle 18 stroke:#2E7D32,stroke-width:4px
+ linkStyle 19 stroke:#2E7D32,stroke-width:4px
+
+ %% Completion Flow - Purple (mixed)
+ linkStyle 20 stroke:#6A1B9A,stroke-width:4px
+ linkStyle 21 stroke:#6A1B9A,stroke-width:3px,stroke-dasharray: 3 3
+ linkStyle 22 stroke:#6A1B9A,stroke-width:4px
+ linkStyle 23 stroke:#6A1B9A,stroke-width:4px
+ linkStyle 24 stroke:#6A1B9A,stroke-width:4px
+
+ %% Infrastructure Flows - Lighter and dotted to reduce visual noise
+ %% ETCD Connections - Gray (dotted, thinner)
+ linkStyle 25 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 26 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 27 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 28 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 29 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 30 stroke:#757575,stroke-width:2px,stroke-dasharray: 8 8
+
+ %% NATS Connections - Teal (dotted, thinner)
+ linkStyle 31 stroke:#26A69A,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 32 stroke:#26A69A,stroke-width:2px,stroke-dasharray: 8 8
+
+ %% Planning Connections - Gold (dotted, thinner)
+ linkStyle 33 stroke:#FFA726,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 34 stroke:#FFA726,stroke-width:2px,stroke-dasharray: 8 8
+ linkStyle 35 stroke:#FFA726,stroke-width:2px,stroke-dasharray: 8 8
+```
diff --git a/fern/pages/design-docs/event-plane.md b/fern/pages/design-docs/event-plane.md
new file mode 100644
index 00000000000..2e1011e1ec5
--- /dev/null
+++ b/fern/pages/design-docs/event-plane.md
@@ -0,0 +1,466 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: "Event Plane Architecture"
+---
+
+This document describes Dynamo's event plane architecture, which handles service discovery, coordination, and event distribution using etcd and NATS.
+
+## Overview
+
+Dynamo's coordination layer adapts to the deployment environment:
+
+| Deployment | Service Discovery | KV Events | Request Plane |
+|------------|-------------------|-----------|---------------|
+| **Kubernetes** (with operator) | Native K8s (CRDs, EndpointSlices) | NATS (optional) | TCP |
+| **Bare metal / Local** (default) | etcd | NATS (optional) | TCP |
+
+
+The runtime always defaults to `kv_store` (etcd) for service discovery. Kubernetes deployments must explicitly set `DYN_DISCOVERY_BACKEND=kubernetes` - the Dynamo operator handles this automatically.
+
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│ Coordination Layer │
+│ │
+│ ┌─────────────────────────┐ ┌─────────────────────────────────┐ │
+│ │ Service Discovery │ │ NATS │ │
+│ │ │ │ (Optional) │ │
+│ │ • K8s: CRDs + API │ │ • KV Cache Events │ │
+│ │ • Bare metal: etcd │ │ • Router Replica Sync │ │
+│ │ │ │ • JetStream Persistence │ │
+│ └─────────────────────────┘ └─────────────────────────────────┘ │
+│ │
+└─────────────────────────────────────────────────────────────────────┘
+ │ │
+ ┌──────────┴──────────┐ ┌─────────┴──────────┐
+ ▼ ▼ ▼ ▼
+ ┌─────────┐ ┌─────────┐ ┌─────────┐
+ │Frontend │ │ Planner │ │ Worker │
+ └─────────┘ └─────────┘ └─────────┘
+```
+
+## Kubernetes-Native Service Discovery
+
+When running on Kubernetes with the Dynamo operator, service discovery uses native Kubernetes resources instead of etcd.
+
+### Configuration
+
+The operator explicitly sets:
+```bash
+DYN_DISCOVERY_BACKEND=kubernetes
+```
+
+
+This must be explicitly configured. The runtime defaults to `kv_store` in all environments.
+
+
+### How It Works
+
+1. **DynamoWorkerMetadata CRD**: Workers register their endpoints by creating/updating DynamoWorkerMetadata custom resources
+2. **EndpointSlices**: Used to signal readiness status to the system
+3. **K8s API Watches**: Components watch for CRD changes to discover available endpoints
+
+### Benefits
+
+- No external etcd cluster required
+- Native integration with Kubernetes lifecycle
+- Automatic cleanup when pods terminate
+- Works with standard K8s RBAC
+
+### Environment Variables (Injected by Operator)
+
+| Variable | Description |
+|----------|-------------|
+| `DYN_DISCOVERY_BACKEND` | Set to `kubernetes` |
+| `POD_NAME` | Current pod name |
+| `POD_NAMESPACE` | Current namespace |
+| `POD_UID` | Pod unique identifier |
+
+---
+
+## etcd Architecture (Default for All Deployments)
+
+When `DYN_DISCOVERY_BACKEND=kv_store` (the global default), etcd is used for service discovery.
+
+### Connection Configuration
+
+etcd connection is configured via environment variables:
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `ETCD_ENDPOINTS` | Comma-separated etcd URLs | `http://localhost:2379` |
+| `ETCD_AUTH_USERNAME` | Basic auth username | None |
+| `ETCD_AUTH_PASSWORD` | Basic auth password | None |
+| `ETCD_AUTH_CA` | CA certificate path (TLS) | None |
+| `ETCD_AUTH_CLIENT_CERT` | Client certificate path | None |
+| `ETCD_AUTH_CLIENT_KEY` | Client key path | None |
+
+Example:
+```bash
+export ETCD_ENDPOINTS=http://etcd-0:2379,http://etcd-1:2379,http://etcd-2:2379
+```
+
+### Lease Management
+
+Each `DistributedRuntime` maintains a primary lease with etcd:
+
+```
+┌────────────────────┐ ┌──────────────┐
+│ DistributedRuntime │◄────────│ Primary Lease │
+│ │ │ TTL: 10s │
+│ • Namespace │ └───────┬───────┘
+│ • Components │ │
+│ • Endpoints │ │ Keep-Alive
+│ │ │ Heartbeat
+└────────────────────┘ ▼
+ ┌──────────────┐
+ │ etcd │
+ └──────────────┘
+```
+
+**Lease Lifecycle:**
+
+1. **Creation**: Lease created during `DistributedRuntime` initialization
+2. **Keep-Alive**: Background task sends heartbeats at 50% of remaining TTL
+3. **Expiration**: If heartbeats stop, lease expires after TTL (10 seconds default)
+4. **Cleanup**: All keys associated with the lease are automatically deleted
+
+**Automatic Recovery:**
+
+- Reconnection with exponential backoff (50ms to 5s)
+- Deadline-based retry logic
+- Cancellation token propagation
+
+### Service Discovery
+
+Endpoints are registered in etcd for dynamic discovery:
+
+**Key Format:**
+```
+/services/{namespace}/{component}/{endpoint}/{instance_id}
+```
+
+**Example:**
+```
+/services/vllm-agg/backend/generate/694d98147d54be25
+```
+
+**Registration Data:**
+```json
+{
+ "namespace": "vllm-agg",
+ "component": "backend",
+ "endpoint": "generate",
+ "instance_id": 7587888160958628000,
+ "transport": {
+ "tcp": "192.168.1.10:9999"
+ }
+}
+```
+
+### Discovery Queries
+
+The discovery system supports multiple query patterns:
+
+| Query Type | Pattern | Use Case |
+|------------|---------|----------|
+| `AllEndpoints` | `/services/` | List all services |
+| `NamespacedEndpoints` | `/services/{namespace}/` | Filter by namespace |
+| `ComponentEndpoints` | `/services/{namespace}/{component}/` | Filter by component |
+| `Endpoint` | `/services/{namespace}/{component}/{endpoint}/` | Specific endpoint |
+
+### Watch Functionality
+
+Clients watch etcd prefixes for real-time updates:
+
+```python
+# Client watches for endpoint changes
+watcher = etcd.watch_prefix("/services/vllm-agg/backend/generate/")
+
+for event in watcher:
+ if event.type == "PUT":
+ # New endpoint registered
+ add_endpoint(event.value)
+ elif event.type == "DELETE":
+ # Endpoint removed (worker died)
+ remove_endpoint(event.key)
+```
+
+**Watch Features:**
+
+- Initial state retrieval with `get_and_watch_prefix()`
+- Automatic reconnection on stream failure
+- Revision tracking for no-event-loss guarantees
+- Event types: `PUT` (create/update) and `DELETE`
+
+### Distributed Locks
+
+etcd provides distributed locking for coordination:
+
+**Lock Types:**
+
+| Type | Key Pattern | Behavior |
+|------|-------------|----------|
+| Write Lock | `v1/{prefix}/writer` | Exclusive (no readers/writers) |
+| Read Lock | `v1/{prefix}/readers/{id}` | Shared (multiple readers) |
+
+**Operations:**
+
+```rust
+// Non-blocking write lock
+let lock = client.try_write_lock("my_resource").await?;
+
+// Blocking read lock with polling (100ms intervals)
+let lock = client.read_lock_with_wait("my_resource").await?;
+```
+
+## NATS Architecture
+
+### When NATS is Used
+
+NATS is used for:
+
+1. **KV Cache Events**: Real-time KV cache state updates for routing
+2. **Router Replica Sync**: Synchronizing router state across replicas
+3. **Legacy Request Plane**: NATS-based request transport (optional)
+
+### Configuration
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `NATS_SERVER` | NATS server URL | `nats://localhost:4222` |
+
+### Disabling NATS
+
+For deployments without KV-aware routing:
+
+```bash
+# Disable NATS and KV events
+python -m dynamo.frontend --no-kv-events
+```
+
+This enables "approximate mode" for KV routing without event persistence.
+
+### Event Publishing
+
+Components publish events to NATS subjects:
+
+```rust
+pub trait EventPublisher {
+ async fn publish(&self, event: &str, data: &[u8]) -> Result<()>;
+    async fn publish_serialized<T: Serialize>(&self, event: &str, data: &T) -> Result<()>;
+}
+```
+
+**Subject Naming:**
+```
+{base_subject}.{event_name}
+```
+
+Example:
+```
+vllm-agg.backend.kv_cache_update
+```
+
+### Event Subscription
+
+Components subscribe to events:
+
+```rust
+pub trait EventSubscriber {
+    async fn subscribe(&self, topic: &str) -> Result<Subscriber>;
+    async fn subscribe_typed<T: DeserializeOwned>(&self, topic: &str) -> Result<TypedSubscriber<T>>;
+}
+```
+
+### JetStream Persistence
+
+For durable event delivery, NATS JetStream provides:
+
+- Message persistence
+- Replay from offset
+- Consumer groups for load balancing
+- Acknowledgment tracking
+
+## Key-Value Store Abstraction
+
+Dynamo provides a unified KV store interface supporting multiple backends:
+
+### Supported Backends
+
+| Backend | Use Case | Configuration |
+|---------|----------|---------------|
+| `EtcdStore` | Production deployments | `ETCD_ENDPOINTS` |
+| `MemoryStore` | Testing, development | Default |
+| `NatsStore` | NATS-only deployments | `NATS_SERVER` |
+| `FileStore` | Local persistence | File path |
+
+### Store Interface
+
+```rust
+pub trait KvStore {
+ async fn get(&self, bucket: &str, key: &str) -> Result