diff --git a/infrastructure/Dockerfile b/infrastructure/Dockerfile
index d4f8906..4c03f63 100644
--- a/infrastructure/Dockerfile
+++ b/infrastructure/Dockerfile
@@ -1,4 +1,4 @@
-FROM public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.11.0-neuronx-py312-sdk2.27.0-ubuntu24.04
+FROM public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04
 
 # Copy startup script and make it executable
 COPY start-vllm.sh /app/start-vllm.sh
@@ -52,6 +52,23 @@ ENV KV_ROLE="kv_producer"
 ENV KV_BUFFER_SIZE="2e11"
 ENV ETCD=""
 
+# Install AWS EFA user-space libraries for NeuronConnector KV cache transfer.
+# Must run inside the container so libfabric is compiled against the container's
+# GLIBC (2.35). The host's libfabric (1.30.0) requires GLIBC_2.38 and is incompatible.
+# --skip-kmod: kernel module runs on the host, not needed inside container.
+RUN rm -f /etc/apt/sources.list.d/neuron*.list && \
+    apt-get update -y && \
+    apt-get install -y curl kmod && \
+    curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz && \
+    tar -xf aws-efa-installer-latest.tar.gz && \
+    cd aws-efa-installer && \
+    ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify && \
+    cd .. && rm -rf aws-efa-installer* && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install dependencies for neuron-proxy-server (already present in image but missing deps)
+RUN pip install quart aiohttp etcd3
+
 EXPOSE ${PORT}
 
 # Launch vLLM server using the startup script
diff --git a/infrastructure/README.md b/infrastructure/README.md
index 12da582..f15bca0 100644
--- a/infrastructure/README.md
+++ b/infrastructure/README.md
@@ -145,14 +145,16 @@ These can be passed at runtime via `--env-file` or `-e` flags:
 | `ENABLE_TOOL_CALLING` | `true` | Enable tool/function calling support |
 | `TOOL_CALL_PARSER` | `llama3_json` | Parser: `llama3_json`, `hermes`, `mistral`, etc. |
 | `ENABLE_PREFIX_CACHING` | `false` | Cache prefixes for repeated prompts |
-| `ADDITIONAL_CONFIG` | (empty) | Neuron config overrides (JSON) |
+| `OVERRIDE_NEURON_CONFIG` | (empty) | Neuron config overrides (JSON) |
 | `SPECULATIVE_CONFIG` | (empty) | Speculative decoding config (JSON) |
-| `VLLM_USE_V1` | `1` | vLLM V1 engine control |
-| `ENABLE_KV_TRANSFER` | `false` | Enable KV cache transfer (distributed) |
+| `VLLM_USE_V1` | `0` | vLLM V1 engine; must be `0` for Neuron in the 0.9.1 image |
+| `ENABLE_KV_TRANSFER` | `false` | Enable KV cache transfer (disaggregated inference) |
 | `KV_CONNECTOR` | `NeuronConnector` | KV connector type |
 | `KV_ROLE` | `kv_producer` | KV role: `kv_producer` or `kv_consumer` |
 | `KV_BUFFER_SIZE` | `2e11` | KV buffer size in bytes |
-| `ETCD` | (empty) | etcd server address for KV coordination |
+| `KV_NEURON_CORE_OFFSET` | `0` | Absolute host logical NeuronCore index for KV transfer |
+| `ETCD` | (empty) | etcd server address (`<host-ip>:8989`) |
+| `NEURON_RT_VISIBLE_CORES` | (empty) | Physical NeuronCore range for this worker (e.g. `0-31`) |
 
 ### Configuration Methods
 
@@ -238,18 +240,117 @@ ADDITIONAL_CONFIG='{"override_neuron_config": {"enable_bucketing": true}}'
 ADDITIONAL_CONFIG='{"override_neuron_config": {"enable_bucketing": true, "context_encoding_buckets": [256, 512, 1024, 2048]}}'
 ```
 
-### KV Cache Transfer (Distributed)
+### Disaggregated Inference (Prefill / Decode Split)
 
-For distributed setups with KV cache transfer:
+Disaggregated inference splits prefill (KV cache generation) and decode (token generation) into
+separate workers connected via EFA. See [DISAGGREGATED_INFERENCE.md](../DISAGGREGATED_INFERENCE.md)
+for full architecture details and troubleshooting.
 
-```env
-ENABLE_KV_TRANSFER=true
-KV_CONNECTOR=NeuronConnector
-KV_ROLE=kv_producer
-KV_BUFFER_SIZE=2e11
-ETCD=http://etcd-server:2379
+> **Requires**: Two `trn2.48xlarge` instances with EFA enabled at launch, and the
+> `0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04` base image (`NeuronConnector` is absent from newer images).
+
+#### Step 1 — Launch two instances with EFA enabled
+
+Launch two `trn2.48xlarge` instances using the Neuron DLAMI, same VPC/subnet/security group.
+For each instance: **Advanced network configuration → Enable Elastic Fabric Adapter**.
+The security group needs a self-referencing all-traffic inbound rule.
+
+#### Step 2 — On the prefill instance: pull and build
+
+```bash
+cd infrastructure
+docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04
+./build.sh configs/distributed-kv.env
+```
+
+#### Step 3 — On the decode instance: copy repo, pull and build
+
+```bash
+# From the prefill instance
+scp -r ~/strands-neuron ubuntu@<decode-ip>:~/strands-neuron
+
+# On the decode instance
+cd strands-neuron/infrastructure
+docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04
+./build.sh configs/kv-consumer.env
+```
+
+#### Step 4 — On the prefill instance: start etcd
+
+```bash
+export HOST_IP=$(hostname -I | awk '{print $1}')
+
+docker rm -f etcd 2>/dev/null || true
+
+docker run -d --name etcd \
+  -p 8989:8989 \
+  quay.io/coreos/etcd:v3.5.0 \
+  etcd \
+  --listen-client-urls http://0.0.0.0:8989 \
+  --advertise-client-urls http://${HOST_IP}:8989
+```
+
+#### Step 5 — On the prefill instance: start prefill worker (Terminal 1)
+
+```bash
+HF_TOKEN=<your-token> CONTAINER_NAME=vllm-prefill ./run.sh configs/distributed-kv.env
+```
+
+First run compiles the model — expect **20–40 minutes**. Wait for:
+```
+INFO: Application startup complete.
+```
+
+#### Step 6 — On the decode instance: start decode worker (Terminal 1)
+
+```bash
+HF_TOKEN=<your-token> CONTAINER_NAME=vllm-decode ./run.sh configs/kv-consumer.env
 ```
 
+Wait for `Application startup complete.`
+
+#### Step 7 — On the prefill instance: start the proxy server (Terminal 2)
+
+The official `neuron-proxy-server` (bundled in the vLLM image) handles request routing with the correct `request_id` encoding — prefill receives decode's address and decode receives prefill's address, so each worker knows where to push/pull the KV cache via EFA. It discovers workers automatically via etcd.
+
+```bash
+export HOST_IP=$(hostname -I | awk '{print $1}')
+
+docker rm -f proxy 2>/dev/null || true
+
+docker run -d \
+  --name proxy \
+  --network=host \
+  -e ETCD_IP=${HOST_IP} \
+  -e ETCD_PORT=8989 \
+  vllm-server-strands \
+  bash -c "neuron-proxy-server --etcd \$ETCD_IP:\$ETCD_PORT"
+```
+
+The proxy listens on port **8000**.
+
+#### Step 8 — Test
+
+Send requests to the proxy — do **not** call prefill or decode directly:
+
+```bash
+export HOST_IP=$(hostname -I | awk '{print $1}')
+
+curl http://${HOST_IP}:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Llama-3.3-70B-Instruct",
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "max_tokens": 50
+  }'
+```
+
+The proxy:
+1. Discovers prefill and decode workers via etcd
+2. Sends prefill the request with decode's address in the `request_id` (so prefill knows where to push KV cache)
+3. Sends decode the request with prefill's address in the `request_id` (so decode knows where to pull KV cache from)
+4. Streams the response from decode back to the client
+
 ## Tool Calling Configuration
 
 **Parser Selection:**
@@ -274,7 +375,8 @@ Pre-configured profiles in `configs/`:
 | `basic.env` | Basic configuration without tool calling |
 | `tool-calling.env` | Tool calling enabled (recommended for Strands agents) |
 | `high-throughput.env` | Optimized for production workloads |
-| `distributed-kv.env` | Distributed setup with KV cache transfer |
+| `distributed-kv.env` | Disaggregated inference — prefill / KV producer worker |
+| `kv-consumer.env` | Disaggregated inference — decode / KV consumer worker |
 | `speculative-decoding.env` | Speculative decoding for improved latency |
 
 ## Troubleshooting
@@ -289,12 +391,20 @@ The build script includes error handling that pauses on failure. If the build fa
 - **For `CONFIG_FILE` (sourced)**: Use single quotes around JSON values
 - **The start script** automatically strips surrounding quotes if present
 
-### VLLM_USE_V1 assertion error
+### VLLM_USE_V1 error
 
-This vLLM version requires `VLLM_USE_V1=1`. If you see an assertion error about `VLLM_USE_V1`, ensure it's set to `1`.
+- **Speculative decoding** (`speculative-decoding.env`): requires `VLLM_USE_V1=1`
+- **Disaggregated inference** (`distributed-kv.env` / `kv-consumer.env`): requires `VLLM_USE_V1=0` — the 0.9.1 Neuron image does not support the V1 engine on Neuron
 
 ### Speculative decoding errors
 
-- Ensure `SPECULATIVE_CONFIG` is passed as a separate argument (not nested in `ADDITIONAL_CONFIG`)
+- Ensure `SPECULATIVE_CONFIG` is passed as a separate argument (not nested in `OVERRIDE_NEURON_CONFIG`)
 - The neuron config should include `"enable_fused_speculation": true`
 - Use `MAX_NUM_SEQS=1` for speculative decoding
+
+### Disaggregated inference: `No matching NIC found for referred NIC BDF`
+
+The instance was launched with insufficient EFA NICs. Each NeuronDevice requires an adjacent EFA
+NIC (by PCI BDF). A `trn2.48xlarge` needs ~8 EFA NICs for full coverage of all 16 NeuronDevices.
+Use two separate instances (one per worker), each launched with EFA enabled.
+See the [Disaggregated Inference guide](../DISAGGREGATED_INFERENCE.md) for details.
diff --git a/infrastructure/configs/distributed-kv.env b/infrastructure/configs/distributed-kv.env
index 2fb053e..31b0901 100644
--- a/infrastructure/configs/distributed-kv.env
+++ b/infrastructure/configs/distributed-kv.env
@@ -1,29 +1,28 @@
-# Distributed KV Cache Configuration - For multi-instance deployments
-# Good for: Large-scale deployments, load balancing, shared KV cache
-
+# Prefill Worker (KV Producer) — runs on the prefill instance
 MODEL=meta-llama/Llama-3.3-70B-Instruct
 PORT=8080
 MAX_NUM_SEQS=8
-MAX_MODEL_LEN=2048
-TENSOR_PARALLEL_SIZE=64
+MAX_MODEL_LEN=8192
+TENSOR_PARALLEL_SIZE=32
 
 # Tool calling
 ENABLE_TOOL_CALLING=true
 TOOL_CALL_PARSER=llama3_json
 
 # Performance
-ENABLE_PREFIX_CACHING=true
+ENABLE_PREFIX_CACHING=false
 DEVICE=neuron
 OVERRIDE_NEURON_CONFIG='{}'
+VLLM_USE_V1=0
+# Must match PORT so NeuronConnector computes ZMQ port as API_SERVER_PORT+1 (8081)
+API_SERVER_PORT=8080
+NEURON_RT_ASYNC_SENDRECV_BOOTSTRAP_PORT=10000
+NEURON_RT_ASYNC_SENDRECV_EXPERIMENTAL_ENABLED=1
 
-# Speculative decoding for lower latency
-ENABLE_SPECULATIVE=true
-SPECULATIVE_MAX_MODEL_LEN=2048
-
-# KV cache transfer between instances
+# KV cache transfer
 ENABLE_KV_TRANSFER=true
 KV_CONNECTOR=NeuronConnector
 KV_ROLE=kv_producer
 KV_BUFFER_SIZE=2e11
-# Set this to your etcd server address
-ETCD=localhost:2379
+KV_NEURON_CORE_OFFSET=0
+ETCD=172.31.31.191:8989
diff --git a/infrastructure/configs/kv-consumer.env b/infrastructure/configs/kv-consumer.env
new file mode 100644
index 0000000..19e1bb1
--- /dev/null
+++ b/infrastructure/configs/kv-consumer.env
@@ -0,0 +1,29 @@
+# Decode Worker (KV Consumer) — runs on the decode instance
+MODEL=meta-llama/Llama-3.3-70B-Instruct
+PORT=8082
+MAX_NUM_SEQS=8
+MAX_MODEL_LEN=8192
+TENSOR_PARALLEL_SIZE=32
+
+# Tool calling
+ENABLE_TOOL_CALLING=true
+TOOL_CALL_PARSER=llama3_json
+
+# Performance
+ENABLE_PREFIX_CACHING=false
+DEVICE=neuron
+OVERRIDE_NEURON_CONFIG='{}'
+VLLM_USE_V1=0
+# Must match PORT so NeuronConnector computes ZMQ port correctly
+API_SERVER_PORT=8082
+NEURON_RT_ASYNC_SENDRECV_BOOTSTRAP_PORT=10000
+NEURON_RT_ASYNC_SENDRECV_EXPERIMENTAL_ENABLED=1
+
+# KV cache transfer
+ENABLE_KV_TRANSFER=true
+KV_CONNECTOR=NeuronConnector
+KV_ROLE=kv_consumer
+KV_BUFFER_SIZE=2e11
+KV_NEURON_CORE_OFFSET=0
+# etcd runs on the prefill instance
+ETCD=172.31.31.191:8989
diff --git a/infrastructure/proxy.py b/infrastructure/proxy.py
new file mode 100644
index 0000000..ab44f24
--- /dev/null
+++ b/infrastructure/proxy.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+"""
+Disaggregated inference proxy for vLLM NeuronConnector.
+
+Routes each request to both prefill (kv_producer) and decode (kv_consumer)
+simultaneously using a shared request_id that encodes the prefill address.
+The response is returned from the decode worker.
+
+Usage:
+    pip install fastapi uvicorn httpx
+    python proxy.py
+
+    # Or with custom ports:
+    PREFILL_URL=http://localhost:8080 DECODE_URL=http://localhost:8082 python proxy.py
+
+Then point your NeuronModel at http://localhost:8000/v1
+"""
+
+import asyncio
+import json
+import logging
+import os
+import uuid
+from typing import AsyncIterator
+
+import httpx
+from fastapi import FastAPI, Request, Response
+from fastapi.responses import StreamingResponse
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger(__name__)
+
+PREFILL_URL = os.environ.get("PREFILL_URL", "http://localhost:8080")
+DECODE_URL = os.environ.get("DECODE_URL", "http://localhost:8082")
+PROXY_PORT = int(os.environ.get("PROXY_PORT", "8000"))
+
+# Prefill host:port extracted for the request_id (must be the API address, not ZMQ)
+PREFILL_HOST = os.environ.get("PREFILL_HOST", "172.31.31.191")
+PREFILL_API_PORT = os.environ.get("PREFILL_API_PORT", "8080")
+
+app = FastAPI()
+
+
+def make_request_id() -> str:
+    """Build a request_id that encodes the prefill address.
+
+    NeuronConnector parses this on the decode side to find prefill's ZMQ server
+    at prefill_host:(prefill_api_port + 1).
+
+    Format: <uuid>_<prefill_host>:<prefill_api_port>
+    """
+    return f"{uuid.uuid4()}_{PREFILL_HOST}:{PREFILL_API_PORT}"
+
+
+async def _forward(
+    client: httpx.AsyncClient,
+    method: str,
+    url: str,
+    headers: dict,
+    content: bytes,
+    stream: bool,
+) -> httpx.Response:
+    request = client.build_request(method, url, headers=headers, content=content)
+    return await client.send(request, stream=stream)
+
+
+async def _drain(response: httpx.Response) -> None:
+    """Consume and discard a streaming response."""
+    try:
+        async for _ in response.aiter_bytes():
+            pass
+    except Exception:
+        pass
+    finally:
+        await response.aclose()
+
+
+async def _stream_decode(client: httpx.AsyncClient, response: httpx.Response) -> AsyncIterator[bytes]:
+    """Yield chunks from the decode response, then close client."""
+    try:
+        async for chunk in response.aiter_bytes():
+            yield chunk
+    finally:
+        await response.aclose()
+        await client.aclose()
+
+
+@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"])
+async def proxy(request: Request, path: str) -> Response:
+    body = await request.body()
+    method = request.method
+
+    # Pass-through non-completion endpoints (models list, health, etc.)
+    is_completion = path in ("v1/chat/completions", "v1/completions")
+
+    # Build forwarding headers (strip host)
+    forward_headers = {
+        k: v for k, v in request.headers.items()
+        if k.lower() not in ("host", "content-length")
+    }
+
+    if not is_completion or method != "POST":
+        # Forward directly to prefill for non-inference endpoints
+        async with httpx.AsyncClient(timeout=httpx.Timeout(connect=10.0, read=30.0, write=30.0, pool=30.0)) as client:
+            resp = await _forward(
+                client, method,
+                f"{PREFILL_URL}/{path}",
+                forward_headers, body, stream=False,
+            )
+        return Response(
+            content=resp.content,
+            status_code=resp.status_code,
+            headers=dict(resp.headers),
+        )
+
+    # --- Disaggregated inference path ---
+    request_id = make_request_id()
+    forward_headers["x-request-id"] = request_id
+    logger.info("Routing request_id=%s to prefill+decode", request_id)
+
+    prefill_url = f"{PREFILL_URL}/{path}"
+    decode_url = f"{DECODE_URL}/{path}"
+
+    # Build prefill body: max_tokens=1 so prefill only generates the first token,
+    # transfers KV cache to decode, then is immediately free for the next request.
+    # Decode gets the original body with the full max_tokens.
+    try:
+        prefill_body = json.dumps({**json.loads(body), "max_tokens": 1, "stream": False}).encode()
+    except (json.JSONDecodeError, ValueError):
+        prefill_body = body
+
+    # Client is NOT used as a context manager here — it must stay open until
+    # streaming is complete. It is closed inside _stream_decode or explicitly below.
+    client = httpx.AsyncClient(timeout=httpx.Timeout(connect=10.0, read=None, write=None, pool=None))
+
+    prefill_task = asyncio.create_task(
+        _forward(client, "POST", prefill_url, forward_headers, prefill_body, stream=False)
+    )
+    decode_task = asyncio.create_task(
+        _forward(client, "POST", decode_url, forward_headers, body, stream=True)
+    )
+
+    prefill_resp, decode_resp = await asyncio.gather(prefill_task, decode_task)
+
+    # Prefill response is a small non-streaming response (max_tokens=1) — close it directly
+    asyncio.create_task(prefill_resp.aclose())
+
+    content_type = decode_resp.headers.get("content-type", "")
+    wants_stream = (
+        "text/event-stream" in request.headers.get("accept", "")
+        or content_type.startswith("text/event-stream")
+    )
+
+    if wants_stream:
+        return StreamingResponse(
+            _stream_decode(client, decode_resp),
+            status_code=decode_resp.status_code,
+            headers={
+                k: v for k, v in decode_resp.headers.items()
+                if k.lower() not in ("transfer-encoding",)
+            },
+            media_type=content_type or "text/event-stream",
+        )
+    else:
+        try:
+            content = await decode_resp.aread()
+        finally:
+            await decode_resp.aclose()
+            await client.aclose()
+        return Response(
+            content=content,
+            status_code=decode_resp.status_code,
+            headers={
+                k: v for k, v in decode_resp.headers.items()
+                if k.lower() not in ("transfer-encoding",)
+            },
+        )
+
+
+if __name__ == "__main__":
+    import uvicorn
+    logger.info("Disaggregated proxy starting on port %s", PROXY_PORT)
+    logger.info("  Prefill: %s (request_id encodes %s:%s)", PREFILL_URL, PREFILL_HOST, PREFILL_API_PORT)
+    logger.info("  Decode:  %s", DECODE_URL)
+    logger.info("  Point NeuronModel at: http://localhost:%s/v1", PROXY_PORT)
+    uvicorn.run(app, host="0.0.0.0", port=PROXY_PORT)
diff --git a/infrastructure/pull.sh b/infrastructure/pull.sh
index bdb94c4..8bbafd5 100644
--- a/infrastructure/pull.sh
+++ b/infrastructure/pull.sh
@@ -1,2 +1,2 @@
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.11.0-neuronx-py312-sdk2.27.0-ubuntu24.04
+docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04
diff --git a/infrastructure/run.sh b/infrastructure/run.sh
index 8f5af33..e13ae2c 100755
--- a/infrastructure/run.sh
+++ b/infrastructure/run.sh
@@ -7,7 +7,7 @@
 #   CONFIG_FILE=my-config.env ./run.sh    # Set via environment variable
 #   IMAGE_NAME=my-image ./run.sh          # Use custom image name
 #   CONTAINER_NAME=my-container ./run.sh  # Use custom container name
-#   PORT=8081 ./run.sh                    # Override port mapping
+#   PORT=8082 ./run.sh                    # Override port (used for display only with --network=host)
 
 set -e
 
@@ -41,6 +41,13 @@ for i in {0..15}; do
     fi
 done
 
+# Pass EFA/Infiniband devices
+for i in {0..7}; do
+    if [ -e "/dev/infiniband/uverbs${i}" ]; then
+        DEVICE_FLAGS="${DEVICE_FLAGS} --device=/dev/infiniband/uverbs${i}"
+    fi
+done
+
 # Handle config file
 ENV_FILE_FLAG=""
 if [ -n "$CONFIG_FILE" ] && [ -f "$CONFIG_FILE" ]; then
@@ -66,11 +73,12 @@ echo "=================================="
 
 docker run -it \
     -e HF_TOKEN=$HF_TOKEN \
+    -e LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH} \
     ${ENV_FILE_FLAG} \
     ${DEVICE_FLAGS} \
-    --cap-add SYS_ADMIN \
-    --cap-add IPC_LOCK \
-    -p ${PORT}:${PORT} \
+    --privileged \
+    --network=host \
+    --shm-size=10g \
     --name ${CONTAINER_NAME} \
     ${IMAGE_NAME}
 
diff --git a/infrastructure/start-vllm.sh b/infrastructure/start-vllm.sh
index 7f3ff7a..3b3c051 100755
--- a/infrastructure/start-vllm.sh
+++ b/infrastructure/start-vllm.sh
@@ -51,16 +51,16 @@ TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-llama3_json}"
 ENABLE_PREFIX_CACHING="${ENABLE_PREFIX_CACHING:-false}"
 
 # =============================================================================
-# Advanced Configuration (--additional-config)
+# Neuron Config Override (--override-neuron-config)
 # =============================================================================
-# Neuron config overrides (JSON format)
-# Example: '{"override_neuron_config":{"enable_bucketing":false}}'
-ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG:-}"
+# Neuron-specific config overrides (JSON format)
+# Example: '{"enable_bucketing": false}'
+OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG:-}"
 # Strip surrounding quotes if present (handles docker --env-file including literal quotes)
-ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG#\'}"
-ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG%\'}"
-ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG#\"}"
-ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG%\"}"
+OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG#\'}"
+OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG%\'}"
+OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG#\"}"
+OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG%\"}"
 
 # =============================================================================
 # Speculative Decoding (--speculative-config)
@@ -83,6 +83,7 @@ KV_CONNECTOR="${KV_CONNECTOR:-NeuronConnector}"
 KV_ROLE="${KV_ROLE:-kv_producer}"
 KV_BUFFER_SIZE="${KV_BUFFER_SIZE:-2e11}"
 ETCD="${ETCD:-}"  # etcd address for coordination
+KV_NEURON_CORE_OFFSET="${KV_NEURON_CORE_OFFSET:-0}"  # core offset for splitting NeuronCores between workers
 
 # =============================================================================
 # Build Command
@@ -100,13 +101,16 @@ echo "Tool Calling: $ENABLE_TOOL_CALLING"
 if [ "$ENABLE_TOOL_CALLING" = "true" ]; then
     echo "Tool Parser: $TOOL_CALL_PARSER"
 fi
-if [ -n "$ADDITIONAL_CONFIG" ]; then
-    echo "Additional Config: $ADDITIONAL_CONFIG"
+if [ -n "$OVERRIDE_NEURON_CONFIG" ]; then
+    echo "Override Neuron Config: $OVERRIDE_NEURON_CONFIG"
 fi
 if [ -n "$SPECULATIVE_CONFIG" ]; then
     echo "Speculative Config: $SPECULATIVE_CONFIG"
 fi
-echo "VLLM_USE_V1: $VLLM_USE_V1"
+if [ "$ENABLE_KV_TRANSFER" = "true" ]; then
+    echo "KV Transfer: enabled ($KV_ROLE)"
+    echo "ETCD: $ETCD"
+fi
 echo "=================================="
 
 # Build command as array to properly handle JSON arguments
@@ -117,6 +121,7 @@ CMD_ARRAY=(
     "--max-num-seqs" "$MAX_NUM_SEQS"
     "--max-model-len" "$MAX_MODEL_LEN"
     "--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE"
+    "--device" "neuron"
 )
 
 # Add tool calling support
@@ -128,25 +133,24 @@ fi
 # Add prefix caching
 if [ "$ENABLE_PREFIX_CACHING" = "true" ]; then
     CMD_ARRAY+=("--enable-prefix-caching")
-else
-    CMD_ARRAY+=("--no-enable-prefix-caching")
+fi
+
+# Add Neuron config override
+if [ -n "$OVERRIDE_NEURON_CONFIG" ]; then
+    CMD_ARRAY+=("--override-neuron-config" "$OVERRIDE_NEURON_CONFIG")
 fi
 
 # Add KV cache transfer configuration
 if [ "$ENABLE_KV_TRANSFER" = "true" ] && [ -n "$ETCD" ]; then
-    KV_CONFIG="{\"kv_connector\":\"$KV_CONNECTOR\",\"kv_role\":\"$KV_ROLE\",\"kv_buffer_size\":$KV_BUFFER_SIZE,\"etcd\":\"$ETCD\"}"
+    KV_CONFIG="{\"kv_connector\":\"$KV_CONNECTOR\",\"kv_role\":\"$KV_ROLE\",\"kv_buffer_size\":$KV_BUFFER_SIZE,\"etcd\":\"$ETCD\",\"neuron_core_offset\":$KV_NEURON_CORE_OFFSET}"
     CMD_ARRAY+=("--kv-transfer-config" "$KV_CONFIG")
 fi
 
-# Add additional config (neuron overrides)
-if [ -n "$ADDITIONAL_CONFIG" ] && [ "$ADDITIONAL_CONFIG" != "{}" ]; then
-    CMD_ARRAY+=("--additional-config" "$ADDITIONAL_CONFIG")
-fi
-
-# Add speculative decoding config (separate argument)
+# Add speculative decoding config (separate argument, for non-disaggregated use)
 if [ -n "$SPECULATIVE_CONFIG" ] && [ "$SPECULATIVE_CONFIG" != "{}" ]; then
     CMD_ARRAY+=("--speculative-config" "$SPECULATIVE_CONFIG")
 fi
+
 # Execute the command
 echo "Executing: ${CMD_ARRAY[@]}"
 echo "=================================="
diff --git a/src/strands_neuron/neuron.py b/src/strands_neuron/neuron.py
index d229c78..8973f5d 100644
--- a/src/strands_neuron/neuron.py
+++ b/src/strands_neuron/neuron.py
@@ -48,6 +48,9 @@ class NeuronConfig(TypedDict, total=False):
             params: Additional model parameters (e.g., temperature, max_tokens).
             streaming: Whether to use streaming mode (default: True). Set to False to work around
                 vLLM streaming bugs with certain models (e.g., Mistral tool calls).
+            timeout: httpx.Timeout for the OpenAI client. Defaults to no read/write/pool timeout
+                with a 10s connect timeout. Pass httpx.Timeout(None) to disable all timeouts,
+                or httpx.Timeout(60.0) for a 60s timeout on all operations.
         """
 
         model_id: str
@@ -55,6 +58,7 @@ class NeuronConfig(TypedDict, total=False):
         api_key: Optional[str]
         params: Optional[dict[str, Any]]
         streaming: Optional[bool]
+        timeout: Optional[httpx.Timeout]
 
 
     def __init__(self, config: NeuronConfig):
@@ -80,6 +84,7 @@ def __init__(self, config: NeuronConfig):
             "api_key": config.get("api_key", "EMPTY"),
             "params": config.get("params", {}),
             "streaming": config.get("streaming", True),
+            "timeout": config.get("timeout", httpx.Timeout(connect=10.0, read=None, write=None, pool=None)),
         }
         self._check_server_online()
     
@@ -554,6 +559,7 @@ async def _get_client(self) -> AsyncIterator[Any]:
         client = AsyncOpenAI(
             api_key=self.config["api_key"],
             base_url=self.config["base_url"],
+            timeout=self.config["timeout"],
         )
         try:
             yield client