diff --git a/infrastructure/Dockerfile b/infrastructure/Dockerfile index d4f8906..4c03f63 100644 --- a/infrastructure/Dockerfile +++ b/infrastructure/Dockerfile @@ -1,4 +1,4 @@ -FROM public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.11.0-neuronx-py312-sdk2.27.0-ubuntu24.04 +FROM public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04 # Copy startup script and make it executable COPY start-vllm.sh /app/start-vllm.sh @@ -52,6 +52,23 @@ ENV KV_ROLE="kv_producer" ENV KV_BUFFER_SIZE="2e11" ENV ETCD="" +# Install AWS EFA user-space libraries for NeuronConnector KV cache transfer. +# Must run inside the container so libfabric is compiled against the container's +# GLIBC (2.35). The host's libfabric (1.30.0) requires GLIBC_2.38 and is incompatible. +# --skip-kmod: kernel module runs on the host, not needed inside container. +RUN rm -f /etc/apt/sources.list.d/neuron*.list && \ + apt-get update -y && \ + apt-get install -y curl kmod && \ + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz && \ + tar -xf aws-efa-installer-latest.tar.gz && \ + cd aws-efa-installer && \ + ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify && \ + cd .. && rm -rf aws-efa-installer* && \ + rm -rf /var/lib/apt/lists/* + +# Install dependencies for neuron-proxy-server (already present in image but missing deps) +RUN pip install quart aiohttp etcd3 + EXPOSE ${PORT} # Launch vLLM server using the startup script diff --git a/infrastructure/README.md b/infrastructure/README.md index 12da582..f15bca0 100644 --- a/infrastructure/README.md +++ b/infrastructure/README.md @@ -145,14 +145,16 @@ These can be passed at runtime via `--env-file` or `-e` flags: | `ENABLE_TOOL_CALLING` | `true` | Enable tool/function calling support | | `TOOL_CALL_PARSER` | `llama3_json` | Parser: `llama3_json`, `hermes`, `mistral`, etc. | | `ENABLE_PREFIX_CACHING` | `false` | Cache prefixes for repeated prompts | -| `ADDITIONAL_CONFIG` | (empty) | Neuron config overrides (JSON) | +| `OVERRIDE_NEURON_CONFIG` | (empty) | Neuron config overrides (JSON) | | `SPECULATIVE_CONFIG` | (empty) | Speculative decoding config (JSON) | -| `VLLM_USE_V1` | `1` | vLLM V1 engine control | -| `ENABLE_KV_TRANSFER` | `false` | Enable KV cache transfer (distributed) | +| `VLLM_USE_V1` | `0` | vLLM V1 engine; must be `0` for Neuron in the 0.9.1 image | +| `ENABLE_KV_TRANSFER` | `false` | Enable KV cache transfer (disaggregated inference) | | `KV_CONNECTOR` | `NeuronConnector` | KV connector type | | `KV_ROLE` | `kv_producer` | KV role: `kv_producer` or `kv_consumer` | | `KV_BUFFER_SIZE` | `2e11` | KV buffer size in bytes | -| `ETCD` | (empty) | etcd server address for KV coordination | +| `KV_NEURON_CORE_OFFSET` | `0` | Absolute host logical NeuronCore index for KV transfer | +| `ETCD` | (empty) | etcd server address (`:8989`) | +| `NEURON_RT_VISIBLE_CORES` | (empty) | Physical NeuronCore range for this worker (e.g. `0-31`) | ### Configuration Methods @@ -238,18 +240,117 @@ ADDITIONAL_CONFIG='{"override_neuron_config": {"enable_bucketing": true}}' ADDITIONAL_CONFIG='{"override_neuron_config": {"enable_bucketing": true, "context_encoding_buckets": [256, 512, 1024, 2048]}}' ``` -### KV Cache Transfer (Distributed) +### Disaggregated Inference (Prefill / Decode Split) -For distributed setups with KV cache transfer: +Disaggregated inference splits prefill (KV cache generation) and decode (token generation) into +separate workers connected via EFA. See [DISAGGREGATED_INFERENCE.md](../DISAGGREGATED_INFERENCE.md) +for full architecture details and troubleshooting. -```env -ENABLE_KV_TRANSFER=true -KV_CONNECTOR=NeuronConnector -KV_ROLE=kv_producer -KV_BUFFER_SIZE=2e11 -ETCD=http://etcd-server:2379 +> **Requires**: Two `trn2.48xlarge` instances with EFA enabled at launch, and the +> `0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04` base image (`NeuronConnector` is absent from newer images). + +#### Step 1 — Launch two instances with EFA enabled + +Launch two `trn2.48xlarge` instances using the Neuron DLAMI, same VPC/subnet/security group. +For each instance: **Advanced network configuration → Enable Elastic Fabric Adapter**. +The security group needs a self-referencing all-traffic inbound rule. + +#### Step 2 — On the prefill instance: pull and build + +```bash +cd infrastructure +docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04 +./build.sh configs/distributed-kv.env +``` + +#### Step 3 — On the decode instance: copy repo, pull and build + +```bash +# From the prefill instance +scp -r ~/strands-neuron ubuntu@:~/strands-neuron + +# On the decode instance +cd strands-neuron/infrastructure +docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04 +./build.sh configs/kv-consumer.env +``` + +#### Step 4 — On the prefill instance: start etcd + +```bash +export HOST_IP=$(hostname -I | awk '{print $1}') + +docker rm -f etcd 2>/dev/null || true + +docker run -d --name etcd \ + -p 8989:8989 \ + quay.io/coreos/etcd:v3.5.0 \ + etcd \ + --listen-client-urls http://0.0.0.0:8989 \ + --advertise-client-urls http://${HOST_IP}:8989 +``` + +#### Step 5 — On the prefill instance: start prefill worker (Terminal 1) + +```bash +HF_TOKEN= CONTAINER_NAME=vllm-prefill ./run.sh configs/distributed-kv.env +``` + +First run compiles the model — expect **20–40 minutes**. Wait for: +``` +INFO: Application startup complete. +``` + +#### Step 6 — On the decode instance: start decode worker (Terminal 1) + +```bash +HF_TOKEN= CONTAINER_NAME=vllm-decode ./run.sh configs/kv-consumer.env ``` +Wait for `Application startup complete.` + +#### Step 7 — On the prefill instance: start the proxy server (Terminal 2) + +The official `neuron-proxy-server` (bundled in the vLLM image) handles request routing with the correct `request_id` encoding — prefill receives decode's address and decode receives prefill's address, so each worker knows where to push/pull the KV cache via EFA. It discovers workers automatically via etcd. + +```bash +export HOST_IP=$(hostname -I | awk '{print $1}') + +docker rm -f proxy 2>/dev/null || true + +docker run -d \ + --name proxy \ + --network=host \ + -e ETCD_IP=${HOST_IP} \ + -e ETCD_PORT=8989 \ + vllm-server-strands \ + bash -c "neuron-proxy-server --etcd \$ETCD_IP:\$ETCD_PORT" +``` + +The proxy listens on port **8000**. + +#### Step 8 — Test + +Send requests to the proxy — do **not** call prefill or decode directly: + +```bash +export HOST_IP=$(hostname -I | awk '{print $1}') + +curl http://${HOST_IP}:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.3-70B-Instruct", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50 + }' +``` + +The proxy: +1. Discovers prefill and decode workers via etcd +2. Sends prefill the request with decode's address in the `request_id` (so prefill knows where to push KV cache) +3. Sends decode the request with prefill's address in the `request_id` (so decode knows where to pull KV cache from) +4. Streams the response from decode back to the client + ## Tool Calling Configuration **Parser Selection:** @@ -274,7 +375,8 @@ Pre-configured profiles in `configs/`: | `basic.env` | Basic configuration without tool calling | | `tool-calling.env` | Tool calling enabled (recommended for Strands agents) | | `high-throughput.env` | Optimized for production workloads | -| `distributed-kv.env` | Distributed setup with KV cache transfer | +| `distributed-kv.env` | Disaggregated inference — prefill / KV producer worker | +| `kv-consumer.env` | Disaggregated inference — decode / KV consumer worker | | `speculative-decoding.env` | Speculative decoding for improved latency | ## Troubleshooting @@ -289,12 +391,20 @@ The build script includes error handling that pauses on failure. If the build fa - **For `CONFIG_FILE` (sourced)**: Use single quotes around JSON values - **The start script** automatically strips surrounding quotes if present -### VLLM_USE_V1 assertion error +### VLLM_USE_V1 error -This vLLM version requires `VLLM_USE_V1=1`. If you see an assertion error about `VLLM_USE_V1`, ensure it's set to `1`. +- **Speculative decoding** (`speculative-decoding.env`): requires `VLLM_USE_V1=1` +- **Disaggregated inference** (`distributed-kv.env` / `kv-consumer.env`): requires `VLLM_USE_V1=0` — the 0.9.1 Neuron image does not support the V1 engine on Neuron ### Speculative decoding errors -- Ensure `SPECULATIVE_CONFIG` is passed as a separate argument (not nested in `ADDITIONAL_CONFIG`) +- Ensure `SPECULATIVE_CONFIG` is passed as a separate argument (not nested in `OVERRIDE_NEURON_CONFIG`) - The neuron config should include `"enable_fused_speculation": true` - Use `MAX_NUM_SEQS=1` for speculative decoding + +### Disaggregated inference: `No matching NIC found for referred NIC BDF` + +The instance was launched with insufficient EFA NICs. Each NeuronDevice requires an adjacent EFA +NIC (by PCI BDF). A `trn2.48xlarge` needs ~8 EFA NICs for full coverage of all 16 NeuronDevices. +Use two separate instances (one per worker), each launched with EFA enabled. +See the [Disaggregated Inference guide](../DISAGGREGATED_INFERENCE.md) for details. diff --git a/infrastructure/configs/distributed-kv.env b/infrastructure/configs/distributed-kv.env index 2fb053e..31b0901 100644 --- a/infrastructure/configs/distributed-kv.env +++ b/infrastructure/configs/distributed-kv.env @@ -1,29 +1,28 @@ -# Distributed KV Cache Configuration - For multi-instance deployments -# Good for: Large-scale deployments, load balancing, shared KV cache - +# Prefill Worker (KV Producer) — runs on the prefill instance MODEL=meta-llama/Llama-3.3-70B-Instruct PORT=8080 MAX_NUM_SEQS=8 -MAX_MODEL_LEN=2048 -TENSOR_PARALLEL_SIZE=64 +MAX_MODEL_LEN=8192 +TENSOR_PARALLEL_SIZE=32 # Tool calling ENABLE_TOOL_CALLING=true TOOL_CALL_PARSER=llama3_json # Performance -ENABLE_PREFIX_CACHING=true +ENABLE_PREFIX_CACHING=false DEVICE=neuron OVERRIDE_NEURON_CONFIG='{}' +VLLM_USE_V1=0 +# Must match PORT so NeuronConnector computes ZMQ port as API_SERVER_PORT+1 (8081) +API_SERVER_PORT=8080 +NEURON_RT_ASYNC_SENDRECV_BOOTSTRAP_PORT=10000 +NEURON_RT_ASYNC_SENDRECV_EXPERIMENTAL_ENABLED=1 -# Speculative decoding for lower latency -ENABLE_SPECULATIVE=true -SPECULATIVE_MAX_MODEL_LEN=2048 - -# KV cache transfer between instances +# KV cache transfer ENABLE_KV_TRANSFER=true KV_CONNECTOR=NeuronConnector KV_ROLE=kv_producer KV_BUFFER_SIZE=2e11 -# Set this to your etcd server address -ETCD=localhost:2379 +KV_NEURON_CORE_OFFSET=0 +ETCD=172.31.31.191:8989 diff --git a/infrastructure/configs/kv-consumer.env b/infrastructure/configs/kv-consumer.env new file mode 100644 index 0000000..19e1bb1 --- /dev/null +++ b/infrastructure/configs/kv-consumer.env @@ -0,0 +1,29 @@ +# Decode Worker (KV Consumer) — runs on the decode instance +MODEL=meta-llama/Llama-3.3-70B-Instruct +PORT=8082 +MAX_NUM_SEQS=8 +MAX_MODEL_LEN=8192 +TENSOR_PARALLEL_SIZE=32 + +# Tool calling +ENABLE_TOOL_CALLING=true +TOOL_CALL_PARSER=llama3_json + +# Performance +ENABLE_PREFIX_CACHING=false +DEVICE=neuron +OVERRIDE_NEURON_CONFIG='{}' +VLLM_USE_V1=0 +# Must match PORT so NeuronConnector computes ZMQ port correctly +API_SERVER_PORT=8082 +NEURON_RT_ASYNC_SENDRECV_BOOTSTRAP_PORT=10000 +NEURON_RT_ASYNC_SENDRECV_EXPERIMENTAL_ENABLED=1 + +# KV cache transfer +ENABLE_KV_TRANSFER=true +KV_CONNECTOR=NeuronConnector +KV_ROLE=kv_consumer +KV_BUFFER_SIZE=2e11 +KV_NEURON_CORE_OFFSET=0 +# etcd runs on the prefill instance +ETCD=172.31.31.191:8989 diff --git a/infrastructure/proxy.py b/infrastructure/proxy.py new file mode 100644 index 0000000..ab44f24 --- /dev/null +++ b/infrastructure/proxy.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +""" +Disaggregated inference proxy for vLLM NeuronConnector. + +Routes each request to both prefill (kv_producer) and decode (kv_consumer) +simultaneously using a shared request_id that encodes the prefill address. +The response is returned from the decode worker. + +Usage: + pip install fastapi uvicorn httpx + python proxy.py + + # Or with custom ports: + PREFILL_URL=http://localhost:8080 DECODE_URL=http://localhost:8082 python proxy.py + +Then point your NeuronModel at http://localhost:8000/v1 +""" + +import asyncio +import json +import logging +import os +import uuid +from typing import AsyncIterator + +import httpx +from fastapi import FastAPI, Request, Response +from fastapi.responses import StreamingResponse + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + +PREFILL_URL = os.environ.get("PREFILL_URL", "http://localhost:8080") +DECODE_URL = os.environ.get("DECODE_URL", "http://localhost:8082") +PROXY_PORT = int(os.environ.get("PROXY_PORT", "8000")) + +# Prefill host:port extracted for the request_id (must be the API address, not ZMQ) +PREFILL_HOST = os.environ.get("PREFILL_HOST", "172.31.31.191") +PREFILL_API_PORT = os.environ.get("PREFILL_API_PORT", "8080") + +app = FastAPI() + + +def make_request_id() -> str: + """Build a request_id that encodes the prefill address. + + NeuronConnector parses this on the decode side to find prefill's ZMQ server + at prefill_host:(prefill_api_port + 1). + + Format: _: + """ + return f"{uuid.uuid4()}_{PREFILL_HOST}:{PREFILL_API_PORT}" + + +async def _forward( + client: httpx.AsyncClient, + method: str, + url: str, + headers: dict, + content: bytes, + stream: bool, +) -> httpx.Response: + request = client.build_request(method, url, headers=headers, content=content) + return await client.send(request, stream=stream) + + +async def _drain(response: httpx.Response) -> None: + """Consume and discard a streaming response.""" + try: + async for _ in response.aiter_bytes(): + pass + except Exception: + pass + finally: + await response.aclose() + + +async def _stream_decode(client: httpx.AsyncClient, response: httpx.Response) -> AsyncIterator[bytes]: + """Yield chunks from the decode response, then close client.""" + try: + async for chunk in response.aiter_bytes(): + yield chunk + finally: + await response.aclose() + await client.aclose() + + +@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"]) +async def proxy(request: Request, path: str) -> Response: + body = await request.body() + method = request.method + + # Pass-through non-completion endpoints (models list, health, etc.) + is_completion = path in ("v1/chat/completions", "v1/completions") + + # Build forwarding headers (strip host) + forward_headers = { + k: v for k, v in request.headers.items() + if k.lower() not in ("host", "content-length") + } + + if not is_completion or method != "POST": + # Forward directly to prefill for non-inference endpoints + async with httpx.AsyncClient(timeout=httpx.Timeout(connect=10.0, read=30.0, write=30.0, pool=30.0)) as client: + resp = await _forward( + client, method, + f"{PREFILL_URL}/{path}", + forward_headers, body, stream=False, + ) + return Response( + content=resp.content, + status_code=resp.status_code, + headers=dict(resp.headers), + ) + + # --- Disaggregated inference path --- + request_id = make_request_id() + forward_headers["x-request-id"] = request_id + logger.info("Routing request_id=%s to prefill+decode", request_id) + + prefill_url = f"{PREFILL_URL}/{path}" + decode_url = f"{DECODE_URL}/{path}" + + # Build prefill body: max_tokens=1 so prefill only generates the first token, + # transfers KV cache to decode, then is immediately free for the next request. + # Decode gets the original body with the full max_tokens. + try: + prefill_body = json.dumps({**json.loads(body), "max_tokens": 1, "stream": False}).encode() + except (json.JSONDecodeError, ValueError): + prefill_body = body + + # Client is NOT used as a context manager here — it must stay open until + # streaming is complete. It is closed inside _stream_decode or explicitly below. + client = httpx.AsyncClient(timeout=httpx.Timeout(connect=10.0, read=None, write=None, pool=None)) + + prefill_task = asyncio.create_task( + _forward(client, "POST", prefill_url, forward_headers, prefill_body, stream=False) + ) + decode_task = asyncio.create_task( + _forward(client, "POST", decode_url, forward_headers, body, stream=True) + ) + + prefill_resp, decode_resp = await asyncio.gather(prefill_task, decode_task) + + # Prefill response is a small non-streaming response (max_tokens=1) — close it directly + asyncio.create_task(prefill_resp.aclose()) + + content_type = decode_resp.headers.get("content-type", "") + wants_stream = ( + "text/event-stream" in request.headers.get("accept", "") + or content_type.startswith("text/event-stream") + ) + + if wants_stream: + return StreamingResponse( + _stream_decode(client, decode_resp), + status_code=decode_resp.status_code, + headers={ + k: v for k, v in decode_resp.headers.items() + if k.lower() not in ("transfer-encoding",) + }, + media_type=content_type or "text/event-stream", + ) + else: + try: + content = await decode_resp.aread() + finally: + await decode_resp.aclose() + await client.aclose() + return Response( + content=content, + status_code=decode_resp.status_code, + headers={ + k: v for k, v in decode_resp.headers.items() + if k.lower() not in ("transfer-encoding",) + }, + ) + + +if __name__ == "__main__": + import uvicorn + logger.info("Disaggregated proxy starting on port %s", PROXY_PORT) + logger.info(" Prefill: %s (request_id encodes %s:%s)", PREFILL_URL, PREFILL_HOST, PREFILL_API_PORT) + logger.info(" Decode: %s", DECODE_URL) + logger.info(" Point NeuronModel at: http://localhost:%s/v1", PROXY_PORT) + uvicorn.run(app, host="0.0.0.0", port=PROXY_PORT) diff --git a/infrastructure/pull.sh b/infrastructure/pull.sh index bdb94c4..8bbafd5 100644 --- a/infrastructure/pull.sh +++ b/infrastructure/pull.sh @@ -1,2 +1,2 @@ aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws -docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.11.0-neuronx-py312-sdk2.27.0-ubuntu24.04 +docker pull public.ecr.aws/neuron/pytorch-inference-vllm-neuronx:0.9.1-neuronx-py310-sdk2.25.0-ubuntu22.04 diff --git a/infrastructure/run.sh b/infrastructure/run.sh index 8f5af33..e13ae2c 100755 --- a/infrastructure/run.sh +++ b/infrastructure/run.sh @@ -7,7 +7,7 @@ # CONFIG_FILE=my-config.env ./run.sh # Set via environment variable # IMAGE_NAME=my-image ./run.sh # Use custom image name # CONTAINER_NAME=my-container ./run.sh # Use custom container name -# PORT=8081 ./run.sh # Override port mapping +# PORT=8082 ./run.sh # Override port (used for display only with --network=host) set -e @@ -41,6 +41,13 @@ for i in {0..15}; do fi done +# Pass EFA/Infiniband devices +for i in {0..7}; do + if [ -e "/dev/infiniband/uverbs${i}" ]; then + DEVICE_FLAGS="${DEVICE_FLAGS} --device=/dev/infiniband/uverbs${i}" + fi +done + # Handle config file ENV_FILE_FLAG="" if [ -n "$CONFIG_FILE" ] && [ -f "$CONFIG_FILE" ]; then @@ -66,11 +73,12 @@ echo "==================================" docker run -it \ -e HF_TOKEN=$HF_TOKEN \ + -e LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH} \ ${ENV_FILE_FLAG} \ ${DEVICE_FLAGS} \ - --cap-add SYS_ADMIN \ - --cap-add IPC_LOCK \ - -p ${PORT}:${PORT} \ + --privileged \ + --network=host \ + --shm-size=10g \ --name ${CONTAINER_NAME} \ ${IMAGE_NAME} diff --git a/infrastructure/start-vllm.sh b/infrastructure/start-vllm.sh index 7f3ff7a..3b3c051 100755 --- a/infrastructure/start-vllm.sh +++ b/infrastructure/start-vllm.sh @@ -51,16 +51,16 @@ TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-llama3_json}" ENABLE_PREFIX_CACHING="${ENABLE_PREFIX_CACHING:-false}" # ============================================================================= -# Advanced Configuration (--additional-config) +# Neuron Config Override (--override-neuron-config) # ============================================================================= -# Neuron config overrides (JSON format) -# Example: '{"override_neuron_config":{"enable_bucketing":false}}' -ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG:-}" +# Neuron-specific config overrides (JSON format) +# Example: '{"enable_bucketing": false}' +OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG:-}" # Strip surrounding quotes if present (handles docker --env-file including literal quotes) -ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG#\'}" -ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG%\'}" -ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG#\"}" -ADDITIONAL_CONFIG="${ADDITIONAL_CONFIG%\"}" +OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG#\'}" +OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG%\'}" +OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG#\"}" +OVERRIDE_NEURON_CONFIG="${OVERRIDE_NEURON_CONFIG%\"}" # ============================================================================= # Speculative Decoding (--speculative-config) @@ -83,6 +83,7 @@ KV_CONNECTOR="${KV_CONNECTOR:-NeuronConnector}" KV_ROLE="${KV_ROLE:-kv_producer}" KV_BUFFER_SIZE="${KV_BUFFER_SIZE:-2e11}" ETCD="${ETCD:-}" # etcd address for coordination +KV_NEURON_CORE_OFFSET="${KV_NEURON_CORE_OFFSET:-0}" # core offset for splitting NeuronCores between workers # ============================================================================= # Build Command @@ -100,13 +101,16 @@ echo "Tool Calling: $ENABLE_TOOL_CALLING" if [ "$ENABLE_TOOL_CALLING" = "true" ]; then echo "Tool Parser: $TOOL_CALL_PARSER" fi -if [ -n "$ADDITIONAL_CONFIG" ]; then - echo "Additional Config: $ADDITIONAL_CONFIG" +if [ -n "$OVERRIDE_NEURON_CONFIG" ]; then + echo "Override Neuron Config: $OVERRIDE_NEURON_CONFIG" fi if [ -n "$SPECULATIVE_CONFIG" ]; then echo "Speculative Config: $SPECULATIVE_CONFIG" fi -echo "VLLM_USE_V1: $VLLM_USE_V1" +if [ "$ENABLE_KV_TRANSFER" = "true" ]; then + echo "KV Transfer: enabled ($KV_ROLE)" + echo "ETCD: $ETCD" +fi echo "==================================" # Build command as array to properly handle JSON arguments @@ -117,6 +121,7 @@ CMD_ARRAY=( "--max-num-seqs" "$MAX_NUM_SEQS" "--max-model-len" "$MAX_MODEL_LEN" "--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE" + "--device" "neuron" ) # Add tool calling support @@ -128,25 +133,24 @@ fi # Add prefix caching if [ "$ENABLE_PREFIX_CACHING" = "true" ]; then CMD_ARRAY+=("--enable-prefix-caching") -else - CMD_ARRAY+=("--no-enable-prefix-caching") +fi + +# Add Neuron config override +if [ -n "$OVERRIDE_NEURON_CONFIG" ]; then + CMD_ARRAY+=("--override-neuron-config" "$OVERRIDE_NEURON_CONFIG") fi # Add KV cache transfer configuration if [ "$ENABLE_KV_TRANSFER" = "true" ] && [ -n "$ETCD" ]; then - KV_CONFIG="{\"kv_connector\":\"$KV_CONNECTOR\",\"kv_role\":\"$KV_ROLE\",\"kv_buffer_size\":$KV_BUFFER_SIZE,\"etcd\":\"$ETCD\"}" + KV_CONFIG="{\"kv_connector\":\"$KV_CONNECTOR\",\"kv_role\":\"$KV_ROLE\",\"kv_buffer_size\":$KV_BUFFER_SIZE,\"etcd\":\"$ETCD\",\"neuron_core_offset\":$KV_NEURON_CORE_OFFSET}" CMD_ARRAY+=("--kv-transfer-config" "$KV_CONFIG") fi -# Add additional config (neuron overrides) -if [ -n "$ADDITIONAL_CONFIG" ] && [ "$ADDITIONAL_CONFIG" != "{}" ]; then - CMD_ARRAY+=("--additional-config" "$ADDITIONAL_CONFIG") -fi - -# Add speculative decoding config (separate argument) +# Add speculative decoding config (separate argument, for non-disaggregated use) if [ -n "$SPECULATIVE_CONFIG" ] && [ "$SPECULATIVE_CONFIG" != "{}" ]; then CMD_ARRAY+=("--speculative-config" "$SPECULATIVE_CONFIG") fi + # Execute the command echo "Executing: ${CMD_ARRAY[@]}" echo "==================================" diff --git a/src/strands_neuron/neuron.py b/src/strands_neuron/neuron.py index d229c78..8973f5d 100644 --- a/src/strands_neuron/neuron.py +++ b/src/strands_neuron/neuron.py @@ -48,6 +48,9 @@ class NeuronConfig(TypedDict, total=False): params: Additional model parameters (e.g., temperature, max_tokens). streaming: Whether to use streaming mode (default: True). Set to False to work around vLLM streaming bugs with certain models (e.g., Mistral tool calls). + timeout: httpx.Timeout for the OpenAI client. Defaults to no read/write/pool timeout + with a 10s connect timeout. Pass httpx.Timeout(None) to disable all timeouts, + or httpx.Timeout(60.0) for a 60s timeout on all operations. """ model_id: str @@ -55,6 +58,7 @@ class NeuronConfig(TypedDict, total=False): api_key: Optional[str] params: Optional[dict[str, Any]] streaming: Optional[bool] + timeout: Optional[httpx.Timeout] def __init__(self, config: NeuronConfig): @@ -80,6 +84,7 @@ def __init__(self, config: NeuronConfig): "api_key": config.get("api_key", "EMPTY"), "params": config.get("params", {}), "streaming": config.get("streaming", True), + "timeout": config.get("timeout", httpx.Timeout(connect=10.0, read=None, write=None, pool=None)), } self._check_server_online() @@ -554,6 +559,7 @@ async def _get_client(self) -> AsyncIterator[Any]: client = AsyncOpenAI( api_key=self.config["api_key"], base_url=self.config["base_url"], + timeout=self.config["timeout"], ) try: yield client