From bf8017fbcff0c173c69eebb69149a0d1001da56f Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Fri, 25 Apr 2025 08:13:20 +0300 Subject: [PATCH 1/5] feat: add scripts for kubernetes dev env using vLLM and vLLM-p2p (setup for kvcache-aware) Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 23 ++- Makefile | 7 +- .../deployments/redis-deployment.yaml | 27 ++++ .../vllm-p2p/deployments/secret.yaml | 11 ++ .../vllm-p2p/deployments/vllm-deployment.yaml | 61 ++++++++ deploy/components/vllm-p2p/kustomization.yaml | 30 ++++ deploy/components/vllm-p2p/pvc/volume.yaml | 18 +++ .../vllm-p2p/service/redis-service.yaml | 18 +++ deploy/components/vllm/deployments.yaml | 143 ++++++++++++++++++ deploy/components/vllm/kustomization.yaml | 30 ++++ deploy/components/vllm/secret.yaml | 11 ++ .../kubernetes-kgateway/kustomization.yaml | 6 +- .../patch-deployments.yaml | 10 -- .../vllm-p2p/kustomization.yaml | 11 ++ .../vllm-sim/kustomization.yaml | 14 ++ .../vllm-sim/patch-deployments.yaml | 10 ++ .../kubernetes-vllm/vllm/kustomization.yaml | 11 ++ .../vllm/patch-deployments.yaml | 9 ++ scripts/kubernetes-dev-env.sh | 120 ++++++++++++--- 19 files changed, 525 insertions(+), 45 deletions(-) create mode 100644 deploy/components/vllm-p2p/deployments/redis-deployment.yaml create mode 100644 deploy/components/vllm-p2p/deployments/secret.yaml create mode 100644 deploy/components/vllm-p2p/deployments/vllm-deployment.yaml create mode 100644 deploy/components/vllm-p2p/kustomization.yaml create mode 100644 deploy/components/vllm-p2p/pvc/volume.yaml create mode 100644 deploy/components/vllm-p2p/service/redis-service.yaml create mode 100644 deploy/components/vllm/deployments.yaml create mode 100644 deploy/components/vllm/kustomization.yaml create mode 100644 deploy/components/vllm/secret.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 8b6cf443..f7a5af69 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -178,14 +178,31 @@ Export the name of the `Secret` to the environment: export REGISTRY_SECRET=anna-pull-secret ``` -Now you need to provide several other environment variables. You'll need to -indicate the location and tag of the `vllm-sim` image: +Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: + +- `vllm-sim`: Lightweight simulator for simple environments +- `vllm`: Full vLLM model server for real inference +- `vllm-p2p`: Full vLLM with LMCache P2P support for distributed KV caching + +```console +export VLLM_MODE=vllm-sim # or vllm / vllm-p2p +``` +Each mode has default image values, but you can override them: + +For vllm-sim: ```console export VLLM_SIM_IMAGE="/" export VLLM_SIM_TAG="" ``` +For vllm and vllm-p2p: + +```console +export VLLM_IMAGE="/" +export VLLM_TAG="" +``` + The same thing will need to be done for the EPP: ```console @@ -203,7 +220,7 @@ This will deploy the entire stack to whatever namespace you chose. 
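+For example, a complete `vllm-sim` setup might look like this (the namespace is
+illustrative; the image, tag, and EPP values shown are the script defaults):
+
+```console
+export NAMESPACE=my-dev-env
+export REGISTRY_SECRET=anna-pull-secret
+export VLLM_MODE=vllm-sim
+export VLLM_SIM_IMAGE="quay.io/vllm-d/vllm-sim"
+export VLLM_SIM_TAG="0.0.2"
+export EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp"
+export EPP_TAG="main"
+make environment.dev.kubernetes
+```
+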
You can test by exposing the inference `Gateway` via port-forward: ```console -kubectl -n ${NAMESPACE} port-forward service/inference-gateway-istio 8080:80 +kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80 ``` And making requests with `curl`: diff --git a/Makefile b/Makefile index 641d6cf6..cfaa72cb 100644 --- a/Makefile +++ b/Makefile @@ -784,11 +784,8 @@ environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst # ------------------------------------------------------------------------------ .PHONY: clean.environment.dev.kubernetes clean.environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst -ifndef NAMESPACE - $(error "Error: NAMESPACE is required but not set") -endif - @echo "INFO: cleaning up dev environment in $(NAMESPACE)" - kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete -f - + @CLEAN=true ./scripts/kubernetes-dev-env.sh 2>&1 + @echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)" # ----------------------------------------------------------------------------- # TODO: these are old aliases that we still need for the moment, but will be diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml new file mode 100644 index 00000000..16b61d1f --- /dev/null +++ b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${REDIS_NAME} + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server +spec: + replicas: ${REDIS_REPLICA_COUNT} + selector: + matchLabels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server + template: + metadata: + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server + spec: + containers: + - name: lookup-server + image: ${REDIS_IMAGE}:${REDIS_TAG} + imagePullPolicy: Always + command: + - redis-server + ports: + - containerPort: ${REDIS_TARGET_PORT} diff --git a/deploy/components/vllm-p2p/deployments/secret.yaml b/deploy/components/vllm-p2p/deployments/secret.yaml new file mode 100644 index 00000000..1f5a2bcc --- /dev/null +++ b/deploy/components/vllm-p2p/deployments/secret.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ${HF_SECRET_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: secret +type: Opaque +data: + ${HF_SECRET_KEY}: ${HF_TOKEN} diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml new file mode 100644 index 00000000..b825f358 --- /dev/null +++ b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/model: ${MODEL_LABEL} + app.kubernetes.io/component: vllm +spec: + replicas: ${VLLM_REPLICA_COUNT} + selector: + matchLabels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: vllm + app.kubernetes.io/model: ${MODEL_LABEL} + template: + metadata: + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: vllm + app.kubernetes.io/model: ${MODEL_LABEL} + spec: + containers: + - name: vllm + image: ${VLLM_IMAGE}:${VLLM_TAG} + imagePullPolicy: Always + command: + - /bin/sh + - "-c" + args: + - | + export 
LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 && + vllm serve ${MODEL_NAME} + --host 0.0.0.0 + --port 8000 + --enable-chunked-prefill false + --max-model-len ${MAX_MODEL_LEN} + --kv-transfer-config + '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + ports: + - name: http + containerPort: 8000 + - name: lmcache-dist + containerPort: 80 + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: ${HF_SECRET_NAME} + key: ${HF_SECRET_KEY} + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + volumeMounts: + - name: model-storage + mountPath: ${VOLUME_MOUNT_PATH} + volumes: + - name: model-storage + persistentVolumeClaim: + claimName: ${PVC_NAME} diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml new file mode 100644 index 00000000..3f7d6014 --- /dev/null +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -0,0 +1,30 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: ${NAMESPACE} + +resources: + - deployments/vllm-deployment.yaml + - deployments/redis-deployment.yaml + - service/redis-service.yaml + - pvc/volume.yaml + - deployments/secret.yaml + +images: + - name: vllm/vllm-openai + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} + - name: redis + newName: ${REDIS_IMAGE} + newTag: ${REDIS_TAG} + +configMapGenerator: + - name: model-config + literals: + - MODEL_NAME=${MODEL_NAME} + - MODEL_LABEL=${MODEL_LABEL} + - POOL_LABEL=${POOL_LABEL} + - REDIS_ENABLED=${REDIS_ENABLED} + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/deploy/components/vllm-p2p/pvc/volume.yaml b/deploy/components/vllm-p2p/pvc/volume.yaml new file mode 100644 index 00000000..6a0a042c --- /dev/null +++ b/deploy/components/vllm-p2p/pvc/volume.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${PVC_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: storage + app.kubernetes.io/model: ${MODEL_LABEL} + finalizers: + - kubernetes.io/pvc-protection +spec: + accessModes: + - ${PVC_ACCESS_MODE} + resources: + requests: + storage: ${PVC_SIZE} + storageClassName: ${PVC_STORAGE_CLASS} diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/service/redis-service.yaml new file mode 100644 index 00000000..947f87ac --- /dev/null +++ b/deploy/components/vllm-p2p/service/redis-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: ${REDIS_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server +spec: + ports: + - name: lookupserver-port + protocol: TCP + port: ${REDIS_PORT} + targetPort: ${REDIS_TARGET_PORT} + type: ${REDIS_SERVICE_TYPE} + selector: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server diff --git a/deploy/components/vllm/deployments.yaml b/deploy/components/vllm/deployments.yaml new file mode 100644 index 00000000..19a398b1 --- /dev/null +++ b/deploy/components/vllm/deployments.yaml @@ -0,0 +1,143 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} +spec: + replicas: 3 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + securityContext: + runAsUser: ${PROXY_UID} + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: vllm + image: "vllm/vllm-openai:latest" + imagePullPolicy: IfNotPresent + 
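+        # Runs vLLM's OpenAI-compatible API server directly; the args below
+        # select the model, tensor parallelism, and port, and size the LoRA
+        # adapter cache consumed by the lora-adapter-syncer container.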
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args: + - "--model" + - "meta-llama/Llama-3.1-8B-Instruct" + - "--tensor-parallel-size" + - "1" + - "--port" + - "8000" + - "--max-num-seq" + - "1024" + - "--compilation-config" + - "3" + - "--enable-lora" + - "--max-loras" + - "2" + - "--max-lora-rank" + - "8" + - "--max-cpu-loras" + - "12" + env: + - name: VLLM_USE_V1 + value: "1" + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" + - name: XDG_CACHE_HOME + value: /cache + - name: HF_HOME + value: /cache/huggingface + - name: FLASHINFER_CACHE_DIR + value: /cache/flashinfer + ports: + - containerPort: 8000 + name: http + protocol: TCP + lifecycle: + preStop: + sleep: + seconds: 30 + livenessProbe: + httpGet: + path: /health + port: http + scheme: HTTP + periodSeconds: 1 + successThreshold: 1 + failureThreshold: 5 + timeoutSeconds: 1 + readinessProbe: + httpGet: + path: /health + port: http + scheme: HTTP + periodSeconds: 1 + successThreshold: 1 + failureThreshold: 1 + timeoutSeconds: 1 + startupProbe: + httpGet: + path: /health + port: http + scheme: HTTP + failureThreshold: 600 + initialDelaySeconds: 2 + periodSeconds: 1 + resources: + limits: + nvidia.com/gpu: 1 + requests: + nvidia.com/gpu: 1 + volumeMounts: + - mountPath: /cache + name: hf-cache + - mountPath: /dev/shm + name: shm + - mountPath: /adapters + name: adapters + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + initContainers: + - name: lora-adapter-syncer + tty: true + stdin: true + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main + restartPolicy: Always + imagePullPolicy: Always + env: + - name: DYNAMIC_LORA_ROLLOUT_CONFIG + value: "/config/configmap.yaml" + volumeMounts: + - name: config-volume + mountPath: /config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + restartPolicy: Always + enableServiceLinks: false + terminationGracePeriodSeconds: 130 + volumes: + - name: hf-cache + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: vllm-llama3-8b-instruct-adapters diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml new file mode 100644 index 00000000..ee4a4e7d --- /dev/null +++ b/deploy/components/vllm/kustomization.yaml @@ -0,0 +1,30 @@ +# ------------------------------------------------------------------------------ +# vLLM Deployment +# +# This deploys the full vLLM model server, capable of serving real models such +# as Llama 3.1-8B-Instruct via the OpenAI-compatible API. It is intended for +# environments with GPU resources and where full inference capabilities are +# required. +# +# The deployment can be customized using environment variables to set: +# - The container image and tag (VLLM_IMAGE, VLLM_TAG) +# - The model to load (MODEL_NAME) +# +# This setup is suitable for testing and production with Kubernetes (including +# GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`). 
+# ----------------------------------------------------------------------------- +kind: Kustomization + +resources: +- deployments.yaml +- secret.yaml + +images: +- name: vllm/vllm-openai + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} + +configMapGenerator: +- name: vllm-model-config + literals: + - MODEL_NAME=${MODEL_NAME} \ No newline at end of file diff --git a/deploy/components/vllm/secret.yaml b/deploy/components/vllm/secret.yaml new file mode 100644 index 00000000..1f5a2bcc --- /dev/null +++ b/deploy/components/vllm/secret.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ${HF_SECRET_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: secret +type: Opaque +data: + ${HF_SECRET_KEY}: ${HF_TOKEN} diff --git a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml index 0b7e1ed8..7dc295de 100644 --- a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml @@ -4,18 +4,14 @@ kind: Kustomization namespace: ${NAMESPACE} resources: -- ../../../components/vllm-sim/ - ../../../components/inference-gateway/ - gateway-parameters.yaml images: -- name: quay.io/vllm-d/vllm-sim - newName: ${VLLM_SIM_IMAGE} - newTag: ${VLLM_SIM_TAG} - name: quay.io/vllm-d/gateway-api-inference-extension/epp newName: ${EPP_IMAGE} newTag: ${EPP_TAG} patches: - path: patch-deployments.yaml -- path: patch-gateways.yaml +- path: patch-gateways.yaml \ No newline at end of file diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index 20a17d53..0e4ad46e 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -22,13 +22,3 @@ spec: - "9002" - -grpcHealthPort - "9003" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm-sim -spec: - template: - spec: - imagePullSecrets: - - name: ${REGISTRY_SECRET} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml new file mode 100644 index 00000000..a81c387e --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../../../components/vllm-p2p/ + +images: +- name: quay.io/vllm-d/vllm-d-dev:0.0.2 + newName: ${VLLM_P2P_IMAGE} + newTag: ${VLLM_P2P_TAG} + diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml new file mode 100644 index 00000000..921b9ef5 --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../../../components/vllm-sim/ + +images: +- name: quay.io/vllm-d/vllm-sim + newName: ${VLLM_SIM_IMAGE} + newTag: ${VLLM_SIM_TAG} + +patches: + - path: patch-deployments.yaml + diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml new file mode 100644 index 00000000..d86d712c --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml @@ -0,0 +1,10 @@ + +apiVersion: apps/v1 
+kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} +spec: + template: + spec: + imagePullSecrets: + - name: ${REGISTRY_SECRET} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml new file mode 100644 index 00000000..81fa76ba --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../../../components/vllm/ + +images: +- name: quay.io/vllm-d/vllm-d-dev:0.0.2 + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} + diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml new file mode 100644 index 00000000..efaa2211 --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm +spec: + template: + spec: + imagePullSecrets: + - name: ${REGISTRY_SECRET} diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 28b84409..98c87492 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -12,18 +12,77 @@ set -eux # ------------------------------------------------------------------------------ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Set a default VLLM_SIM_IMAGE if not provided -: "${VLLM_SIM_IMAGE:=quay.io/vllm-d/vllm-sim}" - -# Set a default VLLM_SIM_TAG if not provided -: "${VLLM_SIM_TAG:=0.0.2}" - -# Set a default EPP_IMAGE if not provided -: "${EPP_IMAGE:=us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" - -# Set a default EPP_TAG if not provided -: "${EPP_TAG:=main}" +export CLEAN="${CLEAN:-false}" + +# Validate required inputs +if [[ -z "${NAMESPACE:-}" ]]; then + echo "ERROR: NAMESPACE environment variable is not set." + exit 1 +fi +if [[ -z "${VLLM_MODE:-}" ]]; then + echo "ERROR: VLLM_MODE is not set. 
Please export one of: vllm-sim, vllm, vllm-p2p" + exit 1 +fi + +# vLLM Specific Configuration node + +case "${VLLM_MODE}" in + vllm-sim) + export VLLM_SIM_IMAGE="${VLLM_SIM_IMAGE:-quay.io/vllm-d/vllm-sim}" + export VLLM_SIM_TAG="${VLLM_SIM_TAG:-0.0.2}" + export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" + export EPP_TAG="${EPP_TAG:-main}" + ;; + vllm | vllm-p2p) + # Shared across both full model modes - // TODO - make more env variables similar + # TODO: Consider unifying more environment variables for consistency and reuse + export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" + export HF_TOKEN=$(echo -n "${HF_TOKEN:-}" | base64 | tr -d '\n') + export VOLUME_MOUNT_PATH="${VOLUME_MOUNT_PATH:-/data}" + + if [[ "$VLLM_MODE" == "vllm" ]]; then + export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/vllm-d/vllm-d-dev}" + export VLLM_TAG="${VLLM_TAG:-0.0.2}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-llama3-8b-instruct}" + export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" + export EPP_TAG="${EPP_TAG:-main}" + export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" + export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" + export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" + export HF_TOKEN="${HF_TOKEN:-}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-2}" + export MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" + export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" + + elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then + export VLLM_IMAGE="${VLLM_IMAGE:-vllm/vllm-openai}" + export VLLM_TAG="${VLLM_TAG:-latest}" + export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" + export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" + export HF_SECRET_KEY="${HF_SECRET_KEY:-${MODEL_LABEL}}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-4}" + export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" + export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}" + export PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}" + export PVC_SIZE="${PVC_SIZE:-10Gi}" + export PVC_STORAGE_CLASS="${PVC_STORAGE_CLASS:-standard}" + export REDIS_NAME="${REDIS_NAME:-vllm-redis}" + export REDIS_IMAGE="${REDIS_IMAGE:-redis}" + export REDIS_TAG="${REDIS_TAG:-7.2.3}" + export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" + export REDIS_PORT="${REDIS_PORT:-6379}" + export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" + export REDIS_SERVICE_TYPE="${REDIS_SERVICE_TYPE:-ClusterIP}" + export POOL_LABEL="${POOL_LABEL:-vllm-llama3-8b-instruct}" + export REDIS_ENABLED="${REDIS_ENABLED:-true}" + fi + ;; + *) + echo "ERROR: Unsupported VLLM_MODE: ${VLLM_MODE}. 
Must be one of: vllm-sim, vllm, vllm-p2p"
+    exit 1
+    ;;
+esac
 
 # ------------------------------------------------------------------------------
 # Deployment
 # ------------------------------------------------------------------------------
@@ -32,18 +91,35 @@
 kubectl create namespace ${NAMESPACE} 2>/dev/null || true
 
 # Hack to deal with KGateways broken OpenShift support
-export PROXY_UID=$(kubectl get namespace ${NAMESPACE} -o json | jq -e -r '.metadata.annotations["openshift.io/sa.scc.uid-range"]' | perl -F'/' -lane 'print $F[0]+1');
+export PROXY_UID=$(kubectl get namespace ${NAMESPACE} -o json | jq -e -r '.metadata.annotations["openshift.io/sa.scc.uid-range"]' | perl -F'/' -lane 'print $F[0]+1');
+set -o pipefail
 
-echo "INFO: Deploying Development Environment in namespace ${NAMESPACE}"
-
-kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n ${NAMESPACE} apply -f -
-
-echo "INFO: Waiting for resources in namespace ${NAMESPACE} to become ready"
+if [[ "$CLEAN" == "true" ]]; then
+  echo "INFO: Cleaning environment in namespace ${NAMESPACE} for mode ${VLLM_MODE}"
+  kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f -
+  kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f -
+else
+  echo "INFO: Deploying Development Environment in namespace ${NAMESPACE}"
+  kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" apply -f -
+  kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" apply -f -
+  echo "INFO: Waiting for resources in namespace ${NAMESPACE} to become ready"
+  kubectl -n "${NAMESPACE}" wait deployment/endpoint-picker --for=condition=Available --timeout=60s
+  kubectl -n "${NAMESPACE}" wait gateway/inference-gateway --for=condition=Programmed --timeout=60s
+  kubectl -n "${NAMESPACE}" wait deployment/inference-gateway --for=condition=Available --timeout=60s
+  # Mode-specific wait
+  case "${VLLM_MODE}" in
+    vllm-sim)
+      kubectl -n "${NAMESPACE}" wait deployment/vllm-sim --for=condition=Available --timeout=60s
+      ;;
+    vllm)
+      kubectl -n "${NAMESPACE}" wait deployment/vllm-llama3-8b-instruct --for=condition=Available --timeout=180s
+      ;;
+    vllm-p2p)
+      kubectl -n "${NAMESPACE}" wait deployment/vllm-mistral7b --for=condition=Available --timeout=180s
+      kubectl -n "${NAMESPACE}" wait deployment/${REDIS_NAME} --for=condition=Available --timeout=60s
+      ;;
+  esac
+fi
 
-kubectl -n ${NAMESPACE} wait deployment/endpoint-picker --for=condition=Available --timeout=60s
-kubectl -n ${NAMESPACE} wait deployment/vllm-sim --for=condition=Available --timeout=60s
-kubectl -n ${NAMESPACE} wait gateway/inference-gateway --for=condition=Programmed --timeout=60s
-kubectl -n ${NAMESPACE} wait deployment/inference-gateway --for=condition=Available --timeout=60s

From 78157d52f1a7a04e198ccc535b681a134225fc5d Mon Sep 17 00:00:00 2001
From: Kfir Toledo 
Date: Fri, 25 Apr 2025 15:02:51 +0300
Subject: [PATCH 2/5] [fix]: Small fixes for development YAMLs

Signed-off-by: Kfir Toledo 
---
 DEVELOPMENT.md                                | 42 ++++---
 .../inference-gateway/deployments.yaml        |  8 ++
 .../inference-gateway/inference-models.yaml   | 32 +++++-
 .../inference-gateway/inference-pools.yaml    |  4 +-
 .../deployments/redis-deployment.yaml         | 34 +++++-
 .../vllm-p2p/deployments/vllm-deployment.yaml | 104 ++++++++++++++----
 deploy/components/vllm-p2p/kustomization.yaml | 12 --
deploy/components/vllm-p2p/pvc/volume.yaml | 18 --- .../vllm-p2p/service/redis-service.yaml | 3 +- deploy/components/vllm/configmap.yaml | 14 +++ deploy/components/vllm/deployments.yaml | 6 +- deploy/components/vllm/kustomization.yaml | 2 + .../gateway-parameters.yaml | 6 +- .../vllm-p2p/kustomization.yaml | 4 +- .../vllm-sim/kustomization.yaml | 1 - .../vllm-sim/patch-deployments.yaml | 2 +- scripts/kubernetes-dev-env.sh | 39 ++++--- 17 files changed, 235 insertions(+), 96 deletions(-) delete mode 100644 deploy/components/vllm-p2p/pvc/volume.yaml create mode 100644 deploy/components/vllm/configmap.yaml diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index f7a5af69..9d555a5c 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -178,11 +178,18 @@ Export the name of the `Secret` to the environment: export REGISTRY_SECRET=anna-pull-secret ``` +You can optionally set a custom EPP image (otherwise, the default will be used): + +```console +export EPP_IMAGE="/" +export EPP_TAG="" +``` + Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: - `vllm-sim`: Lightweight simulator for simple environments - `vllm`: Full vLLM model server for real inference -- `vllm-p2p`: Full vLLM with LMCache P2P support for distributed KV caching +- `vllm-p2p`: Full vLLM with LMCache P2P support for enable KV-Cache aware routing ```console export VLLM_MODE=vllm-sim # or vllm / vllm-p2p @@ -197,18 +204,14 @@ export VLLM_SIM_TAG="" ``` For vllm and vllm-p2p: - +- set Vllm image: ```console export VLLM_IMAGE="/" export VLLM_TAG="" ``` - -The same thing will need to be done for the EPP: - -```console -export EPP_IMAGE="/" -export EPP_TAG="" -``` +- Set hugging face token variable: + export HF_TOKEN="" +**Warning**: For vllm mode, the default image uses llama3-8b and vllm-mistral. Make sure you have permission to access these files in their respective repositories. Once all this is set up, you can deploy the environment: @@ -224,12 +227,25 @@ kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80 ``` And making requests with `curl`: +- vllm-sim -```console -curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ - -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq -``` + ```console + curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq + ``` + +- vllm + + ```console + curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq + ``` +- vllm-p2p + ```console + curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq + ``` #### Development Cycle > **WARNING**: This is a very manual process at the moment. 
We expect to make diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml index 0fc19d4d..f523f812 100644 --- a/deploy/components/inference-gateway/deployments.yaml +++ b/deploy/components/inference-gateway/deployments.yaml @@ -48,3 +48,11 @@ spec: service: inference-extension initialDelaySeconds: 5 periodSeconds: 10 + env: + - name: KVCACHE_INDEXER_REDIS_ADDR + value: ${REDIS_HOST}:${REDIS_PORT} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: ${HF_SECRET_NAME} + key: ${HF_SECRET_KEY} \ No newline at end of file diff --git a/deploy/components/inference-gateway/inference-models.yaml b/deploy/components/inference-gateway/inference-models.yaml index 12a51394..330f19a9 100644 --- a/deploy/components/inference-gateway/inference-models.yaml +++ b/deploy/components/inference-gateway/inference-models.yaml @@ -6,7 +6,37 @@ spec: modelName: food-review criticality: Critical poolRef: - name: vllm-llama3-8b-instruct + name: ${POOL_NAME} targetModels: - name: food-review weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: base-model +spec: + modelName: meta-llama/Llama-3.1-8B-Instruct + criticality: Critical + poolRef: + name: ${POOL_NAME} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: base-model-cpu +spec: + modelName: Qwen/Qwen2.5-1.5B-Instruct + criticality: Critical + poolRef: + name: ${POOL_NAME} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: mistarli +spec: + modelName: mistralai/Mistral-7B-Instruct-v0.2 + criticality: Critical + poolRef: + name: ${POOL_NAME} \ No newline at end of file diff --git a/deploy/components/inference-gateway/inference-pools.yaml b/deploy/components/inference-gateway/inference-pools.yaml index ece6e500..3a981a14 100644 --- a/deploy/components/inference-gateway/inference-pools.yaml +++ b/deploy/components/inference-gateway/inference-pools.yaml @@ -1,10 +1,10 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: - name: vllm-llama3-8b-instruct + name: ${POOL_NAME} spec: targetPortNumber: 8000 selector: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} extensionRef: name: endpoint-picker diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml index 16b61d1f..f4b5938e 100644 --- a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml +++ b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: ${REDIS_NAME} + name: ${REDIS_SVC_NAME} labels: app.kubernetes.io/name: redis app.kubernetes.io/component: redis-lookup-server @@ -20,8 +20,36 @@ spec: containers: - name: lookup-server image: ${REDIS_IMAGE}:${REDIS_TAG} - imagePullPolicy: Always + imagePullPolicy: IfNotPresent command: - redis-server ports: - - containerPort: ${REDIS_TARGET_PORT} + - name: redis-port + containerPort: ${REDIS_TARGET_PORT} + protocol: TCP + resources: + limits: + cpu: "4" + memory: 10G + requests: + cpu: "4" + memory: 8G + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + restartPolicy: Always + terminationGracePeriodSeconds: 30 + dnsPolicy: ClusterFirst + securityContext: {} + schedulerName: default-scheduler + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 25% + maxSurge: 25% + revisionHistoryLimit: 10 + progressDeadlineSeconds: 
600 + # securityContext: + # allowPrivilegeEscalation: false + # capabilities: + # drop: + # - ALL \ No newline at end of file diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml index b825f358..1c296eff 100644 --- a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml +++ b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml @@ -13,49 +13,111 @@ spec: app.kubernetes.io/name: vllm app.kubernetes.io/component: vllm app.kubernetes.io/model: ${MODEL_LABEL} + app: ${POOL_NAME} template: metadata: labels: app.kubernetes.io/name: vllm app.kubernetes.io/component: vllm app.kubernetes.io/model: ${MODEL_LABEL} + app: ${POOL_NAME} spec: + # securityContext: + # runAsUser: ${PROXY_UID} + # runAsNonRoot: true + # seccompProfile: + # type: RuntimeDefault containers: - name: vllm image: ${VLLM_IMAGE}:${VLLM_TAG} - imagePullPolicy: Always + imagePullPolicy: IfNotPresent command: - /bin/sh - "-c" args: - | - export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 && - vllm serve ${MODEL_NAME} - --host 0.0.0.0 - --port 8000 - --enable-chunked-prefill false - --max-model-len ${MAX_MODEL_LEN} - --kv-transfer-config - '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \ + vllm serve ${MODEL_NAME} \ + --host 0.0.0.0 \ + --port 8000 \ + --enable-chunked-prefill false \ + --max-model-len ${MAX_MODEL_LEN} \ + --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' ports: - name: http containerPort: 8000 - - name: lmcache-dist + protocol: TCP + - name: lmcache-dist # Assuming port 80 is used for LMCACHE_DISTRIBUTED_URL containerPort: 80 + protocol: TCP + livenessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + startupProbe: + failureThreshold: 60 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 env: + - name: HF_HOME + value: /data + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP - name: HF_TOKEN valueFrom: secretKeyRef: name: ${HF_SECRET_NAME} key: ${HF_SECRET_KEY} - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - volumeMounts: - - name: model-storage - mountPath: ${VOLUME_MOUNT_PATH} - volumes: - - name: model-storage - persistentVolumeClaim: - claimName: ${PVC_NAME} + - name: LMCACHE_LOOKUP_URL + value: ${REDIS_HOST}:${REDIS_PORT} + - name: LMCACHE_ENABLE_DEBUG + value: "True" + - name: LMCACHE_ENABLE_P2P + value: "True" + - name: LMCACHE_LOCAL_CPU + value: "True" + - name: LMCACHE_MAX_LOCAL_CPU_SIZE + value: "20" + - name: LMCACHE_USE_EXPERIMENTAL + value: "True" + - name: VLLM_RPC_TIMEOUT + value: "1000000" + resources: + limits: + nvidia.com/gpu: "1" + requests: + cpu: "10" + memory: 40Gi + nvidia.com/gpu: "1" + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + securityContext: + runAsNonRoot: false + restartPolicy: Always + terminationGracePeriodSeconds: 30 + dnsPolicy: ClusterFirst + securityContext: {} + schedulerName: default-scheduler + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: "100%" + revisionHistoryLimit: 10 + progressDeadlineSeconds: 1200 + diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml index 3f7d6014..c1e3172a 100644 --- 
a/deploy/components/vllm-p2p/kustomization.yaml +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -7,7 +7,6 @@ resources: - deployments/vllm-deployment.yaml - deployments/redis-deployment.yaml - service/redis-service.yaml - - pvc/volume.yaml - deployments/secret.yaml images: @@ -17,14 +16,3 @@ images: - name: redis newName: ${REDIS_IMAGE} newTag: ${REDIS_TAG} - -configMapGenerator: - - name: model-config - literals: - - MODEL_NAME=${MODEL_NAME} - - MODEL_LABEL=${MODEL_LABEL} - - POOL_LABEL=${POOL_LABEL} - - REDIS_ENABLED=${REDIS_ENABLED} - -generatorOptions: - disableNameSuffixHash: true \ No newline at end of file diff --git a/deploy/components/vllm-p2p/pvc/volume.yaml b/deploy/components/vllm-p2p/pvc/volume.yaml deleted file mode 100644 index 6a0a042c..00000000 --- a/deploy/components/vllm-p2p/pvc/volume.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: ${PVC_NAME} - namespace: ${NAMESPACE} - labels: - app.kubernetes.io/name: vllm - app.kubernetes.io/component: storage - app.kubernetes.io/model: ${MODEL_LABEL} - finalizers: - - kubernetes.io/pvc-protection -spec: - accessModes: - - ${PVC_ACCESS_MODE} - resources: - requests: - storage: ${PVC_SIZE} - storageClassName: ${PVC_STORAGE_CLASS} diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/service/redis-service.yaml index 947f87ac..a5d5fd00 100644 --- a/deploy/components/vllm-p2p/service/redis-service.yaml +++ b/deploy/components/vllm-p2p/service/redis-service.yaml @@ -1,8 +1,7 @@ apiVersion: v1 kind: Service metadata: - name: ${REDIS_NAME} - namespace: ${NAMESPACE} + name: ${REDIS_SVC_NAME} labels: app.kubernetes.io/name: redis app.kubernetes.io/component: redis-lookup-server diff --git a/deploy/components/vllm/configmap.yaml b/deploy/components/vllm/configmap.yaml new file mode 100644 index 00000000..1a4f8903 --- /dev/null +++ b/deploy/components/vllm/configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3-8b-instruct-adapters + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: food-review-1 + source: Kawon/llama3.1-food-finetune_v14_r8 diff --git a/deploy/components/vllm/deployments.yaml b/deploy/components/vllm/deployments.yaml index 19a398b1..2d7a63d5 100644 --- a/deploy/components/vllm/deployments.yaml +++ b/deploy/components/vllm/deployments.yaml @@ -3,14 +3,14 @@ kind: Deployment metadata: name: ${VLLM_DEPLOYMENT_NAME} spec: - replicas: 3 + replicas: ${VLLM_REPLICA_COUNT} selector: matchLabels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} template: metadata: labels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} spec: securityContext: runAsUser: ${PROXY_UID} diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index ee4a4e7d..93813639 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -18,6 +18,8 @@ kind: Kustomization resources: - deployments.yaml - secret.yaml +- configmap.yaml + images: - name: vllm/vllm-openai diff --git a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml index 3461a596..8c07c693 100644 --- a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml @@ -3,7 +3,7 @@ 
kind: GatewayParameters metadata: name: custom-gw-params spec: - kube: + kube: envoyContainer: securityContext: allowPrivilegeEscalation: false @@ -11,12 +11,12 @@ spec: runAsNonRoot: true runAsUser: "${PROXY_UID}" service: - type: NodePort + type: LoadBalancer extraLabels: gateway: custom podTemplate: extraLabels: gateway: custom - securityContext: + securityContext: seccompProfile: type: RuntimeDefault diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml index a81c387e..48c90408 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml @@ -6,6 +6,6 @@ resources: images: - name: quay.io/vllm-d/vllm-d-dev:0.0.2 - newName: ${VLLM_P2P_IMAGE} - newTag: ${VLLM_P2P_TAG} + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml index 921b9ef5..a45ae271 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml @@ -6,7 +6,6 @@ resources: images: - name: quay.io/vllm-d/vllm-sim - newName: ${VLLM_SIM_IMAGE} newTag: ${VLLM_SIM_TAG} patches: diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml index d86d712c..dbb99b17 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: ${VLLM_DEPLOYMENT_NAME} + name: vllm-sim spec: template: spec: diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 98c87492..dfa73f35 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -24,14 +24,21 @@ if [[ -z "${VLLM_MODE:-}" ]]; then exit 1 fi -# vLLM Specific Configuration node +# GIE Configuration node +export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" +export REDIS_SVC_NAME="${REDIS_SVC_NAME:-lookup-server-service}" +export REDIS_HOST="${REDIS_HOST:-${REDIS_SVC_NAME}.${NAMESPACE}.svc.cluster.local}" #TODO- remove Redis to kustomize +export REDIS_PORT="${REDIS_PORT:-8100}" +export HF_TOKEN="${HF_TOKEN:-}" +# vLLM Specific Configuration node case "${VLLM_MODE}" in vllm-sim) export VLLM_SIM_IMAGE="${VLLM_SIM_IMAGE:-quay.io/vllm-d/vllm-sim}" export VLLM_SIM_TAG="${VLLM_SIM_TAG:-0.0.2}" export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" export EPP_TAG="${EPP_TAG:-main}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-sim}" ;; vllm | vllm-p2p) # Shared across both full model modes - // TODO - make more env variables similar @@ -39,13 +46,15 @@ case "${VLLM_MODE}" in export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" export HF_TOKEN=$(echo -n "${HF_TOKEN:-}" | base64 | tr -d '\n') export VOLUME_MOUNT_PATH="${VOLUME_MOUNT_PATH:-/data}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-3}" + if [[ "$VLLM_MODE" == "vllm" ]]; then export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/vllm-d/vllm-d-dev}" export VLLM_TAG="${VLLM_TAG:-0.0.2}" export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-llama3-8b-instruct}" - export 
EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" - export EPP_TAG="${EPP_TAG:-main}" + export EPP_IMAGE="${EPP_IMAGE:-quay.io/vllm-d/gateway-api-inference-extension-dev}" + export EPP_TAG="${EPP_TAG:-0.0.4}" export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" @@ -55,27 +64,25 @@ case "${VLLM_MODE}" in export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then - export VLLM_IMAGE="${VLLM_IMAGE:-vllm/vllm-openai}" - export VLLM_TAG="${VLLM_TAG:-latest}" + export VLLM_IMAGE="${VLLM_IMAGE:-lmcache/vllm-openai}" + export VLLM_TAG="${VLLM_TAG:-2025-03-10}" + export EPP_IMAGE="${EPP_IMAGE:- quay.io/vmaroon/gateway-api-inference-extension/epp}" + export EPP_TAG="${EPP_TAG:-kv-aware}" export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" - export HF_SECRET_KEY="${HF_SECRET_KEY:-${MODEL_LABEL}}" + export HF_SECRET_KEY="${HF_SECRET_KEY:-${HF_SECRET_NAME}_${MODEL_LABEL}}" export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" - export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-4}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}" export PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}" export PVC_SIZE="${PVC_SIZE:-10Gi}" export PVC_STORAGE_CLASS="${PVC_STORAGE_CLASS:-standard}" - export REDIS_NAME="${REDIS_NAME:-vllm-redis}" export REDIS_IMAGE="${REDIS_IMAGE:-redis}" export REDIS_TAG="${REDIS_TAG:-7.2.3}" export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" - export REDIS_PORT="${REDIS_PORT:-6379}" + export POD_IP="POD_IP" export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" export REDIS_SERVICE_TYPE="${REDIS_SERVICE_TYPE:-ClusterIP}" - export POOL_LABEL="${POOL_LABEL:-vllm-llama3-8b-instruct}" - export REDIS_ENABLED="${REDIS_ENABLED:-true}" fi ;; *) @@ -100,9 +107,13 @@ if [[ "$CLEAN" == "true" ]]; then kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f - kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f - else - echo "INFO: Deploying Development Environment in namespace ${NAMESPACE}" - kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" apply -f - + echo "INFO: Deploying vLLM Environment in namespace ${NAMESPACE}" + oc adm policy add-scc-to-user anyuid -z default -n ${NAMESPACE} # TODO - Change to security context kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" apply -f - + + echo "INFO: Deploying Gateway Environment in namespace ${NAMESPACE}" + kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" apply -f - + echo "INFO: Waiting for resources in namespace ${NAMESPACE} to become ready" kubectl -n "${NAMESPACE}" wait deployment/endpoint-picker --for=condition=Available --timeout=60s kubectl -n "${NAMESPACE}" wait gateway/inference-gateway --for=condition=Programmed --timeout=60s @@ -117,7 +128,7 @@ else ;; vllm-p2p) kubectl -n "${NAMESPACE}" wait deployment/vllm-mistral7b --for=condition=Available --timeout=180s - kubectl -n "${NAMESPACE}" wait deployment/${REDIS_NAME} --for=condition=Available --timeout=60s + kubectl -n "${NAMESPACE}" wait 
deployment/${REDIS_SVC_NAME} --for=condition=Available --timeout=60s ;; esac fi From a11e98487e1b7858a1c91b0ad2efbaa3c1677330 Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Sun, 27 Apr 2025 02:34:21 +0300 Subject: [PATCH 3/5] [fix]: Small fixes for deployment and fix comments Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 91 ++++++++++++------- .../inference-gateway/deployments.yaml | 4 +- .../inference-gateway/httproutes.yaml | 2 +- .../inference-gateway/inference-models.yaml | 22 +---- .../inference-gateway/kustomization.yaml | 2 + .../secret.yaml | 1 - deploy/components/vllm-p2p/kustomization.yaml | 26 ++++-- .../{deployments => }/redis-deployment.yaml | 7 +- .../vllm-p2p/{service => }/redis-service.yaml | 0 deploy/components/vllm-p2p/secret.yaml | 10 ++ .../{deployments => }/vllm-deployment.yaml | 7 +- deploy/components/vllm-sim/deployments.yaml | 6 +- deploy/components/vllm/configmap.yaml | 6 +- deploy/components/vllm/deployments.yaml | 18 +--- deploy/components/vllm/kustomization.yaml | 4 + deploy/components/vllm/secret.yaml | 1 - .../dev/kind-istio/patch-deployments.yaml | 2 +- .../dev/kind-kgateway/patch-deployments.yaml | 2 +- .../kubernetes-istio/patch-deployments.yaml | 2 +- .../patch-deployments.yaml | 2 +- .../vllm-p2p/kustomization.yaml | 2 + .../vllm-p2p/patch-deployments.yaml | 9 ++ .../kubernetes-vllm/vllm/kustomization.yaml | 6 ++ .../vllm/patch-deployments.yaml | 2 +- scripts/kubernetes-dev-env.sh | 40 ++++---- 25 files changed, 154 insertions(+), 120 deletions(-) rename deploy/components/{vllm-p2p/deployments => inference-gateway}/secret.yaml (88%) rename deploy/components/vllm-p2p/{deployments => }/redis-deployment.yaml (88%) rename deploy/components/vllm-p2p/{service => }/redis-service.yaml (100%) create mode 100644 deploy/components/vllm-p2p/secret.yaml rename deploy/components/vllm-p2p/{deployments => }/vllm-deployment.yaml (95%) create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 9d555a5c..801e67ed 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -152,6 +152,13 @@ Create the namespace: ```console kubectl create namespace ${NAMESPACE} ``` +Set the default namespace for kubectl commands + +```console +kubectl config set-context --current --namespace="${NAMESPACE}" +``` + +> NOTE: If you are using OpenShift (oc CLI), use the following instead: `oc project "${NAMESPACE}"` You'll need to provide a `Secret` with the login credentials for your private repository (e.g. quay.io). It should look something like this: @@ -178,13 +185,6 @@ Export the name of the `Secret` to the environment: export REGISTRY_SECRET=anna-pull-secret ``` -You can optionally set a custom EPP image (otherwise, the default will be used): - -```console -export EPP_IMAGE="/" -export EPP_TAG="" -``` - Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: - `vllm-sim`: Lightweight simulator for simple environments @@ -194,24 +194,10 @@ Set the `VLLM_MODE` environment variable based on which version of vLLM you want ```console export VLLM_MODE=vllm-sim # or vllm / vllm-p2p ``` -Each mode has default image values, but you can override them: -For vllm-sim: - -```console -export VLLM_SIM_IMAGE="/" -export VLLM_SIM_TAG="" -``` - -For vllm and vllm-p2p: -- set Vllm image: -```console -export VLLM_IMAGE="/" -export VLLM_TAG="" -``` - Set hugging face token variable: export HF_TOKEN="" -**Warning**: For vllm mode, the default image uses llama3-8b and vllm-mistral. 
Make sure you have permission to access this model on Hugging Face.
 
 Once all this is set up, you can deploy the environment:
 
@@ -222,30 +208,73 @@ make environment.dev.kubernetes
 This will deploy the entire stack to whatever namespace you chose. You can test by exposing the inference `Gateway` via port-forward:
 
-```console
+```bash
 kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80
 ```
 
 And making requests with `curl`:
 
 - vllm-sim
 
   ```bash
   curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
     -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
   ```
 
-- vllm
+- vllm or vllm-p2p
 
   ```bash
   curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
     -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq
   ```
+
+#### Environment Configuration
+
+##### **1. Setting the EPP image and tag:**
+
+You can optionally set a custom EPP image (otherwise, the default will be used):
+
+```bash
+export EPP_IMAGE="<your-registry>/<your-image>"
+export EPP_TAG="<your-tag>"
+```
+
+##### **2. Setting the vLLM image and tag:**
+
+Each vLLM mode has default image values, but you can override them:
+
+For `vllm-sim` mode:
+
+```bash
+export VLLM_SIM_IMAGE="<your-registry>/<your-image>"
+export VLLM_SIM_TAG="<your-tag>"
+```
+
+For `vllm` and `vllm-p2p` modes:
+
+```bash
+export VLLM_IMAGE="<your-registry>/<your-image>"
+export VLLM_TAG="<your-tag>"
+```
+
+##### **3. Setting the model name and label:**
+
+You can override the model that will be deployed and served:
+
+```bash
+export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}"
+export MODEL_LABEL="${MODEL_LABEL:-mistral7b}"
+```
+
+It is also recommended to update the pool name accordingly:
+
+```bash
+export POOL_NAME="${POOL_NAME:-vllm-Mistral-7B-Instruct}"
+```
+
+##### **4. Additional environment settings:**
+
+More environment variable settings can be found in `scripts/kubernetes-dev-env.sh`.
+
-- vllm-p2p
-
-  ```console
-  curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
-    -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq
-  ```
 
 #### Development Cycle
 
 > **WARNING**: This is a very manual process at the moment.
We expect to make diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml index f523f812..b603beb5 100644 --- a/deploy/components/inference-gateway/deployments.yaml +++ b/deploy/components/inference-gateway/deployments.yaml @@ -22,7 +22,7 @@ spec: imagePullPolicy: IfNotPresent args: - -poolName - - "vllm-llama3-8b-instruct" + - "${POOL_NAME}" - -v - "4" - --zap-encoder @@ -55,4 +55,4 @@ spec: valueFrom: secretKeyRef: name: ${HF_SECRET_NAME} - key: ${HF_SECRET_KEY} \ No newline at end of file + key: ${HF_SECRET_KEY} diff --git a/deploy/components/inference-gateway/httproutes.yaml b/deploy/components/inference-gateway/httproutes.yaml index 1115d13d..97eb2cf3 100644 --- a/deploy/components/inference-gateway/httproutes.yaml +++ b/deploy/components/inference-gateway/httproutes.yaml @@ -13,7 +13,7 @@ spec: backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool - name: vllm-llama3-8b-instruct + name: ${POOL_NAME} port: 8000 timeouts: request: 30s diff --git a/deploy/components/inference-gateway/inference-models.yaml b/deploy/components/inference-gateway/inference-models.yaml index 330f19a9..869be700 100644 --- a/deploy/components/inference-gateway/inference-models.yaml +++ b/deploy/components/inference-gateway/inference-models.yaml @@ -16,27 +16,7 @@ kind: InferenceModel metadata: name: base-model spec: - modelName: meta-llama/Llama-3.1-8B-Instruct + modelName: ${MODEL_NAME} criticality: Critical poolRef: name: ${POOL_NAME} ---- -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - name: base-model-cpu -spec: - modelName: Qwen/Qwen2.5-1.5B-Instruct - criticality: Critical - poolRef: - name: ${POOL_NAME} ---- -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - name: mistarli -spec: - modelName: mistralai/Mistral-7B-Instruct-v0.2 - criticality: Critical - poolRef: - name: ${POOL_NAME} \ No newline at end of file diff --git a/deploy/components/inference-gateway/kustomization.yaml b/deploy/components/inference-gateway/kustomization.yaml index 49607a37..78dfabcd 100644 --- a/deploy/components/inference-gateway/kustomization.yaml +++ b/deploy/components/inference-gateway/kustomization.yaml @@ -26,6 +26,8 @@ resources: - deployments.yaml - gateways.yaml - httproutes.yaml +- secret.yaml + images: - name: quay.io/vllm-d/gateway-api-inference-extension/epp diff --git a/deploy/components/vllm-p2p/deployments/secret.yaml b/deploy/components/inference-gateway/secret.yaml similarity index 88% rename from deploy/components/vllm-p2p/deployments/secret.yaml rename to deploy/components/inference-gateway/secret.yaml index 1f5a2bcc..23fe9473 100644 --- a/deploy/components/vllm-p2p/deployments/secret.yaml +++ b/deploy/components/inference-gateway/secret.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: Secret metadata: name: ${HF_SECRET_NAME} - namespace: ${NAMESPACE} labels: app.kubernetes.io/name: vllm app.kubernetes.io/component: secret diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml index c1e3172a..64cedf04 100644 --- a/deploy/components/vllm-p2p/kustomization.yaml +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -1,13 +1,27 @@ +# ------------------------------------------------------------------------------ +# vLLM P2P Deployment +# +# This deploys the full vLLM model server, capable of serving real models such +# as Llama 3.1-8B-Instruct via the OpenAI-compatible API. 
It is intended for
+# environments with GPU resources and where full inference capabilities are
+# required. In addition, it adds LMCache, an LLM serving engine extension
+# that uses Redis as its lookup server, to the vLLM image.
+#
+# The deployment can be customized using environment variables to set:
+# - The container image and tag (VLLM_IMAGE, VLLM_TAG)
+# - The model to load (MODEL_NAME)
+#
+# This setup is suitable for testing and production with Kubernetes (including
+# GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`).
+# -----------------------------------------------------------------------------
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 resources:
-  - deployments/vllm-deployment.yaml
-  - deployments/redis-deployment.yaml
-  - service/redis-service.yaml
-  - deployments/secret.yaml
+  - vllm-deployment.yaml
+  - redis-deployment.yaml
+  - redis-service.yaml
+  - secret.yaml
 
 images:
   - name: vllm/vllm-openai
diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/redis-deployment.yaml
similarity index 88%
rename from deploy/components/vllm-p2p/deployments/redis-deployment.yaml
rename to deploy/components/vllm-p2p/redis-deployment.yaml
index f4b5938e..df8f3b0a 100644
--- a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml
+++ b/deploy/components/vllm-p2p/redis-deployment.yaml
@@ -1,7 +1,7 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: ${REDIS_SVC_NAME}
+  name: ${REDIS_DEPLOYMENT_NAME}
   labels:
     app.kubernetes.io/name: redis
     app.kubernetes.io/component: redis-lookup-server
@@ -48,8 +48,3 @@ spec:
       maxSurge: 25%
   revisionHistoryLimit: 10
   progressDeadlineSeconds: 600
-  # securityContext:
-  #   allowPrivilegeEscalation: false
-  #   capabilities:
-  #     drop:
-  #       - ALL
\ No newline at end of file
diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/redis-service.yaml
similarity index 100%
rename from deploy/components/vllm-p2p/service/redis-service.yaml
rename to deploy/components/vllm-p2p/redis-service.yaml
diff --git a/deploy/components/vllm-p2p/secret.yaml b/deploy/components/vllm-p2p/secret.yaml
new file mode 100644
index 00000000..23fe9473
--- /dev/null
+++ b/deploy/components/vllm-p2p/secret.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ${HF_SECRET_NAME}
+  labels:
+    app.kubernetes.io/name: vllm
+    app.kubernetes.io/component: secret
+type: Opaque
+data:
+  ${HF_SECRET_KEY}: ${HF_TOKEN}
diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/vllm-deployment.yaml
similarity index 95%
rename from deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
rename to deploy/components/vllm-p2p/vllm-deployment.yaml
index 1c296eff..19fd59c2 100644
--- a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
+++ b/deploy/components/vllm-p2p/vllm-deployment.yaml
@@ -22,11 +22,6 @@ spec:
         app.kubernetes.io/model: ${MODEL_LABEL}
         app: ${POOL_NAME}
     spec:
-      # securityContext:
-      #   runAsUser: ${PROXY_UID}
-      #   runAsNonRoot: true
-      #   seccompProfile:
-      #     type: RuntimeDefault
       containers:
         - name: vllm
           image: ${VLLM_IMAGE}:${VLLM_TAG}
@@ -101,7 +96,7 @@ spec:
             limits:
               nvidia.com/gpu: "1"
             requests:
-              cpu: "10"
+              cpu: "${VLLM_CPU_RESOURCES}"
               memory: 40Gi
               nvidia.com/gpu: "1"
           terminationMessagePath: /dev/termination-log
diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml
index 4673a99c..34b742c2 100644
--- a/deploy/components/vllm-sim/deployments.yaml
+++
b/deploy/components/vllm-sim/deployments.yaml @@ -3,16 +3,16 @@ kind: Deployment metadata: name: vllm-sim labels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} spec: replicas: 1 selector: matchLabels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} template: metadata: labels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} ai-aware-router-pod: "true" spec: containers: diff --git a/deploy/components/vllm/configmap.yaml b/deploy/components/vllm/configmap.yaml index 1a4f8903..03019ce1 100644 --- a/deploy/components/vllm/configmap.yaml +++ b/deploy/components/vllm/configmap.yaml @@ -1,13 +1,13 @@ apiVersion: v1 kind: ConfigMap metadata: - name: vllm-llama3-8b-instruct-adapters + name: lora-adapters data: configmap.yaml: | vLLMLoRAConfig: - name: vllm-llama3-8b-instruct-adapters + name: lora-adapters port: 8000 - defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + defaultBaseModel: ${MODEL_NAME} ensureExist: models: - id: food-review-1 diff --git a/deploy/components/vllm/deployments.yaml b/deploy/components/vllm/deployments.yaml index 2d7a63d5..71eaa72c 100644 --- a/deploy/components/vllm/deployments.yaml +++ b/deploy/components/vllm/deployments.yaml @@ -24,7 +24,7 @@ spec: command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] args: - "--model" - - "meta-llama/Llama-3.1-8B-Instruct" + - "${MODEL_NAME}" - "--tensor-parallel-size" - "1" - "--port" @@ -48,8 +48,8 @@ spec: - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: - name: hf-token - key: token + name: ${HF_SECRET_NAME} + key: ${HF_SECRET_KEY} - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING value: "true" - name: XDG_CACHE_HOME @@ -104,11 +104,6 @@ spec: name: shm - mountPath: /adapters name: adapters - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL initContainers: - name: lora-adapter-syncer tty: true @@ -122,11 +117,6 @@ spec: volumeMounts: - name: config-volume mountPath: /config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL restartPolicy: Always enableServiceLinks: false terminationGracePeriodSeconds: 130 @@ -140,4 +130,4 @@ spec: emptyDir: {} - name: config-volume configMap: - name: vllm-llama3-8b-instruct-adapters + name: lora-adapters diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index 93813639..e5f63b73 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -26,6 +26,10 @@ images: newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} +- name: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer + newName: ${LORA_ADAPTER_SYNCER_IMAGE} + newTag: ${LORA_ADAPTER_SYNCER_TAG} + configMapGenerator: - name: vllm-model-config literals: diff --git a/deploy/components/vllm/secret.yaml b/deploy/components/vllm/secret.yaml index 1f5a2bcc..23fe9473 100644 --- a/deploy/components/vllm/secret.yaml +++ b/deploy/components/vllm/secret.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: Secret metadata: name: ${HF_SECRET_NAME} - namespace: ${NAMESPACE} labels: app.kubernetes.io/name: vllm app.kubernetes.io/component: secret diff --git a/deploy/environments/dev/kind-istio/patch-deployments.yaml b/deploy/environments/dev/kind-istio/patch-deployments.yaml index 874b287c..7ab6e3ad 100644 --- a/deploy/environments/dev/kind-istio/patch-deployments.yaml +++ b/deploy/environments/dev/kind-istio/patch-deployments.yaml @@ -9,7 +9,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - "default" - -v 
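All of the `${POOL_NAME}`-style values introduced in these manifests are plain `envsubst` placeholders: nothing in Kubernetes resolves them, so the kustomize output is only valid once it has been piped through `envsubst` with the relevant variables exported, which is what the dev-env scripts do. A minimal sketch of that render-and-apply pipeline (the namespace and pool name here are illustrative values, not requirements):

```bash
# Export the variables the manifests reference, render the overlay, then apply.
# Caveat: any ${VAR} that is not exported is substituted with an empty string.
export NAMESPACE="annas-dev-environment"
export POOL_NAME="vllm-llama3-8b-instruct"

kustomize build deploy/environments/dev/kubernetes-kgateway \
  | envsubst \
  | kubectl -n "${NAMESPACE}" apply -f -
```
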
diff --git a/deploy/environments/dev/kind-kgateway/patch-deployments.yaml b/deploy/environments/dev/kind-kgateway/patch-deployments.yaml index 874b287c..7ab6e3ad 100644 --- a/deploy/environments/dev/kind-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kind-kgateway/patch-deployments.yaml @@ -9,7 +9,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - "default" - -v diff --git a/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml b/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml index 20a17d53..a5a721b8 100644 --- a/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml @@ -11,7 +11,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - ${NAMESPACE} - -v diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index 0e4ad46e..a3b93d36 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -11,7 +11,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - ${NAMESPACE} - -v diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml index 48c90408..2d378312 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml @@ -9,3 +9,5 @@ images: newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} +patches: + - path: patch-deployments.yaml diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml new file mode 100644 index 00000000..b1afb13e --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} +spec: + template: + spec: + imagePullSecrets: + - name: ${REGISTRY_SECRET} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml index 81fa76ba..af346345 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml @@ -9,3 +9,9 @@ images: newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} +- name: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer + newName: ${LORA_ADAPTER_SYNCER_IMAGE} + newTag: ${LORA_ADAPTER_SYNCER_TAG} + +patches: + - path: patch-deployments.yaml diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml index efaa2211..b1afb13e 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: vllm + name: ${VLLM_DEPLOYMENT_NAME} spec: template: spec: diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index dfa73f35..b04e2cba 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -24,13 +24,18 @@ if [[ -z "${VLLM_MODE:-}" 
]]; then exit 1 fi -# GIE Configuration node +# GIE Configuration export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" -export REDIS_SVC_NAME="${REDIS_SVC_NAME:-lookup-server-service}" -export REDIS_HOST="${REDIS_HOST:-${REDIS_SVC_NAME}.${NAMESPACE}.svc.cluster.local}" #TODO- remove Redis to kustomize -export REDIS_PORT="${REDIS_PORT:-8100}" -export HF_TOKEN="${HF_TOKEN:-}" +export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" +## EPP ENV VARs — currently added to all EPPs, regardless of the VLLM mode or whether they are actually needed +export REDIS_DEPLOYMENT_NAME="${REDIS_DEPLOYMENT_NAME:-lookup-server-service}" +export REDIS_SVC_NAME="${REDIS_SVC_NAME:-${REDIS_DEPLOYMENT_NAME}}" +export REDIS_HOST="${REDIS_HOST:-${REDIS_SVC_NAME}.${NAMESPACE}.svc.cluster.local}" +export REDIS_PORT="${REDIS_PORT:-8100}" +export HF_TOKEN=$(echo -n "${HF_TOKEN}" | base64 | tr -d '\n') +export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" +export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" # vLLM Specific Configuration node case "${VLLM_MODE}" in vllm-sim) @@ -38,40 +43,34 @@ case "${VLLM_MODE}" in export VLLM_SIM_TAG="${VLLM_SIM_TAG:-0.0.2}" export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" export EPP_TAG="${EPP_TAG:-main}" - export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-sim}" + export HF_TOKEN=$(echo -n "dummy-token" | base64 | tr -d '\n') ;; vllm | vllm-p2p) # Shared across both full model modes - // TODO - make more env variables similar # TODO: Consider unifying more environment variables for consistency and reuse - export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" - export HF_TOKEN=$(echo -n "${HF_TOKEN:-}" | base64 | tr -d '\n') + export VOLUME_MOUNT_PATH="${VOLUME_MOUNT_PATH:-/data}" export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-3}" - + export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" if [[ "$VLLM_MODE" == "vllm" ]]; then export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/vllm-d/vllm-d-dev}" export VLLM_TAG="${VLLM_TAG:-0.0.2}" - export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-llama3-8b-instruct}" export EPP_IMAGE="${EPP_IMAGE:-quay.io/vllm-d/gateway-api-inference-extension-dev}" export EPP_TAG="${EPP_TAG:-0.0.4}" - export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" - export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" - export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" - export HF_TOKEN="${HF_TOKEN:-}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-2}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" + export LORA_ADAPTER_SYNCER_IMAGE="${LORA_ADAPTER_SYNCER_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer}" + export LORA_ADAPTER_SYNCER_TAG="${LORA_ADAPTER_SYNCER_TAG:-v20250425-ddc3d69}" elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then export VLLM_IMAGE="${VLLM_IMAGE:-lmcache/vllm-openai}" export VLLM_TAG="${VLLM_TAG:-2025-03-10}" export EPP_IMAGE="${EPP_IMAGE:- quay.io/vmaroon/gateway-api-inference-extension/epp}" export EPP_TAG="${EPP_TAG:-kv-aware}" - export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" - export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" - export HF_SECRET_KEY="${HF_SECRET_KEY:-${HF_SECRET_NAME}_${MODEL_LABEL}}" - export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}" export 
PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}" @@ -79,6 +78,7 @@ case "${VLLM_MODE}" in export PVC_STORAGE_CLASS="${PVC_STORAGE_CLASS:-standard}" export REDIS_IMAGE="${REDIS_IMAGE:-redis}" export REDIS_TAG="${REDIS_TAG:-7.2.3}" + export VLLM_CPU_RESOURCES="${VLLM_CPU_RESOURCES:-10}" export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" export POD_IP="POD_IP" export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" @@ -124,10 +124,10 @@ else kubectl -n "${NAMESPACE}" wait deployment/vllm-sim --for=condition=Available --timeout=60s ;; vllm) - kubectl -n "${NAMESPACE}" wait deployment/vllm-llama3-8b-instruct --for=condition=Available --timeout=180s + kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=300s ;; vllm-p2p) - kubectl -n "${NAMESPACE}" wait deployment/vllm-mistral7b --for=condition=Available --timeout=180s + kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=180s kubectl -n "${NAMESPACE}" wait deployment/${REDIS_SVC_NAME} --for=condition=Available --timeout=60s ;; esac From 937bb50172338080ec4092602d5843031df879a1 Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Mon, 28 Apr 2025 14:54:18 +0300 Subject: [PATCH 4/5] [fix]: fix typos and edit the Readme and env vars Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 67 +++++++++++-------- Makefile | 2 +- deploy/components/vllm-p2p/kustomization.yaml | 2 +- .../components/vllm-p2p/redis-deployment.yaml | 2 +- deploy/components/vllm/kustomization.yaml | 2 +- scripts/kubernetes-dev-env.sh | 10 +-- 6 files changed, 47 insertions(+), 38 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 801e67ed..24f28e19 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -37,7 +37,7 @@ serving resources. Run the following: -```console +```bash make environment.dev.kind ``` @@ -48,6 +48,7 @@ namespace. There are several ways to access the gateway: **Port forward**: + ```sh $ kubectl --context kind-gie-dev port-forward service/inference-gateway 8080:80 ``` @@ -55,6 +56,7 @@ $ kubectl --context kind-gie-dev port-forward service/inference-gateway 8080:80 **NodePort `inference-gateway-istio`** > **Warning**: This method doesn't work on `podman` correctly, as `podman` support > with `kind` is not fully implemented yet. + ```sh # Determine the k8s node address $ kubectl --context kind-gie-dev get node -o yaml | grep address @@ -80,9 +82,10 @@ By default the created inference gateway, can be accessed on port 30080. This ca be overriden to any free port in the range of 30000 to 32767, by running the above command as follows: -```console +```bash GATEWAY_HOST_PORT=<selected-port> make environment.dev.kind ``` + **Where:** <selected-port> is the port on your local machine you want to use to access the inference gatyeway. @@ -96,7 +99,7 @@ access the inference gatyeway. To test your changes to the GIE in this environment, make your changes locally and then run the following: -```console +```bash make environment.dev.kind.update ``` @@ -122,7 +125,7 @@ the `default` namespace if the cluster is private/personal). The following will deploy all the infrastructure-level requirements (e.g. CRDs, Operators, etc) to support the namespace-level development environments: -```console +```bash make environment.dev.kubernetes.infrastructure ``` @@ -140,7 +143,7 @@ To deploy a development environment to the cluster you'll need to explicitly provide a namespace. This can be `default` if this is your personal cluster, but on a shared cluster you should pick something unique.
For example: -```console +```bash export NAMESPACE=annas-dev-environment ``` @@ -149,12 +152,13 @@ export NAMESPACE=annas-dev-environment Create the namespace: -```console +```bash kubectl create namespace ${NAMESPACE} ``` + Set the default namespace for kubectl commands -```console +```bash kubectl config set-context --current --namespace="${NAMESPACE}" ``` @@ -175,33 +179,39 @@ type: kubernetes.io/dockerconfigjson Apply that to your namespace: -```console -kubectl -n ${NAMESPACE} apply -f secret.yaml +```bash +kubectl apply -f secret.yaml ``` Export the name of the `Secret` to the environment: -```console +```bash export REGISTRY_SECRET=anna-pull-secret ``` Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: -- `vllm-sim`: Lightweight simulator for simple environments -- `vllm`: Full vLLM model server for real inference -- `vllm-p2p`: Full vLLM with LMCache P2P support for enable KV-Cache aware routing +* `vllm-sim`: Lightweight simulator for simple environments (defult). +* `vllm`: Full vLLM model server, using GPU/CPU for inferencing +* `vllm-p2p`: Full vLLM with LMCache P2P support to enable KV-Cache aware routing -```console +```bash export VLLM_MODE=vllm-sim # or vllm / vllm-p2p ``` -- Set hugging face token variable: - export HF_TOKEN="" +- Set Hugging Face token variable: + +```bash +export HF_TOKEN="" +``` + **Warning**: For vllm mode, the default image uses llama3-8b. Make sure you have permission to access these files in their respective repositories. +**Note:** The model can be replaced. See [Environment Configuration](#environment-configuration) for model settings. + Once all this is set up, you can deploy the environment: -```console +```bash make environment.dev.kubernetes ``` @@ -209,10 +219,11 @@ This will deploy the entire stack to whatever namespace you chose. You can test by exposing the inference `Gateway` via port-forward: ```bash -kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80 +kubectl port-forward service/inference-gateway 8080:80 ``` And making requests with `curl`: + - vllm-sim ```bash curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq ``` + - vllm or vllm-p2p ```bash curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq ``` + #### Environment Configuration -##### **1. Setting the EPP image and tag:** +**1. Setting the EPP image and tag:** You can optionally set a custom EPP image (otherwise, the default will be used): @@ -236,7 +248,8 @@ You can optionally set a custom EPP image (otherwise, the default will be used): export EPP_IMAGE="/" export EPP_TAG="" ``` + -##### **2. Setting the vLLM image and tag:** +**2. Setting the vLLM image and tag:** Each vLLM mode has default image values, but you can override them: @@ -254,7 +267,7 @@ export VLLM_IMAGE="/" export VLLM_TAG="" ``` -##### **3. Setting the model name and label:** +**3. Setting the model name and label:** You can replace the model name that will be used in the system. @@ -263,13 +276,13 @@ export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" ``` -It is also recommended to update the pool name accordingly: +It is also recommended to update the inference pool name accordingly so that it aligns with the model: ```bash export POOL_NAME="${POOL_NAME:-vllm-Mistral-7B-Instruct}" ``` -##### **4.
Additional environment settings:** More Setting of environment variables can be found in the `scripts/kubernetes-dev-env.sh`. @@ -283,19 +296,19 @@ More Setting of environment variables can be found in the `scripts/kubernetes-de Make your changes locally and commit them. Then select an image tag based on the `git` SHA: -```console +```bash export EPP_TAG=$(git rev-parse HEAD) ``` Build the image: -```console +```bash DEV_VERSION=$EPP_TAG make image-build ``` Tag the image for your private registry and push it: -```console +```bash $CONTAINER_RUNTIME tag quay.io/vllm-d/gateway-api-inference-extension/epp:$TAG \ /:$EPP_TAG $CONTAINER_RUNTIME push /:$EPP_TAG @@ -307,7 +320,7 @@ $CONTAINER_RUNTIME push /:$EPP_TAG Then you can re-deploy the environment with the new changes (don't forget all the required env vars): -```console +```bash make environment.dev.kubernetes ``` diff --git a/Makefile b/Makefile index cfaa72cb..471e95a9 100644 --- a/Makefile +++ b/Makefile @@ -785,7 +785,7 @@ environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst .PHONY: clean.environment.dev.kubernetes clean.environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst @CLEAN=true ./scripts/kubernetes-dev-env.sh 2>&1 - @echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)" + @echo "INFO: Finished cleanup of development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)" # ----------------------------------------------------------------------------- # TODO: these are old aliases that we still need for the moment, but will be diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml index 64cedf04..1b4c0b28 100644 --- a/deploy/components/vllm-p2p/kustomization.yaml +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -11,7 +11,7 @@ # - The container image and tag (VLLM_IMAGE, VLLM_TAG) # - The model to load (MODEL_NAME) # -# This setup is suitable for testing and production with Kubernetes (including +# This setup is suitable for testing on Kubernetes (including # GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`). # ----------------------------------------------------------------------------- apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/deploy/components/vllm-p2p/redis-deployment.yaml b/deploy/components/vllm-p2p/redis-deployment.yaml index df8f3b0a..31b329e4 100644 --- a/deploy/components/vllm-p2p/redis-deployment.yaml +++ b/deploy/components/vllm-p2p/redis-deployment.yaml @@ -6,7 +6,7 @@ metadata: app.kubernetes.io/name: redis app.kubernetes.io/component: redis-lookup-server spec: - replicas: ${REDIS_REPLICA_COUNT} + replicas: 1 selector: matchLabels: app.kubernetes.io/name: redis diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index e5f63b73..f04fdf9a 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -10,7 +10,7 @@ # - The container image and tag (VLLM_IMAGE, VLLM_TAG) # - The model to load (MODEL_NAME) # -# This setup is suitable for testing and production with Kubernetes (including +# This setup is suitable for testing on Kubernetes (including # GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`). 
# ----------------------------------------------------------------------------- kind: Kustomization diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index b04e2cba..94ca77fc 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -19,10 +19,7 @@ if [[ -z "${NAMESPACE:-}" ]]; then echo "ERROR: NAMESPACE environment variable is not set." exit 1 fi -if [[ -z "${VLLM_MODE:-}" ]]; then - echo "ERROR: VLLM_MODE is not set. Please export one of: vllm-sim, vllm, vllm-p2p" - exit 1 -fi + # GIE Configuration export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" @@ -37,6 +34,8 @@ export HF_TOKEN=$(echo -n "${HF_TOKEN}" | base64 | tr -d '\n') export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" # vLLM Specific Configuration node +export VLLM_MODE="${VLLM_MODE:-vllm-sim}" + case "${VLLM_MODE}" in vllm-sim) export VLLM_SIM_IMAGE="${VLLM_SIM_IMAGE:-quay.io/vllm-d/vllm-sim}" @@ -59,8 +58,6 @@ case "${VLLM_MODE}" in export VLLM_TAG="${VLLM_TAG:-0.0.2}" export EPP_IMAGE="${EPP_IMAGE:-quay.io/vllm-d/gateway-api-inference-extension-dev}" export EPP_TAG="${EPP_TAG:-0.0.4}" - - export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-2}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" export LORA_ADAPTER_SYNCER_IMAGE="${LORA_ADAPTER_SYNCER_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer}" @@ -79,7 +76,6 @@ case "${VLLM_MODE}" in export REDIS_IMAGE="${REDIS_IMAGE:-redis}" export REDIS_TAG="${REDIS_TAG:-7.2.3}" export VLLM_CPU_RESOURCES="${VLLM_CPU_RESOURCES:-10}" - export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" export POD_IP="POD_IP" export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" export REDIS_SERVICE_TYPE="${REDIS_SERVICE_TYPE:-ClusterIP}" From 17a23e5a553c68e39cec4a567bb9d65b4dc561fd Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Tue, 29 Apr 2025 12:20:03 +0300 Subject: [PATCH 5/5] [fix] Fix the kind environment and set gateway service to be NodePort Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 22 +++++++++---------- .../inference-gateway/deployments.yaml | 8 ------- .../inference-gateway/kustomization.yaml | 2 -- deploy/components/vllm/kustomization.yaml | 2 +- .../gateway-parameters.yaml | 2 +- .../kubernetes-kgateway/kustomization.yaml | 3 ++- .../patch-deployments.yaml | 8 +++++++ .../dev/kubernetes-kgateway}/secret.yaml | 0 .../kubernetes-vllm/vllm/kustomization.yaml | 2 +- scripts/kind-dev-env.sh | 7 +++++- scripts/kubernetes-dev-env.sh | 5 +++-- 11 files changed, 33 insertions(+), 28 deletions(-) rename deploy/{components/inference-gateway => environments/dev/kubernetes-kgateway}/secret.yaml (100%) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 24f28e19..f6a7c466 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -191,7 +191,7 @@ export REGISTRY_SECRET=anna-pull-secret Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: -* `vllm-sim`: Lightweight simulator for simple environments (defult). +* `vllm-sim`: Lightweight simulator for simple environments (default).
* `vllm`: Full vLLM model server, using GPU/CPU for inferencing * `vllm-p2p`: Full vLLM with LMCache P2P support to enable KV-Cache aware routing @@ -224,19 +224,19 @@ kubectl port-forward service/inference-gateway 8080:80 And making requests with `curl`: -- vllm-sim +**vllm-sim:** - ```bash - curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ - -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq - ``` +```bash +curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq +``` -- vllm or vllm-p2p +**vllm or vllm-p2p:** - ```bash - curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ - -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq - ``` +```bash +curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq +``` #### Environment Configuration diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml index b603beb5..afff8fd2 100644 --- a/deploy/components/inference-gateway/deployments.yaml +++ b/deploy/components/inference-gateway/deployments.yaml @@ -48,11 +48,3 @@ spec: service: inference-extension initialDelaySeconds: 5 periodSeconds: 10 - env: - - name: KVCACHE_INDEXER_REDIS_ADDR - value: ${REDIS_HOST}:${REDIS_PORT} - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: ${HF_SECRET_NAME} - key: ${HF_SECRET_KEY} diff --git a/deploy/components/inference-gateway/kustomization.yaml b/deploy/components/inference-gateway/kustomization.yaml index 78dfabcd..49607a37 100644 --- a/deploy/components/inference-gateway/kustomization.yaml +++ b/deploy/components/inference-gateway/kustomization.yaml @@ -26,8 +26,6 @@ resources: - deployments.yaml - gateways.yaml - httproutes.yaml -- secret.yaml - images: - name: quay.io/vllm-d/gateway-api-inference-extension/epp diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index f04fdf9a..6e0da28b 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -33,4 +33,4 @@ images: configMapGenerator: - name: vllm-model-config literals: - - MODEL_NAME=${MODEL_NAME} \ No newline at end of file + - MODEL_NAME=${MODEL_NAME} diff --git a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml index 8c07c693..da2d91d2 100644 --- a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml @@ -11,7 +11,7 @@ spec: runAsNonRoot: true runAsUser: "${PROXY_UID}" service: - type: LoadBalancer + type: ${GATEWAY_SERVICE_TYPE} extraLabels: gateway: custom podTemplate: diff --git a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml index 7dc295de..293119e2 100644 --- a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml @@ -4,6 +4,7 @@ kind: Kustomization namespace: ${NAMESPACE} resources: +- secret.yaml - ../../../components/inference-gateway/ - gateway-parameters.yaml @@ -14,4 +15,4 @@ images: patches: - path:
patch-deployments.yaml -- path: patch-gateways.yaml \ No newline at end of file +- path: patch-gateways.yaml diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index a3b93d36..00c87fbb 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -22,3 +22,11 @@ spec: - "9002" - -grpcHealthPort - "9003" + env: + - name: KVCACHE_INDEXER_REDIS_ADDR + value: ${REDIS_HOST}:${REDIS_PORT} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: ${HF_SECRET_KEY} \ No newline at end of file diff --git a/deploy/components/inference-gateway/secret.yaml b/deploy/environments/dev/kubernetes-kgateway/secret.yaml similarity index 100% rename from deploy/components/inference-gateway/secret.yaml rename to deploy/environments/dev/kubernetes-kgateway/secret.yaml diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml index af346345..e512ee89 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml @@ -5,7 +5,7 @@ resources: - ../../../../components/vllm/ images: -- name: quay.io/vllm-d/vllm-d-dev:0.0.2 +- name: quay.io/vllm-d/vllm-d-dev newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh index e40847e0..85cd988e 100755 --- a/scripts/kind-dev-env.sh +++ b/scripts/kind-dev-env.sh @@ -25,6 +25,11 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Set the host port to map to the Gateway's inbound port (30080) : "${GATEWAY_HOST_PORT:=30080}" +# Set the inference pool name for the deployment +export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" + +# Set the model name to deploy +export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" # ------------------------------------------------------------------------------ # Setup & Requirement Checks # ------------------------------------------------------------------------------ @@ -113,7 +118,7 @@ kustomize build --enable-helm deploy/components/crds-kgateway | # Deploy the environment to the "default" namespace kustomize build --enable-helm deploy/environments/dev/kind-kgateway \ - | sed "s/REPLACE_NAMESPACE/${PROJECT_NAMESPACE}/gI" \ + | envsubst | sed "s/REPLACE_NAMESPACE/${PROJECT_NAMESPACE}/gI" \ | kubectl --context ${KUBE_CONTEXT} apply -f - # Wait for all control-plane pods to be ready diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 94ca77fc..62027c69 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -24,6 +24,7 @@ fi # GIE Configuration export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" +export GATEWAY_SERVICE_TYPE="${GATEWAY_SERVICE_TYPE:-NodePort}" ## EPP ENV VARs — currently added to all EPPs, regardless of the VLLM mode or whether they are actually needed export REDIS_DEPLOYMENT_NAME="${REDIS_DEPLOYMENT_NAME:-lookup-server-service}" @@ -104,7 +105,7 @@ if [[ "$CLEAN" == "true" ]]; then kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f - else echo "INFO: Deploying vLLM Environment in namespace ${NAMESPACE}" - oc adm policy add-scc-to-user anyuid -z default -n ${NAMESPACE} # 
TODO - Change to security context + oc adm policy add-scc-to-user anyuid -z default -n ${NAMESPACE} kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" apply -f - echo "INFO: Deploying Gateway Environment in namespace ${NAMESPACE}" @@ -120,7 +121,7 @@ else kubectl -n "${NAMESPACE}" wait deployment/vllm-sim --for=condition=Available --timeout=60s ;; vllm) - kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=300s + kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=500s ;; vllm-p2p) kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=180s
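If one of these `kubectl wait` calls times out (a full vLLM deployment can spend several minutes pulling its image and loading model weights on first start), a quick way to inspect what is blocking the rollout is a sketch like the following; the deployment name and the `app.kubernetes.io/name=vllm` label assume the defaults exported earlier in this script:

```bash
# Check rollout progress first, then fall back to pod-level detail on failure.
kubectl -n "${NAMESPACE}" rollout status "deployment/${VLLM_DEPLOYMENT_NAME}" --timeout=60s || {
  kubectl -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=vllm -o wide
  kubectl -n "${NAMESPACE}" describe deployment "${VLLM_DEPLOYMENT_NAME}"
}
```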