neuralmagic · kfirtoledo · Apr 25, 2025 · Apr 25, 2025 · Apr 26, 2025 · elevran
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
@@ -178,20 +178,40 @@ Export the name of the `Secret` to the environment:
 export REGISTRY_SECRET=anna-pull-secret
 ```
 
-Now you need to provide several other environment variables. You'll need to
-indicate the location and tag of the `vllm-sim` image:
+You can optionally set a custom EPP image (otherwise, the default will be used):
+
+```console
+export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
+export EPP_TAG="<YOUR_TAG>"
+```
+
+Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy:
+
+- `vllm-sim`: Lightweight simulator for simple environments
+- `vllm`: Full vLLM model server for real inference
- `vllm`: Full vLLM model server for real inference
+- `vllm`: Full vLLM model server, using GPU/CPU for inferencing
- `vllm`: Full vLLM model server for real inference
+- `vllm`: Full vLLM model server, using GPU/CPU for inferencing
+- `vllm-p2p`: Full vLLM with LMCache P2P support to enable KV-Cache aware routing
+
+```console
+export VLLM_MODE=vllm-sim  # or vllm / vllm-p2p
+```
+Each mode has default image values, but you can override them:
+
+For vllm-sim:
 
 ```console
 export VLLM_SIM_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
 export VLLM_SIM_TAG="<YOUR_TAG>"
 ```
 
-The same thing will need to be done for the EPP:
-
+For vllm and vllm-p2p:
+- Set the vLLM image:
 ```console
-export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
-export EPP_TAG="<YOUR_TAG>"
+export VLLM_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
+export VLLM_TAG="<YOUR_TAG>"
 ```
+- Set hugging face token variable:
- Set hugging face token variable:
+- Set Hugging Face token variable:
- Set hugging face token variable:
+- Set Hugging Face token variable:
+  ```console
+  export HF_TOKEN="<HF_TOKEN>"
+  ```
+**Warning**: In `vllm` mode, the default image uses llama3-8b and vllm-mistral. Make sure your Hugging Face token has permission to access these models in their respective repositories.
 
 Once all this is set up, you can deploy the environment:
 
@@ -203,16 +223,29 @@ This will deploy the entire stack to whatever namespace you chose. You can test
 by exposing the inference `Gateway` via port-forward:
 
 ```console
-kubectl -n ${NAMESPACE} port-forward service/inference-gateway-istio 8080:80
+kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80
 ```
 
 And making requests with `curl`:
+- vllm-sim
 
-```console
-curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
-  -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
-```
+    ```console
+    curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
+      -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
+    ```
+
+- vllm
+
+  ```console
+  curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
+    -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq
+  ```
 
+- vllm-p2p
+  ```console
+  curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
+    -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq
+  ```
 #### Development Cycle
 
 > **WARNING**: This is a very manual process at the moment. We expect to make

diff --git a/Makefile b/Makefile
@@ -780,11 +780,8 @@ environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst
 # ------------------------------------------------------------------------------
 .PHONY: clean.environment.dev.kubernetes
 clean.environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst
-ifndef NAMESPACE
-	$(error "Error: NAMESPACE is required but not set")
-endif
-	@echo "INFO: cleaning up dev environment in $(NAMESPACE)"
-	kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete -f -
+	@CLEAN=true ./scripts/kubernetes-dev-env.sh 2>&1
+	@echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)"
-	@echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)"
+	@echo "INFO: Finished cleanup of development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)"
-	@echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)"
+	@echo "INFO: Finished cleanup of development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)"
 
 # -----------------------------------------------------------------------------
 # TODO: these are old aliases that we still need for the moment, but will be

diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml
@@ -48,3 +48,11 @@ spec:
             service: inference-extension
           initialDelaySeconds: 5
           periodSeconds: 10
+        env:
+          - name: KVCACHE_INDEXER_REDIS_ADDR
+            value: ${REDIS_HOST}:${REDIS_PORT}
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: ${HF_SECRET_NAME}
+                key: ${HF_SECRET_KEY}
diff --git a/deploy/components/inference-gateway/inference-models.yaml b/deploy/components/inference-gateway/inference-models.yaml
@@ -6,7 +6,37 @@ spec:
   modelName: food-review
   criticality: Critical
   poolRef:
-    name: vllm-llama3-8b-instruct
+    name: ${POOL_NAME}
   targetModels:
   - name: food-review
     weight: 100
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: base-model
+spec:
+  modelName: meta-llama/Llama-3.1-8B-Instruct
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: base-model-cpu
+spec:
+  modelName: Qwen/Qwen2.5-1.5B-Instruct
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: mistarli
-  name: mistarli
+  name: mistral
-  name: mistarli
+  name: mistral
+spec:
+  modelName: mistralai/Mistral-7B-Instruct-v0.2
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
diff --git a/deploy/components/inference-gateway/inference-pools.yaml b/deploy/components/inference-gateway/inference-pools.yaml
@@ -1,10 +1,10 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  name: vllm-llama3-8b-instruct
+  name: ${POOL_NAME}
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama3-8b-instruct
+    app: ${POOL_NAME}
   extensionRef:
     name: endpoint-picker
diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ${REDIS_SVC_NAME}
+  labels:
+    app.kubernetes.io/name: redis
+    app.kubernetes.io/component: redis-lookup-server
+spec:
+  replicas: ${REDIS_REPLICA_COUNT}
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: redis
+      app.kubernetes.io/component: redis-lookup-server
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: redis
+        app.kubernetes.io/component: redis-lookup-server
+    spec:
+      containers:
+        - name: lookup-server
+          image: ${REDIS_IMAGE}:${REDIS_TAG}
+          imagePullPolicy: IfNotPresent
+          command:
+            - redis-server
+          ports:
+            - name: redis-port
+              containerPort: ${REDIS_TARGET_PORT}
+              protocol: TCP
+          resources:
+            limits:
+              cpu: "4"
+              memory: 10G
+            requests:
+              cpu: "4"
+              memory: 8G
+          terminationMessagePath: /dev/termination-log
+          terminationMessagePolicy: File
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      dnsPolicy: ClusterFirst
+      securityContext: {}
+      schedulerName: default-scheduler
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 25%
+      maxSurge: 25%
+  revisionHistoryLimit: 10
+  progressDeadlineSeconds: 600
+          # securityContext:
+          #   allowPrivilegeEscalation: false
+          #   capabilities:
+          #     drop:
+          #       - ALL
diff --git a/deploy/components/vllm-p2p/deployments/secret.yaml b/deploy/components/vllm-p2p/deployments/secret.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ${HF_SECRET_NAME}
+  namespace: ${NAMESPACE}
+  labels:
+    app.kubernetes.io/name: vllm
+    app.kubernetes.io/component: secret
+type: Opaque
+data:
+  ${HF_SECRET_KEY}: ${HF_TOKEN}
diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
@@ -0,0 +1,123 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ${VLLM_DEPLOYMENT_NAME}
+  labels:
+    app.kubernetes.io/name: vllm
+    app.kubernetes.io/model: ${MODEL_LABEL}
+    app.kubernetes.io/component: vllm
+spec:
+  replicas: ${VLLM_REPLICA_COUNT}
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm
+      app.kubernetes.io/component: vllm
+      app.kubernetes.io/model: ${MODEL_LABEL}
+      app: ${POOL_NAME}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm
+        app.kubernetes.io/component: vllm
+        app.kubernetes.io/model: ${MODEL_LABEL}
+        app: ${POOL_NAME}
+    spec:
+      # securityContext:
+      #   runAsUser: ${PROXY_UID}
+        # runAsNonRoot: true
+        # seccompProfile:
+        #   type: RuntimeDefault
+      containers:
+        - name: vllm
+          image: ${VLLM_IMAGE}:${VLLM_TAG}
+          imagePullPolicy: IfNotPresent
+          command:
+            - /bin/sh
+            - "-c"
+          args:
+            - |
+              export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \
-              export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \
+              export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 && \
-              export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \
+              export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 && \
+              vllm serve ${MODEL_NAME} \
+              --host 0.0.0.0 \
+              --port 8000 \
+              --enable-chunked-prefill false \
+              --max-model-len ${MAX_MODEL_LEN} \
+              --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
+          ports:
+            - name: http
+              containerPort: 8000
+              protocol: TCP
+            - name: lmcache-dist # Port 80 is the port advertised via LMCACHE_DISTRIBUTED_URL in the container args
+              containerPort: 80
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 3
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            initialDelaySeconds: 15
+            periodSeconds: 10
+            successThreshold: 1
+            timeoutSeconds: 1
+          startupProbe:
+            failureThreshold: 60
+            httpGet:
+              path: /health
+              port: 8000
+              scheme: HTTP
+            initialDelaySeconds: 15
+            periodSeconds: 10
+            successThreshold: 1
+            timeoutSeconds: 1
+          env:
+            - name: HF_HOME
+              value: /data
+            - name: POD_IP
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: status.podIP
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: ${HF_SECRET_NAME}
+                  key: ${HF_SECRET_KEY}
+            - name: LMCACHE_LOOKUP_URL
+              value: ${REDIS_HOST}:${REDIS_PORT}
+            - name: LMCACHE_ENABLE_DEBUG
+              value: "True"
+            - name: LMCACHE_ENABLE_P2P
+              value: "True"
+            - name: LMCACHE_LOCAL_CPU
+              value: "True"
+            - name: LMCACHE_MAX_LOCAL_CPU_SIZE
+              value: "20"
+            - name: LMCACHE_USE_EXPERIMENTAL
+              value: "True"
+            - name: VLLM_RPC_TIMEOUT
+              value: "1000000"
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              cpu: "10"
+              memory: 40Gi
+              nvidia.com/gpu: "1"
+          terminationMessagePath: /dev/termination-log
+          terminationMessagePolicy: File
+          securityContext:
+            runAsNonRoot: false
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      dnsPolicy: ClusterFirst
+      securityContext: {}
+      schedulerName: default-scheduler
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 0
+      maxSurge: "100%"
+  revisionHistoryLimit: 10
+  progressDeadlineSeconds: 1200
+
diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml
@@ -0,0 +1,18 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: ${NAMESPACE}
+
+resources:
+  - deployments/vllm-deployment.yaml
+  - deployments/redis-deployment.yaml
+  - service/redis-service.yaml
+  - deployments/secret.yaml
+
+images:
+  - name: vllm/vllm-openai
+    newName: ${VLLM_IMAGE}
+    newTag: ${VLLM_TAG}
+  - name: redis
+    newName: ${REDIS_IMAGE}
+    newTag: ${REDIS_TAG}
diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/service/redis-service.yaml
@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: ${REDIS_SVC_NAME}
+  labels:
+    app.kubernetes.io/name: redis
+    app.kubernetes.io/component: redis-lookup-server
+spec:
+  ports:
+    - name: lookupserver-port
+      protocol: TCP
+      port: ${REDIS_PORT}
+      targetPort: ${REDIS_TARGET_PORT}
+  type: ${REDIS_SERVICE_TYPE}
+  selector:
+    app.kubernetes.io/name: redis
+    app.kubernetes.io/component: redis-lookup-server