This repository was archived by the owner on May 15, 2025. It is now read-only.

Commit f95936f

small fixes for pr

Signed-off-by: Kfir Toledo <[email protected]>

1 parent a80c163, commit f95936f

File tree

15 files changed (+203, -83 lines)

DEVELOPMENT.md

Lines changed: 22 additions & 9 deletions

````diff
@@ -178,6 +178,13 @@ Export the name of the `Secret` to the environment:
 export REGISTRY_SECRET=anna-pull-secret
 ```
 
+You can optionally set a custom EPP image (otherwise, the default will be used):
+
+```console
+export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
+export EPP_TAG="<YOUR_TAG>"
+```
+
 Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy:
 
 - `vllm-sim`: Lightweight simulator for simple environments
@@ -197,18 +204,14 @@ export VLLM_SIM_TAG="<YOUR_TAG>"
 ```
 
 For vllm and vllm-p2p:
-
+- set Vllm image:
 ```console
 export VLLM_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
 export VLLM_TAG="<YOUR_TAG>"
 ```
-
-The same thing will need to be done for the EPP:
-
-```console
-export EPP_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
-export EPP_TAG="<YOUR_TAG>"
-```
+- Set hugging face token variable:
+export HF_TOKEN="<HF_TOKEN>"
+Warning: For vllm, the default image uses llama3-8b and vllm-mistral. Make sure you have permission to access these files in their respective repositories.
 
 Once all this is set up, you can deploy the environment:
 
@@ -224,12 +227,22 @@ kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80
 ```
 
 And making requests with `curl`:
-
+1. vllm-sim
 ```console
 curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
   -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
 ```
 
+2. vllm
+```console
+curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
+  -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq
+```
+3. vllm-p2p
+```console
+curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
+  -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq
+```
 #### Development Cycle
 
 > **WARNING**: This is a very manual process at the moment. We expect to make
````
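Taken together, the DEVELOPMENT.md changes amount to the setup sequence below for the vllm-p2p mode. This is a sketch: the registry, tag, and token values are placeholders, not values from this commit; the mode-to-model mapping is taken from the three `curl` examples in the diff.

```shell
# Placeholder values -- substitute your own registry, tag, and token.
export EPP_IMAGE="quay.io/example/epp"      # optional; the default is used if unset
export EPP_TAG="v0.1.0"
export VLLM_MODE="vllm-p2p"                 # one of: vllm-sim | vllm | vllm-p2p
export VLLM_IMAGE="quay.io/example/vllm"
export VLLM_TAG="latest"
export HF_TOKEN="hf_placeholder"

# Each mode serves a different default model, which is why the curl
# examples in the docs pass different "model" values.
case "$VLLM_MODE" in
  vllm-sim) MODEL="food-review" ;;
  vllm)     MODEL="meta-llama/Llama-3.1-8B-Instruct" ;;
  vllm-p2p) MODEL="mistralai/Mistral-7B-Instruct-v0.2" ;;
esac
echo "$MODEL"
```

The `MODEL` variable can then be spliced into the `curl -d '{"model":…}'` payload shown above.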

deploy/components/inference-gateway/deployments.yaml

Lines changed: 5 additions & 0 deletions

````diff
@@ -48,3 +48,8 @@ spec:
             service: inference-extension
           initialDelaySeconds: 5
           periodSeconds: 10
+        env:
+        - name: KVCACHE_INDEXER_REDIS_ADDR
+          value: ${REDIS_HOST}:${REDIS_PORT}
+        - name: HF_TOKEN
+          value: ${HF_TOKEN}
````
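The `${REDIS_HOST}:${REDIS_PORT}` and `${HF_TOKEN}` entries are template placeholders, so the manifest has to be rendered before it is applied. A minimal sketch of that substitution step, using `sed` with hypothetical values (the repository's actual tooling may use `envsubst`, kustomize, or similar instead):

```shell
# Hypothetical values for illustration only.
REDIS_HOST="redis-lookup-server"
REDIS_PORT="8100"

# Single quotes keep the ${...} placeholders literal, as they appear in the manifest.
line='value: ${REDIS_HOST}:${REDIS_PORT}'
rendered=$(printf '%s' "$line" |
  sed -e "s/\${REDIS_HOST}/$REDIS_HOST/" -e "s/\${REDIS_PORT}/$REDIS_PORT/")
echo "$rendered"   # value: redis-lookup-server:8100
```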

deploy/components/inference-gateway/inference-models.yaml

Lines changed: 31 additions & 1 deletion

````diff
@@ -6,7 +6,37 @@ spec:
   modelName: food-review
   criticality: Critical
   poolRef:
-    name: vllm-llama3-8b-instruct
+    name: ${POOL_NAME}
   targetModels:
   - name: food-review
     weight: 100
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: base-model
+spec:
+  modelName: meta-llama/Llama-3.1-8B-Instruct
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: base-model-cpu
+spec:
+  modelName: Qwen/Qwen2.5-1.5B-Instruct
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: mistarli
+spec:
+  modelName: mistralai/Mistral-7B-Instruct-v0.2
+  criticality: Critical
+  poolRef:
+    name: ${POOL_NAME}
````
Lines changed: 2 additions & 2 deletions

````diff
@@ -1,10 +1,10 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
-  name: vllm-llama3-8b-instruct
+  name: ${POOL_NAME}
 spec:
   targetPortNumber: 8000
   selector:
-    app: vllm-llama3-8b-instruct
+    app: ${POOL_NAME}
   extensionRef:
     name: endpoint-picker
````

deploy/components/vllm-p2p/deployments/redis-deployment.yaml

Lines changed: 31 additions & 3 deletions

````diff
@@ -1,7 +1,7 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: ${REDIS_NAME}
+  name: ${REDIS_SVC_NAME}
   labels:
     app.kubernetes.io/name: redis
     app.kubernetes.io/component: redis-lookup-server
@@ -20,8 +20,36 @@ spec:
       containers:
       - name: lookup-server
         image: ${REDIS_IMAGE}:${REDIS_TAG}
-        imagePullPolicy: Always
+        imagePullPolicy: IfNotPresent
         command:
         - redis-server
         ports:
-        - containerPort: ${REDIS_TARGET_PORT}
+        - name: redis-port
+          containerPort: ${REDIS_TARGET_PORT}
+          protocol: TCP
+        resources:
+          limits:
+            cpu: "4"
+            memory: 10G
+          requests:
+            cpu: "4"
+            memory: 8G
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      dnsPolicy: ClusterFirst
+      securityContext: {}
+      schedulerName: default-scheduler
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 25%
+      maxSurge: 25%
+  revisionHistoryLimit: 10
+  progressDeadlineSeconds: 600
+      # securityContext:
+      #   allowPrivilegeEscalation: false
+      #   capabilities:
+      #     drop:
+      #     - ALL
````

deploy/components/vllm-p2p/deployments/vllm-deployment.yaml

Lines changed: 77 additions & 14 deletions

````diff
@@ -13,23 +13,30 @@ spec:
       app.kubernetes.io/name: vllm
       app.kubernetes.io/component: vllm
       app.kubernetes.io/model: ${MODEL_LABEL}
+      app: ${POOL_NAME}
   template:
     metadata:
       labels:
         app.kubernetes.io/name: vllm
         app.kubernetes.io/component: vllm
         app.kubernetes.io/model: ${MODEL_LABEL}
+        app: ${POOL_NAME}
     spec:
+      # securityContext:
+      #   runAsUser: ${PROXY_UID}
+      #   runAsNonRoot: true
+      #   seccompProfile:
+      #     type: RuntimeDefault
       containers:
       - name: vllm
         image: ${VLLM_IMAGE}:${VLLM_TAG}
-        imagePullPolicy: Always
+        imagePullPolicy: IfNotPresent
         command:
         - /bin/sh
         - "-c"
         args:
         - |
-          export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 &&
+          export LMCACHE_DISTRIBUTED_URL=${${POD_IP}}:80 &&
           vllm serve ${MODEL_NAME}
           --host 0.0.0.0
           --port 8000
@@ -40,22 +47,78 @@ spec:
         ports:
         - name: http
           containerPort: 8000
-        - name: lmcache-dist
+          protocol: TCP
+        - name: lmcache-dist # Assuming port 80 is used for LMCACHE_DISTRIBUTED_URL
           containerPort: 80
+          protocol: TCP
+        livenessProbe:
+          failureThreshold: 3
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 15
+          periodSeconds: 10
+          successThreshold: 1
+          timeoutSeconds: 1
+        startupProbe:
+          failureThreshold: 60
+          httpGet:
+            path: /health
+            port: 8000
+            scheme: HTTP
+          initialDelaySeconds: 15
+          periodSeconds: 10
+          successThreshold: 1
+          timeoutSeconds: 1
         env:
+        - name: HF_HOME
+          value: /data
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: status.podIP
         - name: HF_TOKEN
           valueFrom:
             secretKeyRef:
               name: ${HF_SECRET_NAME}
               key: ${HF_SECRET_KEY}
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: status.podIP
-        volumeMounts:
-        - name: model-storage
-          mountPath: ${VOLUME_MOUNT_PATH}
-      volumes:
-      - name: model-storage
-        persistentVolumeClaim:
-          claimName: ${PVC_NAME}
+        - name: LMCACHE_LOOKUP_URL
+          value: ${REDIS_HOST}:${REDIS_PORT}
+        - name: LMCACHE_ENABLE_DEBUG
+          value: "True"
+        - name: LMCACHE_ENABLE_P2P
+          value: "True"
+        - name: LMCACHE_LOCAL_CPU
+          value: "True"
+        - name: LMCACHE_MAX_LOCAL_CPU_SIZE
+          value: "20"
+        - name: LMCACHE_USE_EXPERIMENTAL
+          value: "True"
+        - name: VLLM_RPC_TIMEOUT
+          value: "1000000"
+        resources:
+          limits:
+            nvidia.com/gpu: "1"
+          requests:
+            cpu: "10"
+            memory: 40Gi
+            nvidia.com/gpu: "1"
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+        securityContext:
+          runAsNonRoot: false
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 30
+      dnsPolicy: ClusterFirst
+      securityContext: {}
+      schedulerName: default-scheduler
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 0
+      maxSurge: "100%"
+  revisionHistoryLimit: 10
+  progressDeadlineSeconds: 1200
+
````
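The new `startupProbe` in this manifest gives the vLLM server a long window to come up before the liveness probe can restart it: roughly `initialDelaySeconds + failureThreshold * periodSeconds`. With the values committed here:

```shell
# Probe values from the vllm-deployment.yaml diff above.
initialDelaySeconds=15
periodSeconds=10
failureThreshold=60

# Worst-case seconds before the startup probe gives up and the pod is restarted.
budget=$((initialDelaySeconds + failureThreshold * periodSeconds))
echo "${budget}s"   # 615s
```

That roughly 10-minute startup budget is consistent with the raised `progressDeadlineSeconds: 1200` on the Deployment.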
deploy/components/vllm-p2p/kustomization.yaml

Lines changed: 0 additions & 12 deletions

````diff
@@ -7,7 +7,6 @@ resources:
 - deployments/vllm-deployment.yaml
 - deployments/redis-deployment.yaml
 - service/redis-service.yaml
-- pvc/volume.yaml
 - deployments/secret.yaml
 
 images:
@@ -17,14 +16,3 @@ images:
 - name: redis
   newName: ${REDIS_IMAGE}
   newTag: ${REDIS_TAG}
-
-configMapGenerator:
-- name: model-config
-  literals:
-  - MODEL_NAME=${MODEL_NAME}
-  - MODEL_LABEL=${MODEL_LABEL}
-  - POOL_LABEL=${POOL_LABEL}
-  - REDIS_ENABLED=${REDIS_ENABLED}
-
-generatorOptions:
-  disableNameSuffixHash: true
````

deploy/components/vllm-p2p/pvc/volume.yaml

Lines changed: 0 additions & 18 deletions
This file was deleted.

deploy/components/vllm-p2p/service/redis-service.yaml

Lines changed: 1 addition & 2 deletions

````diff
@@ -1,8 +1,7 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: ${REDIS_NAME}
-  namespace: ${NAMESPACE}
+  name: ${REDIS_SVC_NAME}
   labels:
     app.kubernetes.io/name: redis
     app.kubernetes.io/component: redis-lookup-server
````

deploy/components/vllm/deployments.yaml

Lines changed: 3 additions & 3 deletions

````diff
@@ -3,14 +3,14 @@ kind: Deployment
 metadata:
   name: ${VLLM_DEPLOYMENT_NAME}
 spec:
-  replicas: 3
+  replicas: ${VLLM_REPLICA_COUNT}
   selector:
     matchLabels:
-      app: vllm-llama3-8b-instruct
+      app: ${POOL_NAME}
   template:
     metadata:
       labels:
-        app: vllm-llama3-8b-instruct
+        app: ${POOL_NAME}
     spec:
       securityContext:
         runAsUser: ${PROXY_UID}
````

deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml

Lines changed: 3 additions & 3 deletions

````diff
@@ -3,20 +3,20 @@ kind: GatewayParameters
 metadata:
   name: custom-gw-params
 spec:
-  kube:
+  kube:
     envoyContainer:
       securityContext:
         allowPrivilegeEscalation: false
         readOnlyRootFilesystem: true
         runAsNonRoot: true
         runAsUser: "${PROXY_UID}"
     service:
-      type: NodePort
+      type: LoadBalancer
       extraLabels:
         gateway: custom
     podTemplate:
       extraLabels:
         gateway: custom
-      securityContext:
+      securityContext:
         seccompProfile:
           type: RuntimeDefault
````
