From bf8017fbcff0c173c69eebb69149a0d1001da56f Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Fri, 25 Apr 2025 08:13:20 +0300 Subject: [PATCH 1/5] feat: add scripts for kubernetes dev env using vLLM and vLLM-p2p (setup for kvcache-aware) Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 23 ++- Makefile | 7 +- .../deployments/redis-deployment.yaml | 27 ++++ .../vllm-p2p/deployments/secret.yaml | 11 ++ .../vllm-p2p/deployments/vllm-deployment.yaml | 61 ++++++++ deploy/components/vllm-p2p/kustomization.yaml | 30 ++++ deploy/components/vllm-p2p/pvc/volume.yaml | 18 +++ .../vllm-p2p/service/redis-service.yaml | 18 +++ deploy/components/vllm/deployments.yaml | 143 ++++++++++++++++++ deploy/components/vllm/kustomization.yaml | 30 ++++ deploy/components/vllm/secret.yaml | 11 ++ .../kubernetes-kgateway/kustomization.yaml | 6 +- .../patch-deployments.yaml | 10 -- .../vllm-p2p/kustomization.yaml | 11 ++ .../vllm-sim/kustomization.yaml | 14 ++ .../vllm-sim/patch-deployments.yaml | 10 ++ .../kubernetes-vllm/vllm/kustomization.yaml | 11 ++ .../vllm/patch-deployments.yaml | 9 ++ scripts/kubernetes-dev-env.sh | 120 ++++++++++++--- 19 files changed, 525 insertions(+), 45 deletions(-) create mode 100644 deploy/components/vllm-p2p/deployments/redis-deployment.yaml create mode 100644 deploy/components/vllm-p2p/deployments/secret.yaml create mode 100644 deploy/components/vllm-p2p/deployments/vllm-deployment.yaml create mode 100644 deploy/components/vllm-p2p/kustomization.yaml create mode 100644 deploy/components/vllm-p2p/pvc/volume.yaml create mode 100644 deploy/components/vllm-p2p/service/redis-service.yaml create mode 100644 deploy/components/vllm/deployments.yaml create mode 100644 deploy/components/vllm/kustomization.yaml create mode 100644 deploy/components/vllm/secret.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 8b6cf443..f7a5af69 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -178,14 +178,31 @@ Export the name of the `Secret` to the environment: export REGISTRY_SECRET=anna-pull-secret ``` -Now you need to provide several other environment variables. You'll need to -indicate the location and tag of the `vllm-sim` image: +Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: + +- `vllm-sim`: Lightweight simulator for simple environments +- `vllm`: Full vLLM model server for real inference +- `vllm-p2p`: Full vLLM with LMCache P2P support for distributed KV caching + +```console +export VLLM_MODE=vllm-sim # or vllm / vllm-p2p +``` +Each mode has default image values, but you can override them: + +For vllm-sim: ```console export VLLM_SIM_IMAGE="/" export VLLM_SIM_TAG="" ``` +For vllm and vllm-p2p: + +```console +export VLLM_IMAGE="/" +export VLLM_TAG="" +``` + The same thing will need to be done for the EPP: ```console @@ -203,7 +220,7 @@ This will deploy the entire stack to whatever namespace you chose. 
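+For example, a complete `vllm-sim` setup might look like this (the namespace is
+illustrative; the image, tag, and EPP values shown are the script defaults):
+
+```console
+export NAMESPACE=my-dev-env
+export REGISTRY_SECRET=anna-pull-secret
+export VLLM_MODE=vllm-sim
+export VLLM_SIM_IMAGE="quay.io/vllm-d/vllm-sim"
+export VLLM_SIM_TAG="0.0.2"
+export EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp"
+export EPP_TAG="main"
+make environment.dev.kubernetes
+```
+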
You can test by exposing the inference `Gateway` via port-forward: ```console -kubectl -n ${NAMESPACE} port-forward service/inference-gateway-istio 8080:80 +kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80 ``` And making requests with `curl`: diff --git a/Makefile b/Makefile index 641d6cf6..cfaa72cb 100644 --- a/Makefile +++ b/Makefile @@ -784,11 +784,8 @@ environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst # ------------------------------------------------------------------------------ .PHONY: clean.environment.dev.kubernetes clean.environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst -ifndef NAMESPACE - $(error "Error: NAMESPACE is required but not set") -endif - @echo "INFO: cleaning up dev environment in $(NAMESPACE)" - kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete -f - + @CLEAN=true ./scripts/kubernetes-dev-env.sh 2>&1 + @echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)" # ----------------------------------------------------------------------------- # TODO: these are old aliases that we still need for the moment, but will be diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml new file mode 100644 index 00000000..16b61d1f --- /dev/null +++ b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${REDIS_NAME} + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server +spec: + replicas: ${REDIS_REPLICA_COUNT} + selector: + matchLabels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server + template: + metadata: + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server + spec: + containers: + - name: lookup-server + image: ${REDIS_IMAGE}:${REDIS_TAG} + imagePullPolicy: Always + command: + - redis-server + ports: + - containerPort: ${REDIS_TARGET_PORT} diff --git a/deploy/components/vllm-p2p/deployments/secret.yaml b/deploy/components/vllm-p2p/deployments/secret.yaml new file mode 100644 index 00000000..1f5a2bcc --- /dev/null +++ b/deploy/components/vllm-p2p/deployments/secret.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ${HF_SECRET_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: secret +type: Opaque +data: + ${HF_SECRET_KEY}: ${HF_TOKEN} diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml new file mode 100644 index 00000000..b825f358 --- /dev/null +++ b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/model: ${MODEL_LABEL} + app.kubernetes.io/component: vllm +spec: + replicas: ${VLLM_REPLICA_COUNT} + selector: + matchLabels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: vllm + app.kubernetes.io/model: ${MODEL_LABEL} + template: + metadata: + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: vllm + app.kubernetes.io/model: ${MODEL_LABEL} + spec: + containers: + - name: vllm + image: ${VLLM_IMAGE}:${VLLM_TAG} + imagePullPolicy: Always + command: + - /bin/sh + - "-c" + args: + - | + export 
LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 && + vllm serve ${MODEL_NAME} + --host 0.0.0.0 + --port 8000 + --enable-chunked-prefill false + --max-model-len ${MAX_MODEL_LEN} + --kv-transfer-config + '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + ports: + - name: http + containerPort: 8000 + - name: lmcache-dist + containerPort: 80 + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: ${HF_SECRET_NAME} + key: ${HF_SECRET_KEY} + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + volumeMounts: + - name: model-storage + mountPath: ${VOLUME_MOUNT_PATH} + volumes: + - name: model-storage + persistentVolumeClaim: + claimName: ${PVC_NAME} diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml new file mode 100644 index 00000000..3f7d6014 --- /dev/null +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -0,0 +1,30 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: ${NAMESPACE} + +resources: + - deployments/vllm-deployment.yaml + - deployments/redis-deployment.yaml + - service/redis-service.yaml + - pvc/volume.yaml + - deployments/secret.yaml + +images: + - name: vllm/vllm-openai + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} + - name: redis + newName: ${REDIS_IMAGE} + newTag: ${REDIS_TAG} + +configMapGenerator: + - name: model-config + literals: + - MODEL_NAME=${MODEL_NAME} + - MODEL_LABEL=${MODEL_LABEL} + - POOL_LABEL=${POOL_LABEL} + - REDIS_ENABLED=${REDIS_ENABLED} + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/deploy/components/vllm-p2p/pvc/volume.yaml b/deploy/components/vllm-p2p/pvc/volume.yaml new file mode 100644 index 00000000..6a0a042c --- /dev/null +++ b/deploy/components/vllm-p2p/pvc/volume.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ${PVC_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: storage + app.kubernetes.io/model: ${MODEL_LABEL} + finalizers: + - kubernetes.io/pvc-protection +spec: + accessModes: + - ${PVC_ACCESS_MODE} + resources: + requests: + storage: ${PVC_SIZE} + storageClassName: ${PVC_STORAGE_CLASS} diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/service/redis-service.yaml new file mode 100644 index 00000000..947f87ac --- /dev/null +++ b/deploy/components/vllm-p2p/service/redis-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: ${REDIS_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server +spec: + ports: + - name: lookupserver-port + protocol: TCP + port: ${REDIS_PORT} + targetPort: ${REDIS_TARGET_PORT} + type: ${REDIS_SERVICE_TYPE} + selector: + app.kubernetes.io/name: redis + app.kubernetes.io/component: redis-lookup-server diff --git a/deploy/components/vllm/deployments.yaml b/deploy/components/vllm/deployments.yaml new file mode 100644 index 00000000..19a398b1 --- /dev/null +++ b/deploy/components/vllm/deployments.yaml @@ -0,0 +1,143 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} +spec: + replicas: 3 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + securityContext: + runAsUser: ${PROXY_UID} + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: vllm + image: "vllm/vllm-openai:latest" + imagePullPolicy: IfNotPresent + 
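+        # Runs vLLM's OpenAI-compatible API server directly; the args below
+        # select the model, tensor parallelism, and port, and size the LoRA
+        # adapter cache consumed by the lora-adapter-syncer container.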
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args: + - "--model" + - "meta-llama/Llama-3.1-8B-Instruct" + - "--tensor-parallel-size" + - "1" + - "--port" + - "8000" + - "--max-num-seq" + - "1024" + - "--compilation-config" + - "3" + - "--enable-lora" + - "--max-loras" + - "2" + - "--max-lora-rank" + - "8" + - "--max-cpu-loras" + - "12" + env: + - name: VLLM_USE_V1 + value: "1" + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" + - name: XDG_CACHE_HOME + value: /cache + - name: HF_HOME + value: /cache/huggingface + - name: FLASHINFER_CACHE_DIR + value: /cache/flashinfer + ports: + - containerPort: 8000 + name: http + protocol: TCP + lifecycle: + preStop: + sleep: + seconds: 30 + livenessProbe: + httpGet: + path: /health + port: http + scheme: HTTP + periodSeconds: 1 + successThreshold: 1 + failureThreshold: 5 + timeoutSeconds: 1 + readinessProbe: + httpGet: + path: /health + port: http + scheme: HTTP + periodSeconds: 1 + successThreshold: 1 + failureThreshold: 1 + timeoutSeconds: 1 + startupProbe: + httpGet: + path: /health + port: http + scheme: HTTP + failureThreshold: 600 + initialDelaySeconds: 2 + periodSeconds: 1 + resources: + limits: + nvidia.com/gpu: 1 + requests: + nvidia.com/gpu: 1 + volumeMounts: + - mountPath: /cache + name: hf-cache + - mountPath: /dev/shm + name: shm + - mountPath: /adapters + name: adapters + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + initContainers: + - name: lora-adapter-syncer + tty: true + stdin: true + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main + restartPolicy: Always + imagePullPolicy: Always + env: + - name: DYNAMIC_LORA_ROLLOUT_CONFIG + value: "/config/configmap.yaml" + volumeMounts: + - name: config-volume + mountPath: /config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + restartPolicy: Always + enableServiceLinks: false + terminationGracePeriodSeconds: 130 + volumes: + - name: hf-cache + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: vllm-llama3-8b-instruct-adapters diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml new file mode 100644 index 00000000..ee4a4e7d --- /dev/null +++ b/deploy/components/vllm/kustomization.yaml @@ -0,0 +1,30 @@ +# ------------------------------------------------------------------------------ +# vLLM Deployment +# +# This deploys the full vLLM model server, capable of serving real models such +# as Llama 3.1-8B-Instruct via the OpenAI-compatible API. It is intended for +# environments with GPU resources and where full inference capabilities are +# required. +# +# The deployment can be customized using environment variables to set: +# - The container image and tag (VLLM_IMAGE, VLLM_TAG) +# - The model to load (MODEL_NAME) +# +# This setup is suitable for testing and production with Kubernetes (including +# GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`). 
+# ----------------------------------------------------------------------------- +kind: Kustomization + +resources: +- deployments.yaml +- secret.yaml + +images: +- name: vllm/vllm-openai + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} + +configMapGenerator: +- name: vllm-model-config + literals: + - MODEL_NAME=${MODEL_NAME} \ No newline at end of file diff --git a/deploy/components/vllm/secret.yaml b/deploy/components/vllm/secret.yaml new file mode 100644 index 00000000..1f5a2bcc --- /dev/null +++ b/deploy/components/vllm/secret.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ${HF_SECRET_NAME} + namespace: ${NAMESPACE} + labels: + app.kubernetes.io/name: vllm + app.kubernetes.io/component: secret +type: Opaque +data: + ${HF_SECRET_KEY}: ${HF_TOKEN} diff --git a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml index 0b7e1ed8..7dc295de 100644 --- a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml @@ -4,18 +4,14 @@ kind: Kustomization namespace: ${NAMESPACE} resources: -- ../../../components/vllm-sim/ - ../../../components/inference-gateway/ - gateway-parameters.yaml images: -- name: quay.io/vllm-d/vllm-sim - newName: ${VLLM_SIM_IMAGE} - newTag: ${VLLM_SIM_TAG} - name: quay.io/vllm-d/gateway-api-inference-extension/epp newName: ${EPP_IMAGE} newTag: ${EPP_TAG} patches: - path: patch-deployments.yaml -- path: patch-gateways.yaml +- path: patch-gateways.yaml \ No newline at end of file diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index 20a17d53..0e4ad46e 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -22,13 +22,3 @@ spec: - "9002" - -grpcHealthPort - "9003" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm-sim -spec: - template: - spec: - imagePullSecrets: - - name: ${REGISTRY_SECRET} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml new file mode 100644 index 00000000..a81c387e --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../../../components/vllm-p2p/ + +images: +- name: quay.io/vllm-d/vllm-d-dev:0.0.2 + newName: ${VLLM_P2P_IMAGE} + newTag: ${VLLM_P2P_TAG} + diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml new file mode 100644 index 00000000..921b9ef5 --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../../../components/vllm-sim/ + +images: +- name: quay.io/vllm-d/vllm-sim + newName: ${VLLM_SIM_IMAGE} + newTag: ${VLLM_SIM_TAG} + +patches: + - path: patch-deployments.yaml + diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml new file mode 100644 index 00000000..d86d712c --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml @@ -0,0 +1,10 @@ + +apiVersion: apps/v1 
+kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} +spec: + template: + spec: + imagePullSecrets: + - name: ${REGISTRY_SECRET} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml new file mode 100644 index 00000000..81fa76ba --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../../../components/vllm/ + +images: +- name: quay.io/vllm-d/vllm-d-dev:0.0.2 + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} + diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml new file mode 100644 index 00000000..efaa2211 --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm +spec: + template: + spec: + imagePullSecrets: + - name: ${REGISTRY_SECRET} diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 28b84409..98c87492 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -12,18 +12,77 @@ set -eux # ------------------------------------------------------------------------------ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Set a default VLLM_SIM_IMAGE if not provided -: "${VLLM_SIM_IMAGE:=quay.io/vllm-d/vllm-sim}" - -# Set a default VLLM_SIM_TAG if not provided -: "${VLLM_SIM_TAG:=0.0.2}" - -# Set a default EPP_IMAGE if not provided -: "${EPP_IMAGE:=us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" - -# Set a default EPP_TAG if not provided -: "${EPP_TAG:=main}" +export CLEAN="${CLEAN:-false}" + +# Validate required inputs +if [[ -z "${NAMESPACE:-}" ]]; then + echo "ERROR: NAMESPACE environment variable is not set." + exit 1 +fi +if [[ -z "${VLLM_MODE:-}" ]]; then + echo "ERROR: VLLM_MODE is not set. 
Please export one of: vllm-sim, vllm, vllm-p2p" + exit 1 +fi + +# vLLM Specific Configuration node + +case "${VLLM_MODE}" in + vllm-sim) + export VLLM_SIM_IMAGE="${VLLM_SIM_IMAGE:-quay.io/vllm-d/vllm-sim}" + export VLLM_SIM_TAG="${VLLM_SIM_TAG:-0.0.2}" + export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" + export EPP_TAG="${EPP_TAG:-main}" + ;; + vllm | vllm-p2p) + # Shared across both full model modes - // TODO - make more env variables similar + # TODO: Consider unifying more environment variables for consistency and reuse + export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" + export HF_TOKEN=$(echo -n "${HF_TOKEN:-}" | base64 | tr -d '\n') + export VOLUME_MOUNT_PATH="${VOLUME_MOUNT_PATH:-/data}" + + if [[ "$VLLM_MODE" == "vllm" ]]; then + export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/vllm-d/vllm-d-dev}" + export VLLM_TAG="${VLLM_TAG:-0.0.2}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-llama3-8b-instruct}" + export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" + export EPP_TAG="${EPP_TAG:-main}" + export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" + export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" + export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" + export HF_TOKEN="${HF_TOKEN:-}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-2}" + export MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" + export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" + + elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then + export VLLM_IMAGE="${VLLM_IMAGE:-vllm/vllm-openai}" + export VLLM_TAG="${VLLM_TAG:-latest}" + export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" + export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" + export HF_SECRET_KEY="${HF_SECRET_KEY:-${MODEL_LABEL}}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-4}" + export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" + export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}" + export PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}" + export PVC_SIZE="${PVC_SIZE:-10Gi}" + export PVC_STORAGE_CLASS="${PVC_STORAGE_CLASS:-standard}" + export REDIS_NAME="${REDIS_NAME:-vllm-redis}" + export REDIS_IMAGE="${REDIS_IMAGE:-redis}" + export REDIS_TAG="${REDIS_TAG:-7.2.3}" + export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" + export REDIS_PORT="${REDIS_PORT:-6379}" + export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" + export REDIS_SERVICE_TYPE="${REDIS_SERVICE_TYPE:-ClusterIP}" + export POOL_LABEL="${POOL_LABEL:-vllm-llama3-8b-instruct}" + export REDIS_ENABLED="${REDIS_ENABLED:-true}" + fi + ;; + *) + echo "ERROR: Unsupported VLLM_MODE: ${VLLM_MODE}. 
Must be one of: vllm-sim, vllm, vllm-p2p"
+    exit 1
+    ;;
+esac
 
 # ------------------------------------------------------------------------------
 # Deployment
 # ------------------------------------------------------------------------------
@@ -32,18 +91,35 @@
 kubectl create namespace ${NAMESPACE} 2>/dev/null || true
 
 # Hack to deal with KGateways broken OpenShift support
-export PROXY_UID=$(kubectl get namespace ${NAMESPACE} -o json | jq -e -r '.metadata.annotations["openshift.io/sa.scc.uid-range"]' | perl -F'/' -lane 'print $F[0]+1');
+export PROXY_UID=$(kubectl get namespace ${NAMESPACE} -o json | jq -e -r '.metadata.annotations["openshift.io/sa.scc.uid-range"]' | perl -F'/' -lane 'print $F[0]+1');
+set -o pipefail
 
-echo "INFO: Deploying Development Environment in namespace ${NAMESPACE}"
-
-kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n ${NAMESPACE} apply -f -
-
-echo "INFO: Waiting for resources in namespace ${NAMESPACE} to become ready"
+if [[ "$CLEAN" == "true" ]]; then
+  echo "INFO: Cleaning environment in namespace ${NAMESPACE} for mode ${VLLM_MODE}"
+  kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f -
+  kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f -
+else
+  echo "INFO: Deploying Development Environment in namespace ${NAMESPACE}"
+  kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" apply -f -
+  kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" apply -f -
+  echo "INFO: Waiting for resources in namespace ${NAMESPACE} to become ready"
+  kubectl -n "${NAMESPACE}" wait deployment/endpoint-picker --for=condition=Available --timeout=60s
+  kubectl -n "${NAMESPACE}" wait gateway/inference-gateway --for=condition=Programmed --timeout=60s
+  kubectl -n "${NAMESPACE}" wait deployment/inference-gateway --for=condition=Available --timeout=60s
+  # Mode-specific wait
+  case "${VLLM_MODE}" in
+    vllm-sim)
+      kubectl -n "${NAMESPACE}" wait deployment/vllm-sim --for=condition=Available --timeout=60s
+      ;;
+    vllm)
+      kubectl -n "${NAMESPACE}" wait deployment/vllm-llama3-8b-instruct --for=condition=Available --timeout=180s
+      ;;
+    vllm-p2p)
+      kubectl -n "${NAMESPACE}" wait deployment/vllm-mistral7b --for=condition=Available --timeout=180s
+      kubectl -n "${NAMESPACE}" wait deployment/${REDIS_NAME} --for=condition=Available --timeout=60s
+      ;;
+  esac
+fi
 
-kubectl -n ${NAMESPACE} wait deployment/endpoint-picker --for=condition=Available --timeout=60s
-kubectl -n ${NAMESPACE} wait deployment/vllm-sim --for=condition=Available --timeout=60s
-kubectl -n ${NAMESPACE} wait gateway/inference-gateway --for=condition=Programmed --timeout=60s
-kubectl -n ${NAMESPACE} wait deployment/inference-gateway --for=condition=Available --timeout=60s

From 78157d52f1a7a04e198ccc535b681a134225fc5d Mon Sep 17 00:00:00 2001
From: Kfir Toledo 
Date: Fri, 25 Apr 2025 15:02:51 +0300
Subject: [PATCH 2/5] [fix]: Small fixes for development YAMLs

Signed-off-by: Kfir Toledo 
---
 DEVELOPMENT.md                                | 42 ++++---
 .../inference-gateway/deployments.yaml        |  8 ++
 .../inference-gateway/inference-models.yaml   | 32 +++++-
 .../inference-gateway/inference-pools.yaml    |  4 +-
 .../deployments/redis-deployment.yaml         | 34 +++++-
 .../vllm-p2p/deployments/vllm-deployment.yaml | 104 ++++++++++++++----
 deploy/components/vllm-p2p/kustomization.yaml | 12 --
deploy/components/vllm-p2p/pvc/volume.yaml | 18 --- .../vllm-p2p/service/redis-service.yaml | 3 +- deploy/components/vllm/configmap.yaml | 14 +++ deploy/components/vllm/deployments.yaml | 6 +- deploy/components/vllm/kustomization.yaml | 2 + .../gateway-parameters.yaml | 6 +- .../vllm-p2p/kustomization.yaml | 4 +- .../vllm-sim/kustomization.yaml | 1 - .../vllm-sim/patch-deployments.yaml | 2 +- scripts/kubernetes-dev-env.sh | 39 ++++--- 17 files changed, 235 insertions(+), 96 deletions(-) delete mode 100644 deploy/components/vllm-p2p/pvc/volume.yaml create mode 100644 deploy/components/vllm/configmap.yaml diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index f7a5af69..9d555a5c 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -178,11 +178,18 @@ Export the name of the `Secret` to the environment: export REGISTRY_SECRET=anna-pull-secret ``` +You can optionally set a custom EPP image (otherwise, the default will be used): + +```console +export EPP_IMAGE="/" +export EPP_TAG="" +``` + Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: - `vllm-sim`: Lightweight simulator for simple environments - `vllm`: Full vLLM model server for real inference -- `vllm-p2p`: Full vLLM with LMCache P2P support for distributed KV caching +- `vllm-p2p`: Full vLLM with LMCache P2P support for enable KV-Cache aware routing ```console export VLLM_MODE=vllm-sim # or vllm / vllm-p2p @@ -197,18 +204,14 @@ export VLLM_SIM_TAG="" ``` For vllm and vllm-p2p: - +- set Vllm image: ```console export VLLM_IMAGE="/" export VLLM_TAG="" ``` - -The same thing will need to be done for the EPP: - -```console -export EPP_IMAGE="/" -export EPP_TAG="" -``` +- Set hugging face token variable: + export HF_TOKEN="" +**Warning**: For vllm mode, the default image uses llama3-8b and vllm-mistral. Make sure you have permission to access these files in their respective repositories. Once all this is set up, you can deploy the environment: @@ -224,12 +227,25 @@ kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80 ``` And making requests with `curl`: +- vllm-sim -```console -curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ - -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq -``` + ```console + curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq + ``` + +- vllm + + ```console + curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq + ``` +- vllm-p2p + ```console + curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq + ``` #### Development Cycle > **WARNING**: This is a very manual process at the moment. 
We expect to make diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml index 0fc19d4d..f523f812 100644 --- a/deploy/components/inference-gateway/deployments.yaml +++ b/deploy/components/inference-gateway/deployments.yaml @@ -48,3 +48,11 @@ spec: service: inference-extension initialDelaySeconds: 5 periodSeconds: 10 + env: + - name: KVCACHE_INDEXER_REDIS_ADDR + value: ${REDIS_HOST}:${REDIS_PORT} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: ${HF_SECRET_NAME} + key: ${HF_SECRET_KEY} \ No newline at end of file diff --git a/deploy/components/inference-gateway/inference-models.yaml b/deploy/components/inference-gateway/inference-models.yaml index 12a51394..330f19a9 100644 --- a/deploy/components/inference-gateway/inference-models.yaml +++ b/deploy/components/inference-gateway/inference-models.yaml @@ -6,7 +6,37 @@ spec: modelName: food-review criticality: Critical poolRef: - name: vllm-llama3-8b-instruct + name: ${POOL_NAME} targetModels: - name: food-review weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: base-model +spec: + modelName: meta-llama/Llama-3.1-8B-Instruct + criticality: Critical + poolRef: + name: ${POOL_NAME} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: base-model-cpu +spec: + modelName: Qwen/Qwen2.5-1.5B-Instruct + criticality: Critical + poolRef: + name: ${POOL_NAME} +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: mistarli +spec: + modelName: mistralai/Mistral-7B-Instruct-v0.2 + criticality: Critical + poolRef: + name: ${POOL_NAME} \ No newline at end of file diff --git a/deploy/components/inference-gateway/inference-pools.yaml b/deploy/components/inference-gateway/inference-pools.yaml index ece6e500..3a981a14 100644 --- a/deploy/components/inference-gateway/inference-pools.yaml +++ b/deploy/components/inference-gateway/inference-pools.yaml @@ -1,10 +1,10 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: - name: vllm-llama3-8b-instruct + name: ${POOL_NAME} spec: targetPortNumber: 8000 selector: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} extensionRef: name: endpoint-picker diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml index 16b61d1f..f4b5938e 100644 --- a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml +++ b/deploy/components/vllm-p2p/deployments/redis-deployment.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: ${REDIS_NAME} + name: ${REDIS_SVC_NAME} labels: app.kubernetes.io/name: redis app.kubernetes.io/component: redis-lookup-server @@ -20,8 +20,36 @@ spec: containers: - name: lookup-server image: ${REDIS_IMAGE}:${REDIS_TAG} - imagePullPolicy: Always + imagePullPolicy: IfNotPresent command: - redis-server ports: - - containerPort: ${REDIS_TARGET_PORT} + - name: redis-port + containerPort: ${REDIS_TARGET_PORT} + protocol: TCP + resources: + limits: + cpu: "4" + memory: 10G + requests: + cpu: "4" + memory: 8G + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + restartPolicy: Always + terminationGracePeriodSeconds: 30 + dnsPolicy: ClusterFirst + securityContext: {} + schedulerName: default-scheduler + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 25% + maxSurge: 25% + revisionHistoryLimit: 10 + progressDeadlineSeconds: 
600 + # securityContext: + # allowPrivilegeEscalation: false + # capabilities: + # drop: + # - ALL \ No newline at end of file diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml index b825f358..1c296eff 100644 --- a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml +++ b/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml @@ -13,49 +13,111 @@ spec: app.kubernetes.io/name: vllm app.kubernetes.io/component: vllm app.kubernetes.io/model: ${MODEL_LABEL} + app: ${POOL_NAME} template: metadata: labels: app.kubernetes.io/name: vllm app.kubernetes.io/component: vllm app.kubernetes.io/model: ${MODEL_LABEL} + app: ${POOL_NAME} spec: + # securityContext: + # runAsUser: ${PROXY_UID} + # runAsNonRoot: true + # seccompProfile: + # type: RuntimeDefault containers: - name: vllm image: ${VLLM_IMAGE}:${VLLM_TAG} - imagePullPolicy: Always + imagePullPolicy: IfNotPresent command: - /bin/sh - "-c" args: - | - export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 && - vllm serve ${MODEL_NAME} - --host 0.0.0.0 - --port 8000 - --enable-chunked-prefill false - --max-model-len ${MAX_MODEL_LEN} - --kv-transfer-config - '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + export LMCACHE_DISTRIBUTED_URL=$${${POD_IP}}:80 && \ + vllm serve ${MODEL_NAME} \ + --host 0.0.0.0 \ + --port 8000 \ + --enable-chunked-prefill false \ + --max-model-len ${MAX_MODEL_LEN} \ + --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' ports: - name: http containerPort: 8000 - - name: lmcache-dist + protocol: TCP + - name: lmcache-dist # Assuming port 80 is used for LMCACHE_DISTRIBUTED_URL containerPort: 80 + protocol: TCP + livenessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + startupProbe: + failureThreshold: 60 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 env: + - name: HF_HOME + value: /data + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP - name: HF_TOKEN valueFrom: secretKeyRef: name: ${HF_SECRET_NAME} key: ${HF_SECRET_KEY} - - name: POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - volumeMounts: - - name: model-storage - mountPath: ${VOLUME_MOUNT_PATH} - volumes: - - name: model-storage - persistentVolumeClaim: - claimName: ${PVC_NAME} + - name: LMCACHE_LOOKUP_URL + value: ${REDIS_HOST}:${REDIS_PORT} + - name: LMCACHE_ENABLE_DEBUG + value: "True" + - name: LMCACHE_ENABLE_P2P + value: "True" + - name: LMCACHE_LOCAL_CPU + value: "True" + - name: LMCACHE_MAX_LOCAL_CPU_SIZE + value: "20" + - name: LMCACHE_USE_EXPERIMENTAL + value: "True" + - name: VLLM_RPC_TIMEOUT + value: "1000000" + resources: + limits: + nvidia.com/gpu: "1" + requests: + cpu: "10" + memory: 40Gi + nvidia.com/gpu: "1" + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + securityContext: + runAsNonRoot: false + restartPolicy: Always + terminationGracePeriodSeconds: 30 + dnsPolicy: ClusterFirst + securityContext: {} + schedulerName: default-scheduler + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: "100%" + revisionHistoryLimit: 10 + progressDeadlineSeconds: 1200 + diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml index 3f7d6014..c1e3172a 100644 --- 
a/deploy/components/vllm-p2p/kustomization.yaml +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -7,7 +7,6 @@ resources: - deployments/vllm-deployment.yaml - deployments/redis-deployment.yaml - service/redis-service.yaml - - pvc/volume.yaml - deployments/secret.yaml images: @@ -17,14 +16,3 @@ images: - name: redis newName: ${REDIS_IMAGE} newTag: ${REDIS_TAG} - -configMapGenerator: - - name: model-config - literals: - - MODEL_NAME=${MODEL_NAME} - - MODEL_LABEL=${MODEL_LABEL} - - POOL_LABEL=${POOL_LABEL} - - REDIS_ENABLED=${REDIS_ENABLED} - -generatorOptions: - disableNameSuffixHash: true \ No newline at end of file diff --git a/deploy/components/vllm-p2p/pvc/volume.yaml b/deploy/components/vllm-p2p/pvc/volume.yaml deleted file mode 100644 index 6a0a042c..00000000 --- a/deploy/components/vllm-p2p/pvc/volume.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: ${PVC_NAME} - namespace: ${NAMESPACE} - labels: - app.kubernetes.io/name: vllm - app.kubernetes.io/component: storage - app.kubernetes.io/model: ${MODEL_LABEL} - finalizers: - - kubernetes.io/pvc-protection -spec: - accessModes: - - ${PVC_ACCESS_MODE} - resources: - requests: - storage: ${PVC_SIZE} - storageClassName: ${PVC_STORAGE_CLASS} diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/service/redis-service.yaml index 947f87ac..a5d5fd00 100644 --- a/deploy/components/vllm-p2p/service/redis-service.yaml +++ b/deploy/components/vllm-p2p/service/redis-service.yaml @@ -1,8 +1,7 @@ apiVersion: v1 kind: Service metadata: - name: ${REDIS_NAME} - namespace: ${NAMESPACE} + name: ${REDIS_SVC_NAME} labels: app.kubernetes.io/name: redis app.kubernetes.io/component: redis-lookup-server diff --git a/deploy/components/vllm/configmap.yaml b/deploy/components/vllm/configmap.yaml new file mode 100644 index 00000000..1a4f8903 --- /dev/null +++ b/deploy/components/vllm/configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3-8b-instruct-adapters + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: food-review-1 + source: Kawon/llama3.1-food-finetune_v14_r8 diff --git a/deploy/components/vllm/deployments.yaml b/deploy/components/vllm/deployments.yaml index 19a398b1..2d7a63d5 100644 --- a/deploy/components/vllm/deployments.yaml +++ b/deploy/components/vllm/deployments.yaml @@ -3,14 +3,14 @@ kind: Deployment metadata: name: ${VLLM_DEPLOYMENT_NAME} spec: - replicas: 3 + replicas: ${VLLM_REPLICA_COUNT} selector: matchLabels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} template: metadata: labels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} spec: securityContext: runAsUser: ${PROXY_UID} diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index ee4a4e7d..93813639 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -18,6 +18,8 @@ kind: Kustomization resources: - deployments.yaml - secret.yaml +- configmap.yaml + images: - name: vllm/vllm-openai diff --git a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml index 3461a596..8c07c693 100644 --- a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml @@ -3,7 +3,7 @@ 
kind: GatewayParameters metadata: name: custom-gw-params spec: - kube: + kube: envoyContainer: securityContext: allowPrivilegeEscalation: false @@ -11,12 +11,12 @@ spec: runAsNonRoot: true runAsUser: "${PROXY_UID}" service: - type: NodePort + type: LoadBalancer extraLabels: gateway: custom podTemplate: extraLabels: gateway: custom - securityContext: + securityContext: seccompProfile: type: RuntimeDefault diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml index a81c387e..48c90408 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml @@ -6,6 +6,6 @@ resources: images: - name: quay.io/vllm-d/vllm-d-dev:0.0.2 - newName: ${VLLM_P2P_IMAGE} - newTag: ${VLLM_P2P_TAG} + newName: ${VLLM_IMAGE} + newTag: ${VLLM_TAG} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml index 921b9ef5..a45ae271 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/kustomization.yaml @@ -6,7 +6,6 @@ resources: images: - name: quay.io/vllm-d/vllm-sim - newName: ${VLLM_SIM_IMAGE} newTag: ${VLLM_SIM_TAG} patches: diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml index d86d712c..dbb99b17 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-sim/patch-deployments.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: ${VLLM_DEPLOYMENT_NAME} + name: vllm-sim spec: template: spec: diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 98c87492..dfa73f35 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -24,14 +24,21 @@ if [[ -z "${VLLM_MODE:-}" ]]; then exit 1 fi -# vLLM Specific Configuration node +# GIE Configuration node +export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" +export REDIS_SVC_NAME="${REDIS_SVC_NAME:-lookup-server-service}" +export REDIS_HOST="${REDIS_HOST:-${REDIS_SVC_NAME}.${NAMESPACE}.svc.cluster.local}" #TODO- remove Redis to kustomize +export REDIS_PORT="${REDIS_PORT:-8100}" +export HF_TOKEN="${HF_TOKEN:-}" +# vLLM Specific Configuration node case "${VLLM_MODE}" in vllm-sim) export VLLM_SIM_IMAGE="${VLLM_SIM_IMAGE:-quay.io/vllm-d/vllm-sim}" export VLLM_SIM_TAG="${VLLM_SIM_TAG:-0.0.2}" export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" export EPP_TAG="${EPP_TAG:-main}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-sim}" ;; vllm | vllm-p2p) # Shared across both full model modes - // TODO - make more env variables similar @@ -39,13 +46,15 @@ case "${VLLM_MODE}" in export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" export HF_TOKEN=$(echo -n "${HF_TOKEN:-}" | base64 | tr -d '\n') export VOLUME_MOUNT_PATH="${VOLUME_MOUNT_PATH:-/data}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-3}" + if [[ "$VLLM_MODE" == "vllm" ]]; then export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/vllm-d/vllm-d-dev}" export VLLM_TAG="${VLLM_TAG:-0.0.2}" export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-llama3-8b-instruct}" - export 
EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" - export EPP_TAG="${EPP_TAG:-main}" + export EPP_IMAGE="${EPP_IMAGE:-quay.io/vllm-d/gateway-api-inference-extension-dev}" + export EPP_TAG="${EPP_TAG:-0.0.4}" export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" @@ -55,27 +64,25 @@ case "${VLLM_MODE}" in export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then - export VLLM_IMAGE="${VLLM_IMAGE:-vllm/vllm-openai}" - export VLLM_TAG="${VLLM_TAG:-latest}" + export VLLM_IMAGE="${VLLM_IMAGE:-lmcache/vllm-openai}" + export VLLM_TAG="${VLLM_TAG:-2025-03-10}" + export EPP_IMAGE="${EPP_IMAGE:- quay.io/vmaroon/gateway-api-inference-extension/epp}" + export EPP_TAG="${EPP_TAG:-kv-aware}" export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" - export HF_SECRET_KEY="${HF_SECRET_KEY:-${MODEL_LABEL}}" + export HF_SECRET_KEY="${HF_SECRET_KEY:-${HF_SECRET_NAME}_${MODEL_LABEL}}" export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" - export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-4}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}" export PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}" export PVC_SIZE="${PVC_SIZE:-10Gi}" export PVC_STORAGE_CLASS="${PVC_STORAGE_CLASS:-standard}" - export REDIS_NAME="${REDIS_NAME:-vllm-redis}" export REDIS_IMAGE="${REDIS_IMAGE:-redis}" export REDIS_TAG="${REDIS_TAG:-7.2.3}" export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" - export REDIS_PORT="${REDIS_PORT:-6379}" + export POD_IP="POD_IP" export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" export REDIS_SERVICE_TYPE="${REDIS_SERVICE_TYPE:-ClusterIP}" - export POOL_LABEL="${POOL_LABEL:-vllm-llama3-8b-instruct}" - export REDIS_ENABLED="${REDIS_ENABLED:-true}" fi ;; *) @@ -100,9 +107,13 @@ if [[ "$CLEAN" == "true" ]]; then kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f - kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f - else - echo "INFO: Deploying Development Environment in namespace ${NAMESPACE}" - kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" apply -f - + echo "INFO: Deploying vLLM Environment in namespace ${NAMESPACE}" + oc adm policy add-scc-to-user anyuid -z default -n ${NAMESPACE} # TODO - Change to security context kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" apply -f - + + echo "INFO: Deploying Gateway Environment in namespace ${NAMESPACE}" + kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" apply -f - + echo "INFO: Waiting for resources in namespace ${NAMESPACE} to become ready" kubectl -n "${NAMESPACE}" wait deployment/endpoint-picker --for=condition=Available --timeout=60s kubectl -n "${NAMESPACE}" wait gateway/inference-gateway --for=condition=Programmed --timeout=60s @@ -117,7 +128,7 @@ else ;; vllm-p2p) kubectl -n "${NAMESPACE}" wait deployment/vllm-mistral7b --for=condition=Available --timeout=180s - kubectl -n "${NAMESPACE}" wait deployment/${REDIS_NAME} --for=condition=Available --timeout=60s + kubectl -n "${NAMESPACE}" wait 
deployment/${REDIS_SVC_NAME} --for=condition=Available --timeout=60s ;; esac fi From a11e98487e1b7858a1c91b0ad2efbaa3c1677330 Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Sun, 27 Apr 2025 02:34:21 +0300 Subject: [PATCH 3/5] [fix]: Small fixes for deployment and fix comments Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 91 ++++++++++++------- .../inference-gateway/deployments.yaml | 4 +- .../inference-gateway/httproutes.yaml | 2 +- .../inference-gateway/inference-models.yaml | 22 +---- .../inference-gateway/kustomization.yaml | 2 + .../secret.yaml | 1 - deploy/components/vllm-p2p/kustomization.yaml | 26 ++++-- .../{deployments => }/redis-deployment.yaml | 7 +- .../vllm-p2p/{service => }/redis-service.yaml | 0 deploy/components/vllm-p2p/secret.yaml | 10 ++ .../{deployments => }/vllm-deployment.yaml | 7 +- deploy/components/vllm-sim/deployments.yaml | 6 +- deploy/components/vllm/configmap.yaml | 6 +- deploy/components/vllm/deployments.yaml | 18 +--- deploy/components/vllm/kustomization.yaml | 4 + deploy/components/vllm/secret.yaml | 1 - .../dev/kind-istio/patch-deployments.yaml | 2 +- .../dev/kind-kgateway/patch-deployments.yaml | 2 +- .../kubernetes-istio/patch-deployments.yaml | 2 +- .../patch-deployments.yaml | 2 +- .../vllm-p2p/kustomization.yaml | 2 + .../vllm-p2p/patch-deployments.yaml | 9 ++ .../kubernetes-vllm/vllm/kustomization.yaml | 6 ++ .../vllm/patch-deployments.yaml | 2 +- scripts/kubernetes-dev-env.sh | 40 ++++---- 25 files changed, 154 insertions(+), 120 deletions(-) rename deploy/components/{vllm-p2p/deployments => inference-gateway}/secret.yaml (88%) rename deploy/components/vllm-p2p/{deployments => }/redis-deployment.yaml (88%) rename deploy/components/vllm-p2p/{service => }/redis-service.yaml (100%) create mode 100644 deploy/components/vllm-p2p/secret.yaml rename deploy/components/vllm-p2p/{deployments => }/vllm-deployment.yaml (95%) create mode 100644 deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 9d555a5c..801e67ed 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -152,6 +152,13 @@ Create the namespace: ```console kubectl create namespace ${NAMESPACE} ``` +Set the default namespace for kubectl commands + +```console +kubectl config set-context --current --namespace="${NAMESPACE}" +``` + +> NOTE: If you are using OpenShift (oc CLI), use the following instead: `oc project "${NAMESPACE}"` You'll need to provide a `Secret` with the login credentials for your private repository (e.g. quay.io). It should look something like this: @@ -178,13 +185,6 @@ Export the name of the `Secret` to the environment: export REGISTRY_SECRET=anna-pull-secret ``` -You can optionally set a custom EPP image (otherwise, the default will be used): - -```console -export EPP_IMAGE="/" -export EPP_TAG="" -``` - Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: - `vllm-sim`: Lightweight simulator for simple environments @@ -194,24 +194,10 @@ Set the `VLLM_MODE` environment variable based on which version of vLLM you want ```console export VLLM_MODE=vllm-sim # or vllm / vllm-p2p ``` -Each mode has default image values, but you can override them: -For vllm-sim: - -```console -export VLLM_SIM_IMAGE="/" -export VLLM_SIM_TAG="" -``` - -For vllm and vllm-p2p: -- set Vllm image: -```console -export VLLM_IMAGE="/" -export VLLM_TAG="" -``` - Set hugging face token variable: export HF_TOKEN="" -**Warning**: For vllm mode, the default image uses llama3-8b and vllm-mistral. 
Make sure you have permission to access this model on Hugging Face.
 
 Once all this is set up, you can deploy the environment:
 
@@ -222,30 +208,73 @@ make environment.dev.kubernetes
 This will deploy the entire stack to whatever namespace you chose. You can test by exposing the inference `Gateway` via port-forward:
 
-```console
+```bash
 kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80
 ```
 
 And making requests with `curl`:
 
 - vllm-sim
 
   ```bash
   curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
     -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq
   ```
 
-- vllm
+- vllm or vllm-p2p
 
   ```bash
   curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
     -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq
   ```
+
+#### Environment Configuration
+
+##### **1. Setting the EPP image and tag:**
+
+You can optionally set a custom EPP image (otherwise, the default will be used):
+
+```bash
+export EPP_IMAGE="<your-registry>/<your-image>"
+export EPP_TAG="<your-tag>"
+```
+
+##### **2. Setting the vLLM image and tag:**
+
+Each vLLM mode has default image values, but you can override them:
+
+For `vllm-sim` mode:
+
+```bash
+export VLLM_SIM_IMAGE="<your-registry>/<your-image>"
+export VLLM_SIM_TAG="<your-tag>"
+```
+
+For `vllm` and `vllm-p2p` modes:
+
+```bash
+export VLLM_IMAGE="<your-registry>/<your-image>"
+export VLLM_TAG="<your-tag>"
+```
+
+##### **3. Setting the model name and label:**
+
+You can override the model that will be deployed and served:
+
+```bash
+export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}"
+export MODEL_LABEL="${MODEL_LABEL:-mistral7b}"
+```
+
+It is also recommended to update the pool name accordingly:
+
+```bash
+export POOL_NAME="${POOL_NAME:-vllm-Mistral-7B-Instruct}"
+```
+
+##### **4. Additional environment settings:**
+
+More environment variable settings can be found in `scripts/kubernetes-dev-env.sh`.
+
-- vllm-p2p
-
-  ```console
-  curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
-    -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","prompt":"hi","max_tokens":10,"temperature":0}' | jq
-  ```
 
 #### Development Cycle
 
 > **WARNING**: This is a very manual process at the moment.
We expect to make diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml index f523f812..b603beb5 100644 --- a/deploy/components/inference-gateway/deployments.yaml +++ b/deploy/components/inference-gateway/deployments.yaml @@ -22,7 +22,7 @@ spec: imagePullPolicy: IfNotPresent args: - -poolName - - "vllm-llama3-8b-instruct" + - "${POOL_NAME}" - -v - "4" - --zap-encoder @@ -55,4 +55,4 @@ spec: valueFrom: secretKeyRef: name: ${HF_SECRET_NAME} - key: ${HF_SECRET_KEY} \ No newline at end of file + key: ${HF_SECRET_KEY} diff --git a/deploy/components/inference-gateway/httproutes.yaml b/deploy/components/inference-gateway/httproutes.yaml index 1115d13d..97eb2cf3 100644 --- a/deploy/components/inference-gateway/httproutes.yaml +++ b/deploy/components/inference-gateway/httproutes.yaml @@ -13,7 +13,7 @@ spec: backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool - name: vllm-llama3-8b-instruct + name: ${POOL_NAME} port: 8000 timeouts: request: 30s diff --git a/deploy/components/inference-gateway/inference-models.yaml b/deploy/components/inference-gateway/inference-models.yaml index 330f19a9..869be700 100644 --- a/deploy/components/inference-gateway/inference-models.yaml +++ b/deploy/components/inference-gateway/inference-models.yaml @@ -16,27 +16,7 @@ kind: InferenceModel metadata: name: base-model spec: - modelName: meta-llama/Llama-3.1-8B-Instruct + modelName: ${MODEL_NAME} criticality: Critical poolRef: name: ${POOL_NAME} ---- -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - name: base-model-cpu -spec: - modelName: Qwen/Qwen2.5-1.5B-Instruct - criticality: Critical - poolRef: - name: ${POOL_NAME} ---- -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - name: mistarli -spec: - modelName: mistralai/Mistral-7B-Instruct-v0.2 - criticality: Critical - poolRef: - name: ${POOL_NAME} \ No newline at end of file diff --git a/deploy/components/inference-gateway/kustomization.yaml b/deploy/components/inference-gateway/kustomization.yaml index 49607a37..78dfabcd 100644 --- a/deploy/components/inference-gateway/kustomization.yaml +++ b/deploy/components/inference-gateway/kustomization.yaml @@ -26,6 +26,8 @@ resources: - deployments.yaml - gateways.yaml - httproutes.yaml +- secret.yaml + images: - name: quay.io/vllm-d/gateway-api-inference-extension/epp diff --git a/deploy/components/vllm-p2p/deployments/secret.yaml b/deploy/components/inference-gateway/secret.yaml similarity index 88% rename from deploy/components/vllm-p2p/deployments/secret.yaml rename to deploy/components/inference-gateway/secret.yaml index 1f5a2bcc..23fe9473 100644 --- a/deploy/components/vllm-p2p/deployments/secret.yaml +++ b/deploy/components/inference-gateway/secret.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: Secret metadata: name: ${HF_SECRET_NAME} - namespace: ${NAMESPACE} labels: app.kubernetes.io/name: vllm app.kubernetes.io/component: secret diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml index c1e3172a..64cedf04 100644 --- a/deploy/components/vllm-p2p/kustomization.yaml +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -1,13 +1,27 @@ +# ------------------------------------------------------------------------------ +# vLLM P2P Deployment +# +# This deploys the full vLLM model server, capable of serving real models such +# as Llama 3.1-8B-Instruct via the OpenAI-compatible API. 
It is intended for
+# environments with GPU resources and where full inference capabilities are
+# required. In addition, it adds LMCache, an LLM serving engine extension
+# that uses Redis as its lookup server, to the vLLM image.
+#
+# The deployment can be customized using environment variables to set:
+# - The container image and tag (VLLM_IMAGE, VLLM_TAG)
+# - The model to load (MODEL_NAME)
+#
+# This setup is suitable for testing and production with Kubernetes (including
+# GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`).
+# -----------------------------------------------------------------------------
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 resources:
-  - deployments/vllm-deployment.yaml
-  - deployments/redis-deployment.yaml
-  - service/redis-service.yaml
-  - deployments/secret.yaml
+  - vllm-deployment.yaml
+  - redis-deployment.yaml
+  - redis-service.yaml
+  - secret.yaml
 
 images:
   - name: vllm/vllm-openai
diff --git a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml b/deploy/components/vllm-p2p/redis-deployment.yaml
similarity index 88%
rename from deploy/components/vllm-p2p/deployments/redis-deployment.yaml
rename to deploy/components/vllm-p2p/redis-deployment.yaml
index f4b5938e..df8f3b0a 100644
--- a/deploy/components/vllm-p2p/deployments/redis-deployment.yaml
+++ b/deploy/components/vllm-p2p/redis-deployment.yaml
@@ -1,7 +1,7 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: ${REDIS_SVC_NAME}
+  name: ${REDIS_DEPLOYMENT_NAME}
   labels:
     app.kubernetes.io/name: redis
     app.kubernetes.io/component: redis-lookup-server
@@ -48,8 +48,3 @@ spec:
       maxSurge: 25%
   revisionHistoryLimit: 10
   progressDeadlineSeconds: 600
-  # securityContext:
-  #   allowPrivilegeEscalation: false
-  #   capabilities:
-  #     drop:
-  #       - ALL
\ No newline at end of file
diff --git a/deploy/components/vllm-p2p/service/redis-service.yaml b/deploy/components/vllm-p2p/redis-service.yaml
similarity index 100%
rename from deploy/components/vllm-p2p/service/redis-service.yaml
rename to deploy/components/vllm-p2p/redis-service.yaml
diff --git a/deploy/components/vllm-p2p/secret.yaml b/deploy/components/vllm-p2p/secret.yaml
new file mode 100644
index 00000000..23fe9473
--- /dev/null
+++ b/deploy/components/vllm-p2p/secret.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ${HF_SECRET_NAME}
+  labels:
+    app.kubernetes.io/name: vllm
+    app.kubernetes.io/component: secret
+type: Opaque
+data:
+  ${HF_SECRET_KEY}: ${HF_TOKEN}
diff --git a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml b/deploy/components/vllm-p2p/vllm-deployment.yaml
similarity index 95%
rename from deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
rename to deploy/components/vllm-p2p/vllm-deployment.yaml
index 1c296eff..19fd59c2 100644
--- a/deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
+++ b/deploy/components/vllm-p2p/vllm-deployment.yaml
@@ -22,11 +22,6 @@ spec:
         app.kubernetes.io/model: ${MODEL_LABEL}
         app: ${POOL_NAME}
     spec:
-      # securityContext:
-      #   runAsUser: ${PROXY_UID}
-      #   runAsNonRoot: true
-      #   seccompProfile:
-      #     type: RuntimeDefault
       containers:
         - name: vllm
           image: ${VLLM_IMAGE}:${VLLM_TAG}
@@ -101,7 +96,7 @@ spec:
             limits:
               nvidia.com/gpu: "1"
             requests:
-              cpu: "10"
+              cpu: "${VLLM_CPU_RESOURCES}"
               memory: 40Gi
               nvidia.com/gpu: "1"
           terminationMessagePath: /dev/termination-log
diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml
index 4673a99c..34b742c2 100644
--- a/deploy/components/vllm-sim/deployments.yaml
+++
b/deploy/components/vllm-sim/deployments.yaml @@ -3,16 +3,16 @@ kind: Deployment metadata: name: vllm-sim labels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} spec: replicas: 1 selector: matchLabels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} template: metadata: labels: - app: vllm-llama3-8b-instruct + app: ${POOL_NAME} ai-aware-router-pod: "true" spec: containers: diff --git a/deploy/components/vllm/configmap.yaml b/deploy/components/vllm/configmap.yaml index 1a4f8903..03019ce1 100644 --- a/deploy/components/vllm/configmap.yaml +++ b/deploy/components/vllm/configmap.yaml @@ -1,13 +1,13 @@ apiVersion: v1 kind: ConfigMap metadata: - name: vllm-llama3-8b-instruct-adapters + name: lora-adapters data: configmap.yaml: | vLLMLoRAConfig: - name: vllm-llama3-8b-instruct-adapters + name: lora-adapters port: 8000 - defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + defaultBaseModel: ${MODEL_NAME} ensureExist: models: - id: food-review-1 diff --git a/deploy/components/vllm/deployments.yaml b/deploy/components/vllm/deployments.yaml index 2d7a63d5..71eaa72c 100644 --- a/deploy/components/vllm/deployments.yaml +++ b/deploy/components/vllm/deployments.yaml @@ -24,7 +24,7 @@ spec: command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] args: - "--model" - - "meta-llama/Llama-3.1-8B-Instruct" + - "${MODEL_NAME}" - "--tensor-parallel-size" - "1" - "--port" @@ -48,8 +48,8 @@ spec: - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: - name: hf-token - key: token + name: ${HF_SECRET_NAME} + key: ${HF_SECRET_KEY} - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING value: "true" - name: XDG_CACHE_HOME @@ -104,11 +104,6 @@ spec: name: shm - mountPath: /adapters name: adapters - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL initContainers: - name: lora-adapter-syncer tty: true @@ -122,11 +117,6 @@ spec: volumeMounts: - name: config-volume mountPath: /config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL restartPolicy: Always enableServiceLinks: false terminationGracePeriodSeconds: 130 @@ -140,4 +130,4 @@ spec: emptyDir: {} - name: config-volume configMap: - name: vllm-llama3-8b-instruct-adapters + name: lora-adapters diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index 93813639..e5f63b73 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -26,6 +26,10 @@ images: newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} +- name: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer + newName: ${LORA_ADAPTER_SYNCER_IMAGE} + newTag: ${LORA_ADAPTER_SYNCER_TAG} + configMapGenerator: - name: vllm-model-config literals: diff --git a/deploy/components/vllm/secret.yaml b/deploy/components/vllm/secret.yaml index 1f5a2bcc..23fe9473 100644 --- a/deploy/components/vllm/secret.yaml +++ b/deploy/components/vllm/secret.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: Secret metadata: name: ${HF_SECRET_NAME} - namespace: ${NAMESPACE} labels: app.kubernetes.io/name: vllm app.kubernetes.io/component: secret diff --git a/deploy/environments/dev/kind-istio/patch-deployments.yaml b/deploy/environments/dev/kind-istio/patch-deployments.yaml index 874b287c..7ab6e3ad 100644 --- a/deploy/environments/dev/kind-istio/patch-deployments.yaml +++ b/deploy/environments/dev/kind-istio/patch-deployments.yaml @@ -9,7 +9,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - "default" - -v 
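All of the `${POOL_NAME}`-style values introduced in these manifests are plain `envsubst` placeholders: nothing in Kubernetes resolves them, so the kustomize output is only valid once it has been piped through `envsubst` with the relevant variables exported, which is what the dev-env scripts do. A minimal sketch of that render-and-apply pipeline (the namespace and pool name here are illustrative values, not requirements):

```bash
# Export the variables the manifests reference, render the overlay, then apply.
# Caveat: any ${VAR} that is not exported is substituted with an empty string.
export NAMESPACE="annas-dev-environment"
export POOL_NAME="vllm-llama3-8b-instruct"

kustomize build deploy/environments/dev/kubernetes-kgateway \
  | envsubst \
  | kubectl -n "${NAMESPACE}" apply -f -
```
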
diff --git a/deploy/environments/dev/kind-kgateway/patch-deployments.yaml b/deploy/environments/dev/kind-kgateway/patch-deployments.yaml index 874b287c..7ab6e3ad 100644 --- a/deploy/environments/dev/kind-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kind-kgateway/patch-deployments.yaml @@ -9,7 +9,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - "default" - -v diff --git a/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml b/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml index 20a17d53..a5a721b8 100644 --- a/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-istio/patch-deployments.yaml @@ -11,7 +11,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - ${NAMESPACE} - -v diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index 0e4ad46e..a3b93d36 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -11,7 +11,7 @@ spec: - name: epp args: - -poolName - - "vllm-llama3-8b-instruct" + - ${POOL_NAME} - -poolNamespace - ${NAMESPACE} - -v diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml index 48c90408..2d378312 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/kustomization.yaml @@ -9,3 +9,5 @@ images: newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} +patches: + - path: patch-deployments.yaml diff --git a/deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml new file mode 100644 index 00000000..b1afb13e --- /dev/null +++ b/deploy/environments/dev/kubernetes-vllm/vllm-p2p/patch-deployments.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ${VLLM_DEPLOYMENT_NAME} +spec: + template: + spec: + imagePullSecrets: + - name: ${REGISTRY_SECRET} diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml index 81fa76ba..af346345 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml @@ -9,3 +9,9 @@ images: newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} +- name: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer + newName: ${LORA_ADAPTER_SYNCER_IMAGE} + newTag: ${LORA_ADAPTER_SYNCER_TAG} + +patches: + - path: patch-deployments.yaml diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml index efaa2211..b1afb13e 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm/patch-deployments.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: vllm + name: ${VLLM_DEPLOYMENT_NAME} spec: template: spec: diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index dfa73f35..b04e2cba 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -24,13 +24,18 @@ if [[ -z "${VLLM_MODE:-}" 
]]; then exit 1 fi -# GIE Configuration node +# GIE Configuration export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" -export REDIS_SVC_NAME="${REDIS_SVC_NAME:-lookup-server-service}" -export REDIS_HOST="${REDIS_HOST:-${REDIS_SVC_NAME}.${NAMESPACE}.svc.cluster.local}" #TODO- remove Redis to kustomize -export REDIS_PORT="${REDIS_PORT:-8100}" -export HF_TOKEN="${HF_TOKEN:-}" +export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" +## EPP ENV VARs — currently added to all EPPs, regardless of the VLLM mode or whether they are actually needed +export REDIS_DEPLOYMENT_NAME="${REDIS_DEPLOYMENT_NAME:-lookup-server-service}" +export REDIS_SVC_NAME="${REDIS_SVC_NAME:-${REDIS_DEPLOYMENT_NAME}}" +export REDIS_HOST="${REDIS_HOST:-${REDIS_SVC_NAME}.${NAMESPACE}.svc.cluster.local}" +export REDIS_PORT="${REDIS_PORT:-8100}" +export HF_TOKEN=$(echo -n "${HF_TOKEN}" | base64 | tr -d '\n') +export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" +export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" # vLLM Specific Configuration node case "${VLLM_MODE}" in vllm-sim) @@ -38,40 +43,34 @@ case "${VLLM_MODE}" in export VLLM_SIM_TAG="${VLLM_SIM_TAG:-0.0.2}" export EPP_IMAGE="${EPP_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp}" export EPP_TAG="${EPP_TAG:-main}" - export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-sim}" + export HF_TOKEN=$(echo -n "dummy-token" | base64 | tr -d '\n') ;; vllm | vllm-p2p) # Shared across both full model modes - // TODO - make more env variables similar # TODO: Consider unifying more environment variables for consistency and reuse - export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" - export HF_TOKEN=$(echo -n "${HF_TOKEN:-}" | base64 | tr -d '\n') + export VOLUME_MOUNT_PATH="${VOLUME_MOUNT_PATH:-/data}" export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-3}" - + export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" + export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" if [[ "$VLLM_MODE" == "vllm" ]]; then export VLLM_IMAGE="${VLLM_IMAGE:-quay.io/vllm-d/vllm-d-dev}" export VLLM_TAG="${VLLM_TAG:-0.0.2}" - export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-llama3-8b-instruct}" export EPP_IMAGE="${EPP_IMAGE:-quay.io/vllm-d/gateway-api-inference-extension-dev}" export EPP_TAG="${EPP_TAG:-0.0.4}" - export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" - export MODEL_LABEL="${MODEL_LABEL:-llama3-8b}" - export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" - export HF_TOKEN="${HF_TOKEN:-}" + export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-2}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" + export LORA_ADAPTER_SYNCER_IMAGE="${LORA_ADAPTER_SYNCER_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer}" + export LORA_ADAPTER_SYNCER_TAG="${LORA_ADAPTER_SYNCER_TAG:-v20250425-ddc3d69}" elif [[ "$VLLM_MODE" == "vllm-p2p" ]]; then export VLLM_IMAGE="${VLLM_IMAGE:-lmcache/vllm-openai}" export VLLM_TAG="${VLLM_TAG:-2025-03-10}" export EPP_IMAGE="${EPP_IMAGE:- quay.io/vmaroon/gateway-api-inference-extension/epp}" export EPP_TAG="${EPP_TAG:-kv-aware}" - export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" - export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" - export HF_SECRET_KEY="${HF_SECRET_KEY:-${HF_SECRET_NAME}_${MODEL_LABEL}}" - export VLLM_DEPLOYMENT_NAME="${VLLM_DEPLOYMENT_NAME:-vllm-${MODEL_LABEL}}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" export PVC_NAME="${PVC_NAME:-vllm-p2p-storage-claim}" export 
PVC_ACCESS_MODE="${PVC_ACCESS_MODE:-ReadWriteOnce}" @@ -79,6 +78,7 @@ case "${VLLM_MODE}" in export PVC_STORAGE_CLASS="${PVC_STORAGE_CLASS:-standard}" export REDIS_IMAGE="${REDIS_IMAGE:-redis}" export REDIS_TAG="${REDIS_TAG:-7.2.3}" + export VLLM_CPU_RESOURCES="${VLLM_CPU_RESOURCES:-10}" export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" export POD_IP="POD_IP" export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" @@ -124,10 +124,10 @@ else kubectl -n "${NAMESPACE}" wait deployment/vllm-sim --for=condition=Available --timeout=60s ;; vllm) - kubectl -n "${NAMESPACE}" wait deployment/vllm-llama3-8b-instruct --for=condition=Available --timeout=180s + kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=300s ;; vllm-p2p) - kubectl -n "${NAMESPACE}" wait deployment/vllm-mistral7b --for=condition=Available --timeout=180s + kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=180s kubectl -n "${NAMESPACE}" wait deployment/${REDIS_SVC_NAME} --for=condition=Available --timeout=60s ;; esac From 937bb50172338080ec4092602d5843031df879a1 Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Mon, 28 Apr 2025 14:54:18 +0300 Subject: [PATCH 4/5] [fix]: fix typos and edit the Readme and env vars Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 67 +++++++++++-------- Makefile | 2 +- deploy/components/vllm-p2p/kustomization.yaml | 2 +- .../components/vllm-p2p/redis-deployment.yaml | 2 +- deploy/components/vllm/kustomization.yaml | 2 +- scripts/kubernetes-dev-env.sh | 10 +-- 6 files changed, 47 insertions(+), 38 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 801e67ed..24f28e19 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -37,7 +37,7 @@ serving resources. Run the following: -```console +```bash make environment.dev.kind ``` @@ -48,6 +48,7 @@ namespace. There are several ways to access the gateway: **Port forward**: + ```sh $ kubectl --context kind-gie-dev port-forward service/inference-gateway 8080:80 ``` @@ -55,6 +56,7 @@ $ kubectl --context kind-gie-dev port-forward service/inference-gateway 8080:80 **NodePort `inference-gateway-istio`** > **Warning**: This method doesn't work on `podman` correctly, as `podman` support > with `kind` is not fully implemented yet. + ```sh # Determine the k8s node address $ kubectl --context kind-gie-dev get node -o yaml | grep address @@ -80,9 +82,10 @@ By default the created inference gateway, can be accessed on port 30080. This ca be overriden to any free port in the range of 30000 to 32767, by running the above command as follows: -```console +```bash GATEWAY_HOST_PORT=<selected-port> make environment.dev.kind ``` + **Where:** <selected-port> is the port on your local machine you want to use to access the inference gatyeway. @@ -96,7 +99,7 @@ access the inference gatyeway. To test your changes to the GIE in this environment, make your changes locally and then run the following: -```console +```bash make environment.dev.kind.update ``` @@ -122,7 +125,7 @@ the `default` namespace if the cluster is private/personal). The following will deploy all the infrastructure-level requirements (e.g. CRDs, Operators, etc) to support the namespace-level development environments: -```console +```bash make environment.dev.kubernetes.infrastructure ``` @@ -140,7 +143,7 @@ To deploy a development environment to the cluster you'll need to explicitly provide a namespace. This can be `default` if this is your personal cluster, but on a shared cluster you should pick something unique.
For example: -```console +```bash export NAMESPACE=annas-dev-environment ``` @@ -149,12 +152,13 @@ export NAMESPACE=annas-dev-environment Create the namespace: -```console +```bash kubectl create namespace ${NAMESPACE} ``` + Set the default namespace for kubectl commands -```console +```bash kubectl config set-context --current --namespace="${NAMESPACE}" ``` @@ -175,33 +179,39 @@ type: kubernetes.io/dockerconfigjson Apply that to your namespace: -```console -kubectl -n ${NAMESPACE} apply -f secret.yaml +```bash +kubectl apply -f secret.yaml ``` Export the name of the `Secret` to the environment: -```console +```bash export REGISTRY_SECRET=anna-pull-secret ``` Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: -- `vllm-sim`: Lightweight simulator for simple environments -- `vllm`: Full vLLM model server for real inference -- `vllm-p2p`: Full vLLM with LMCache P2P support for enable KV-Cache aware routing +* `vllm-sim`: Lightweight simulator for simple environments (defult). +* `vllm`: Full vLLM model server, using GPU/CPU for inferencing +* `vllm-p2p`: Full vLLM with LMCache P2P support to enable KV-Cache aware routing -```console +```bash export VLLM_MODE=vllm-sim # or vllm / vllm-p2p ``` -- Set hugging face token variable: - export HF_TOKEN="" +- Set Hugging Face token variable: + +```bash +export HF_TOKEN="" +``` + **Warning**: For vllm mode, the default image uses llama3-8b. Make sure you have permission to access these files in their respective repositories. +**Note:** The model can be replaced. See [Environment Configuration](#environment-configuration) for model settings. + Once all this is set up, you can deploy the environment: -```console +```bash make environment.dev.kubernetes ``` @@ -209,10 +219,11 @@ This will deploy the entire stack to whatever namespace you chose. You can test by exposing the inference `Gateway` via port-forward: ```bash -kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80 +kubectl port-forward service/inference-gateway 8080:80 ``` And making requests with `curl`: + - vllm-sim ```bash curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq ``` + - vllm or vllm-p2p ```bash curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq ``` + #### Environment Configuration -##### **1. Setting the EPP image and tag:** +**1. Setting the EPP image and tag:** You can optionally set a custom EPP image (otherwise, the default will be used): @@ -236,7 +248,8 @@ You can optionally set a custom EPP image (otherwise, the default will be used): export EPP_IMAGE="/" export EPP_TAG="" ``` + -##### **2. Setting the vLLM image and tag:** +**2. Setting the vLLM image and tag:** Each vLLM mode has default image values, but you can override them: @@ -254,7 +267,7 @@ export VLLM_IMAGE="/" export VLLM_TAG="" ``` -##### **3. Setting the model name and label:** +**3. Setting the model name and label:** You can replace the model name that will be used in the system. @@ -263,13 +276,13 @@ export MODEL_NAME="${MODEL_NAME:-mistralai/Mistral-7B-Instruct-v0.2}" export MODEL_LABEL="${MODEL_LABEL:-mistral7b}" ``` -It is also recommended to update the pool name accordingly: +It is also recommended to update the inference pool name accordingly so that it aligns with the model: ```bash export POOL_NAME="${POOL_NAME:-vllm-Mistral-7B-Instruct}" ``` -##### **4.
Additional environment settings:** More Setting of environment variables can be found in the `scripts/kubernetes-dev-env.sh`. @@ -283,19 +296,19 @@ More Setting of environment variables can be found in the `scripts/kubernetes-de Make your changes locally and commit them. Then select an image tag based on the `git` SHA: -```console +```bash export EPP_TAG=$(git rev-parse HEAD) ``` Build the image: -```console +```bash DEV_VERSION=$EPP_TAG make image-build ``` Tag the image for your private registry and push it: -```console +```bash $CONTAINER_RUNTIME tag quay.io/vllm-d/gateway-api-inference-extension/epp:$TAG \ /:$EPP_TAG $CONTAINER_RUNTIME push /:$EPP_TAG @@ -307,7 +320,7 @@ $CONTAINER_RUNTIME push /:$EPP_TAG Then you can re-deploy the environment with the new changes (don't forget all the required env vars): -```console +```bash make environment.dev.kubernetes ``` diff --git a/Makefile b/Makefile index cfaa72cb..471e95a9 100644 --- a/Makefile +++ b/Makefile @@ -785,7 +785,7 @@ environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst .PHONY: clean.environment.dev.kubernetes clean.environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst @CLEAN=true ./scripts/kubernetes-dev-env.sh 2>&1 - @echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)" + @echo "INFO: Finished cleanup of development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)" # ----------------------------------------------------------------------------- # TODO: these are old aliases that we still need for the moment, but will be diff --git a/deploy/components/vllm-p2p/kustomization.yaml b/deploy/components/vllm-p2p/kustomization.yaml index 64cedf04..1b4c0b28 100644 --- a/deploy/components/vllm-p2p/kustomization.yaml +++ b/deploy/components/vllm-p2p/kustomization.yaml @@ -11,7 +11,7 @@ # - The container image and tag (VLLM_IMAGE, VLLM_TAG) # - The model to load (MODEL_NAME) # -# This setup is suitable for testing and production with Kubernetes (including +# This setup is suitable for testing on Kubernetes (including # GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`). # ----------------------------------------------------------------------------- apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/deploy/components/vllm-p2p/redis-deployment.yaml b/deploy/components/vllm-p2p/redis-deployment.yaml index df8f3b0a..31b329e4 100644 --- a/deploy/components/vllm-p2p/redis-deployment.yaml +++ b/deploy/components/vllm-p2p/redis-deployment.yaml @@ -6,7 +6,7 @@ metadata: app.kubernetes.io/name: redis app.kubernetes.io/component: redis-lookup-server spec: - replicas: ${REDIS_REPLICA_COUNT} + replicas: 1 selector: matchLabels: app.kubernetes.io/name: redis diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index e5f63b73..f04fdf9a 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -10,7 +10,7 @@ # - The container image and tag (VLLM_IMAGE, VLLM_TAG) # - The model to load (MODEL_NAME) # -# This setup is suitable for testing and production with Kubernetes (including +# This setup is suitable for testing on Kubernetes (including # GPU-enabled nodes or clusters with scheduling for `nvidia.com/gpu`). 
# ----------------------------------------------------------------------------- kind: Kustomization diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index b04e2cba..94ca77fc 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -19,10 +19,7 @@ if [[ -z "${NAMESPACE:-}" ]]; then echo "ERROR: NAMESPACE environment variable is not set." exit 1 fi -if [[ -z "${VLLM_MODE:-}" ]]; then - echo "ERROR: VLLM_MODE is not set. Please export one of: vllm-sim, vllm, vllm-p2p" - exit 1 -fi + # GIE Configuration export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" @@ -37,6 +34,8 @@ export HF_TOKEN=$(echo -n "${HF_TOKEN}" | base64 | tr -d '\n') export HF_SECRET_NAME="${HF_SECRET_NAME:-hf-token}" export HF_SECRET_KEY="${HF_SECRET_KEY:-token}" # vLLM Specific Configuration node +export VLLM_MODE="${VLLM_MODE:-vllm-sim}" + case "${VLLM_MODE}" in vllm-sim) export VLLM_SIM_IMAGE="${VLLM_SIM_IMAGE:-quay.io/vllm-d/vllm-sim}" @@ -59,8 +58,6 @@ case "${VLLM_MODE}" in export VLLM_TAG="${VLLM_TAG:-0.0.2}" export EPP_IMAGE="${EPP_IMAGE:-quay.io/vllm-d/gateway-api-inference-extension-dev}" export EPP_TAG="${EPP_TAG:-0.0.4}" - - export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-2}" export MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}" export PVC_NAME="${PVC_NAME:-vllm-storage-claim}" export LORA_ADAPTER_SYNCER_IMAGE="${LORA_ADAPTER_SYNCER_IMAGE:-us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer}" @@ -79,7 +76,6 @@ case "${VLLM_MODE}" in export REDIS_IMAGE="${REDIS_IMAGE:-redis}" export REDIS_TAG="${REDIS_TAG:-7.2.3}" export VLLM_CPU_RESOURCES="${VLLM_CPU_RESOURCES:-10}" - export REDIS_REPLICA_COUNT="${REDIS_REPLICA_COUNT:-1}" export POD_IP="POD_IP" export REDIS_TARGET_PORT="${REDIS_TARGET_PORT:-6379}" export REDIS_SERVICE_TYPE="${REDIS_SERVICE_TYPE:-ClusterIP}" From 17a23e5a553c68e39cec4a567bb9d65b4dc561fd Mon Sep 17 00:00:00 2001 From: Kfir Toledo Date: Tue, 29 Apr 2025 12:20:03 +0300 Subject: [PATCH 5/5] [fix] Fix the kind environment and set gateway service to be NodePort Signed-off-by: Kfir Toledo --- DEVELOPMENT.md | 22 +++++++++---------- .../inference-gateway/deployments.yaml | 8 ------- .../inference-gateway/kustomization.yaml | 2 -- deploy/components/vllm/kustomization.yaml | 2 +- .../gateway-parameters.yaml | 2 +- .../kubernetes-kgateway/kustomization.yaml | 3 ++- .../patch-deployments.yaml | 8 +++++++ .../dev/kubernetes-kgateway}/secret.yaml | 0 .../kubernetes-vllm/vllm/kustomization.yaml | 2 +- scripts/kind-dev-env.sh | 7 +++++- scripts/kubernetes-dev-env.sh | 5 +++-- 11 files changed, 33 insertions(+), 28 deletions(-) rename deploy/{components/inference-gateway => environments/dev/kubernetes-kgateway}/secret.yaml (100%) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 24f28e19..f6a7c466 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -191,7 +191,7 @@ export REGISTRY_SECRET=anna-pull-secret Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy: -* `vllm-sim`: Lightweight simulator for simple environments (defult). +* `vllm-sim`: Lightweight simulator for simple environments (default).
* `vllm`: Full vLLM model server, using GPU/CPU for inferencing * `vllm-p2p`: Full vLLM with LMCache P2P support to enable KV-Cache aware routing @@ -224,19 +224,19 @@ kubectl port-forward service/inference-gateway 8080:80 And making requests with `curl`: -- vllm-sim +**vllm-sim:** - ```bash - curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ - -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq - ``` +```bash +curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq +``` -- vllm or vllm-p2p +**vllm or vllm-p2p:** - ```bash - curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ - -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq - ``` +```bash +curl -s -w '\n' http://localhost:8080/v1/completions -H 'Content-Type: application/json' \ + -d '{"model":"meta-llama/Llama-3.1-8B-Instruct","prompt":"hi","max_tokens":10,"temperature":0}' | jq +``` #### Environment Configuration diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml index b603beb5..afff8fd2 100644 --- a/deploy/components/inference-gateway/deployments.yaml +++ b/deploy/components/inference-gateway/deployments.yaml @@ -48,11 +48,3 @@ spec: service: inference-extension initialDelaySeconds: 5 periodSeconds: 10 - env: - - name: KVCACHE_INDEXER_REDIS_ADDR - value: ${REDIS_HOST}:${REDIS_PORT} - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: ${HF_SECRET_NAME} - key: ${HF_SECRET_KEY} diff --git a/deploy/components/inference-gateway/kustomization.yaml b/deploy/components/inference-gateway/kustomization.yaml index 78dfabcd..49607a37 100644 --- a/deploy/components/inference-gateway/kustomization.yaml +++ b/deploy/components/inference-gateway/kustomization.yaml @@ -26,8 +26,6 @@ resources: - deployments.yaml - gateways.yaml - httproutes.yaml -- secret.yaml - images: - name: quay.io/vllm-d/gateway-api-inference-extension/epp diff --git a/deploy/components/vllm/kustomization.yaml b/deploy/components/vllm/kustomization.yaml index f04fdf9a..6e0da28b 100644 --- a/deploy/components/vllm/kustomization.yaml +++ b/deploy/components/vllm/kustomization.yaml @@ -33,4 +33,4 @@ images: configMapGenerator: - name: vllm-model-config literals: - - MODEL_NAME=${MODEL_NAME} \ No newline at end of file + - MODEL_NAME=${MODEL_NAME} diff --git a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml index 8c07c693..da2d91d2 100644 --- a/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/gateway-parameters.yaml @@ -11,7 +11,7 @@ spec: runAsNonRoot: true runAsUser: "${PROXY_UID}" service: - type: LoadBalancer + type: ${GATEWAY_SERVICE_TYPE} extraLabels: gateway: custom podTemplate: diff --git a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml index 7dc295de..293119e2 100644 --- a/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/kustomization.yaml @@ -4,6 +4,7 @@ kind: Kustomization namespace: ${NAMESPACE} resources: +- secret.yaml - ../../../components/inference-gateway/ - gateway-parameters.yaml @@ -14,4 +15,4 @@ images: patches: - path:
patch-deployments.yaml -- path: patch-gateways.yaml \ No newline at end of file +- path: patch-gateways.yaml diff --git a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml index a3b93d36..00c87fbb 100644 --- a/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml +++ b/deploy/environments/dev/kubernetes-kgateway/patch-deployments.yaml @@ -22,3 +22,11 @@ spec: - "9002" - -grpcHealthPort - "9003" + env: + - name: KVCACHE_INDEXER_REDIS_ADDR + value: ${REDIS_HOST}:${REDIS_PORT} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: ${HF_SECRET_KEY} \ No newline at end of file diff --git a/deploy/components/inference-gateway/secret.yaml b/deploy/environments/dev/kubernetes-kgateway/secret.yaml similarity index 100% rename from deploy/components/inference-gateway/secret.yaml rename to deploy/environments/dev/kubernetes-kgateway/secret.yaml diff --git a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml index af346345..e512ee89 100644 --- a/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml +++ b/deploy/environments/dev/kubernetes-vllm/vllm/kustomization.yaml @@ -5,7 +5,7 @@ resources: - ../../../../components/vllm/ images: -- name: quay.io/vllm-d/vllm-d-dev:0.0.2 +- name: quay.io/vllm-d/vllm-d-dev newName: ${VLLM_IMAGE} newTag: ${VLLM_TAG} diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh index e40847e0..85cd988e 100755 --- a/scripts/kind-dev-env.sh +++ b/scripts/kind-dev-env.sh @@ -25,6 +25,11 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Set the host port to map to the Gateway's inbound port (30080) : "${GATEWAY_HOST_PORT:=30080}" +# Set the inference pool name for the deployment +export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" + +# Set the model name to deploy +export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" # ------------------------------------------------------------------------------ # Setup & Requirement Checks # ------------------------------------------------------------------------------ @@ -113,7 +118,7 @@ kustomize build --enable-helm deploy/components/crds-kgateway | # Deploy the environment to the "default" namespace kustomize build --enable-helm deploy/environments/dev/kind-kgateway \ - | sed "s/REPLACE_NAMESPACE/${PROJECT_NAMESPACE}/gI" \ + | envsubst | sed "s/REPLACE_NAMESPACE/${PROJECT_NAMESPACE}/gI" \ | kubectl --context ${KUBE_CONTEXT} apply -f - # Wait for all control-plane pods to be ready diff --git a/scripts/kubernetes-dev-env.sh b/scripts/kubernetes-dev-env.sh index 94ca77fc..62027c69 100755 --- a/scripts/kubernetes-dev-env.sh +++ b/scripts/kubernetes-dev-env.sh @@ -24,6 +24,7 @@ fi # GIE Configuration export POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" +export GATEWAY_SERVICE_TYPE="${GATEWAY_SERVICE_TYPE:-NodePort}" ## EPP ENV VARs — currently added to all EPPs, regardless of the VLLM mode or whether they are actually needed export REDIS_DEPLOYMENT_NAME="${REDIS_DEPLOYMENT_NAME:-lookup-server-service}" @@ -104,7 +105,7 @@ if [[ "$CLEAN" == "true" ]]; then kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" delete --ignore-not-found=true -f - else echo "INFO: Deploying vLLM Environment in namespace ${NAMESPACE}" - oc adm policy add-scc-to-user anyuid -z default -n ${NAMESPACE} # 
TODO - Change to security context + oc adm policy add-scc-to-user anyuid -z default -n ${NAMESPACE} kustomize build deploy/environments/dev/kubernetes-vllm/${VLLM_MODE} | envsubst | kubectl -n "${NAMESPACE}" apply -f - echo "INFO: Deploying Gateway Environment in namespace ${NAMESPACE}" @@ -120,7 +121,7 @@ else kubectl -n "${NAMESPACE}" wait deployment/vllm-sim --for=condition=Available --timeout=60s ;; vllm) - kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=300s + kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=500s ;; vllm-p2p) kubectl -n "${NAMESPACE}" wait deployment/${VLLM_DEPLOYMENT_NAME} --for=condition=Available --timeout=180s
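If one of these `kubectl wait` calls times out (a full vLLM deployment can spend several minutes pulling its image and loading model weights on first start), a quick way to inspect what is blocking the rollout is a sketch like the following; the deployment name and the `app.kubernetes.io/name=vllm` label assume the defaults exported earlier in this script:

```bash
# Check rollout progress first, then fall back to pod-level detail on failure.
kubectl -n "${NAMESPACE}" rollout status "deployment/${VLLM_DEPLOYMENT_NAME}" --timeout=60s || {
  kubectl -n "${NAMESPACE}" get pods -l app.kubernetes.io/name=vllm -o wide
  kubectl -n "${NAMESPACE}" describe deployment "${VLLM_DEPLOYMENT_NAME}"
}
```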