neuralmagic
diff --git a/‎DEVELOPMENT.md
Lines changed: 20 additions & 3 deletions b/‎DEVELOPMENT.md
Lines changed: 20 additions & 3 deletions
diff --git a/‎Makefile
Lines changed: 2 additions & 5 deletions b/‎Makefile
Lines changed: 2 additions & 5 deletions
diff --git a/‎deploy/components/vllm-p2p/deployments/redis-deployment.yaml
Lines changed: 27 additions & 0 deletions b/‎deploy/components/vllm-p2p/deployments/redis-deployment.yaml
Lines changed: 27 additions & 0 deletions
diff --git a/‎deploy/components/vllm-p2p/deployments/secret.yaml
Lines changed: 11 additions & 0 deletions b/‎deploy/components/vllm-p2p/deployments/secret.yaml
Lines changed: 11 additions & 0 deletions
diff --git a/‎deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
Lines changed: 61 additions & 0 deletions b/‎deploy/components/vllm-p2p/deployments/vllm-deployment.yaml
Lines changed: 61 additions & 0 deletions
diff --git a/‎deploy/components/vllm-p2p/kustomization.yaml
Lines changed: 30 additions & 0 deletions b/‎deploy/components/vllm-p2p/kustomization.yaml
Lines changed: 30 additions & 0 deletions
diff --git a/‎deploy/components/vllm-p2p/pvc/volume.yaml
Lines changed: 18 additions & 0 deletions b/‎deploy/components/vllm-p2p/pvc/volume.yaml
Lines changed: 18 additions & 0 deletions
diff --git a/‎deploy/components/vllm-p2p/service/redis-service.yaml
Lines changed: 18 additions & 0 deletions b/‎deploy/components/vllm-p2p/service/redis-service.yaml
Lines changed: 18 additions & 0 deletions
diff --git a/‎deploy/components/vllm/deployments.yaml
Lines changed: 143 additions & 0 deletions b/‎deploy/components/vllm/deployments.yaml
Lines changed: 143 additions & 0 deletions
@@ -178,14 +178,31 @@ Export the name of the `Secret` to the environment:
 export REGISTRY_SECRET=anna-pull-secret
 ```
 
-Now you need to provide several other environment variables. You'll need to
-indicate the location and tag of the `vllm-sim` image:
+Set the `VLLM_MODE` environment variable based on which version of vLLM you want to deploy:
+
+- `vllm-sim`: Lightweight simulator for simple environments
+- `vllm`: Full vLLM model server for real inference
+- `vllm-p2p`: Full vLLM with LMCache P2P support for distributed KV caching
+
+```console
+export VLLM_MODE=vllm-sim  # or vllm / vllm-p2p
+```
+Each mode has default image values, but you can override them:
+
+For `vllm-sim`:
 
 ```console
 export VLLM_SIM_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
 export VLLM_SIM_TAG="<YOUR_TAG>"
 ```
 
+For `vllm` and `vllm-p2p`:
+
+```console
+export VLLM_IMAGE="<YOUR_REGISTRY>/<YOUR_IMAGE>"
+export VLLM_TAG="<YOUR_TAG>"
+```
+
 The same thing will need to be done for the EPP:
 
 ```console
@@ -203,7 +220,7 @@ This will deploy the entire stack to whatever namespace you chose. You can test
 by exposing the inference `Gateway` via port-forward:
 
 ```console
-kubectl -n ${NAMESPACE} port-forward service/inference-gateway-istio 8080:80
+kubectl -n ${NAMESPACE} port-forward service/inference-gateway 8080:80
 ```
 
 And making requests with `curl`:
 
@@ -780,11 +780,8 @@ environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst
 # ------------------------------------------------------------------------------
+# NOTE(review): cleanup is delegated to scripts/kubernetes-dev-env.sh (CLEAN=true). NAMESPACE is no longer
+# checked here, so the INFO line prints empty values when NAMESPACE/VLLM_MODE are unset for make.
 .PHONY: clean.environment.dev.kubernetes
 clean.environment.dev.kubernetes: check-kubectl check-kustomize check-envsubst
-ifndef NAMESPACE
-	$(error "Error: NAMESPACE is required but not set")
-endif
-	@echo "INFO: cleaning up dev environment in $(NAMESPACE)"
-	kustomize build deploy/environments/dev/kubernetes-kgateway | envsubst | kubectl -n "${NAMESPACE}" delete -f -
+	@CLEAN=true ./scripts/kubernetes-dev-env.sh 2>&1
+	@echo "INFO: Finish cleanup development environment for $(VLLM_MODE) mode in namespace $(NAMESPACE)"
 
 # -----------------------------------------------------------------------------
 # TODO: these are old aliases that we still need for the moment, but will be
 
@@ -0,0 +1,27 @@
+apiVersion: apps/v1
+kind: Deployment  # Redis instance labelled as the "redis-lookup-server" component
+metadata:
+  name: ${REDIS_NAME}  # placeholders are presumably filled by envsubst before apply - confirm
+  labels:
+    app.kubernetes.io/name: redis
+    app.kubernetes.io/component: redis-lookup-server
+spec:
+  replicas: ${REDIS_REPLICA_COUNT}  # must render to an integer
+  selector:
+    matchLabels:  # must stay identical to the pod template labels below
+      app.kubernetes.io/name: redis
+      app.kubernetes.io/component: redis-lookup-server
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: redis
+        app.kubernetes.io/component: redis-lookup-server
+    spec:
+      containers:
+        - name: lookup-server
+          image: ${REDIS_IMAGE}:${REDIS_TAG}
+          imagePullPolicy: Always  # image is re-pulled on every pod start
+          command:
+            - redis-server  # started with no arguments or config file
+          ports:
+            - containerPort: ${REDIS_TARGET_PORT}  # same variable as the Redis Service targetPort
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Secret  # token the vLLM container reads as HF_TOKEN through secretKeyRef
+metadata:
+  name: ${HF_SECRET_NAME}
+  namespace: ${NAMESPACE}
+  labels:
+    app.kubernetes.io/name: vllm
+    app.kubernetes.io/component: secret
+type: Opaque
+data:  # values under `data` must be base64-encoded (`stringData` takes plain text)
+  ${HF_SECRET_KEY}: ${HF_TOKEN}  # NOTE(review): assumes HF_TOKEN is already base64 - confirm in deploy script
@@ -0,0 +1,61 @@
+apiVersion: apps/v1
+kind: Deployment  # vLLM model server for the vllm-p2p mode (LMCache connector enabled)
+metadata:
+  name: ${VLLM_DEPLOYMENT_NAME}
+  labels:
+    app.kubernetes.io/name: vllm
+    app.kubernetes.io/model: ${MODEL_LABEL}
+    app.kubernetes.io/component: vllm
+spec:
+  replicas: ${VLLM_REPLICA_COUNT}  # must render to an integer
+  selector:
+    matchLabels:  # must stay identical to the pod template labels below
+      app.kubernetes.io/name: vllm
+      app.kubernetes.io/component: vllm
+      app.kubernetes.io/model: ${MODEL_LABEL}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm
+        app.kubernetes.io/component: vllm
+        app.kubernetes.io/model: ${MODEL_LABEL}
+    spec:
+      containers:
+        - name: vllm
+          image: ${VLLM_IMAGE}:${VLLM_TAG}
+          imagePullPolicy: Always
+          command:  # run through a shell so the export and the server start share one script
+            - /bin/sh
+            - "-c"
+          args:  # trailing backslashes keep every flag on the single `vllm serve` command line
+            - |
+              export LMCACHE_DISTRIBUTED_URL=${POD_IP}:80 &&
+              vllm serve ${MODEL_NAME} \
+              --host 0.0.0.0 \
+              --port 8000 \
+              --enable-chunked-prefill false \
+              --max-model-len ${MAX_MODEL_LEN} \
+              --kv-transfer-config \
+              '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
+          ports:
+            - name: http  # matches --port 8000 above
+              containerPort: 8000
+            - name: lmcache-dist  # matches the :80 suffix of LMCACHE_DISTRIBUTED_URL
+              containerPort: 80
+          env:
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: ${HF_SECRET_NAME}
+                  key: ${HF_SECRET_KEY}
+            - name: POD_IP  # the pod's own IP, injected at runtime
+              valueFrom:
+                fieldRef:  # NOTE(review): args use envsubst-style POD_IP; confirm it is not blanked at render time
+                  fieldPath: status.podIP
+          volumeMounts:
+            - name: model-storage
+              mountPath: ${VOLUME_MOUNT_PATH}
+      volumes:
+        - name: model-storage
+          persistentVolumeClaim:
+            claimName: ${PVC_NAME}  # same variable as the PersistentVolumeClaim name
@@ -0,0 +1,30 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # assembles the vllm-p2p component: vLLM, Redis and its Service, a PVC and a Secret
+
+namespace: ${NAMESPACE}  # applied to every resource listed below
+
+resources:
+  - deployments/vllm-deployment.yaml
+  - deployments/redis-deployment.yaml
+  - service/redis-service.yaml
+  - pvc/volume.yaml
+  - deployments/secret.yaml
+
+images:  # NOTE(review): matched on literal image names; the listed manifests use placeholders - confirm effect
+  - name: vllm/vllm-openai
+    newName: ${VLLM_IMAGE}
+    newTag: ${VLLM_TAG}
+  - name: redis
+    newName: ${REDIS_IMAGE}
+    newTag: ${REDIS_TAG}
+
+configMapGenerator:
+  - name: model-config  # NOTE(review): not referenced by the manifests listed above - confirm a consumer
+    literals:
+      - MODEL_NAME=${MODEL_NAME}
+      - MODEL_LABEL=${MODEL_LABEL}
+      - POOL_LABEL=${POOL_LABEL}
+      - REDIS_ENABLED=${REDIS_ENABLED}
+
+generatorOptions:
+  disableNameSuffixHash: true  # keep the generated ConfigMap name exactly "model-config"
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: PersistentVolumeClaim  # claim mounted by the vLLM pods as the "model-storage" volume
+metadata:
+  name: ${PVC_NAME}
+  namespace: ${NAMESPACE}
+  labels:
+    app.kubernetes.io/name: vllm
+    app.kubernetes.io/component: storage
+    app.kubernetes.io/model: ${MODEL_LABEL}
+  finalizers:  # NOTE(review): usually added by the control plane itself - confirm it must be set here
+    - kubernetes.io/pvc-protection
+spec:
+  accessModes:
+    - ${PVC_ACCESS_MODE}  # NOTE(review): replicas on different nodes need ReadWriteMany - confirm default
+  resources:
+    requests:
+      storage: ${PVC_SIZE}
+  storageClassName: ${PVC_STORAGE_CLASS}
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Service  # exposes the Redis lookup-server pods under the same name as their Deployment
+metadata:
+  name: ${REDIS_NAME}
+  namespace: ${NAMESPACE}
+  labels:
+    app.kubernetes.io/name: redis
+    app.kubernetes.io/component: redis-lookup-server
+spec:
+  ports:
+    - name: lookupserver-port
+      protocol: TCP
+      port: ${REDIS_PORT}  # port clients connect to on the Service
+      targetPort: ${REDIS_TARGET_PORT}  # same variable as the Redis containerPort
+  type: ${REDIS_SERVICE_TYPE}
+  selector:  # same two labels as the Redis Deployment pod template
+    app.kubernetes.io/name: redis
+    app.kubernetes.io/component: redis-lookup-server
@@ -0,0 +1,143 @@
+apiVersion: apps/v1
+kind: Deployment  # GPU vLLM server (Llama-3.1-8B-Instruct) with a LoRA adapter syncer sidecar
+metadata:
+  name: ${VLLM_DEPLOYMENT_NAME}
+spec:
+  replicas: 3  # fixed here, unlike the templated count in the vllm-p2p component
+  selector:
+    matchLabels:
+      app: vllm-llama3-8b-instruct
+  template:
+    metadata:
+      labels:
+        app: vllm-llama3-8b-instruct
+    spec:
+      securityContext:
+        runAsUser: ${PROXY_UID}  # must render to a non-zero UID because runAsNonRoot is set
+        runAsNonRoot: true
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: vllm
+          image: "vllm/vllm-openai:latest"
+          imagePullPolicy: IfNotPresent
+          command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+          args:
+            - "--model"
+            - "meta-llama/Llama-3.1-8B-Instruct"
+            - "--tensor-parallel-size"
+            - "1"
+            - "--port"
+            - "8000"
+            - "--max-num-seqs"  # full flag name; the shortened form only worked via argparse prefix matching
+            - "1024"
+            - "--compilation-config"
+            - "3"
+            - "--enable-lora"
+            - "--max-loras"
+            - "2"
+            - "--max-lora-rank"
+            - "8"
+            - "--max-cpu-loras"
+            - "12"
+          env:
+            - name: VLLM_USE_V1
+              value: "1"
+            - name: PORT
+              value: "8000"
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:  # hard-coded Secret name and key; the Secret must already exist
+                  name: hf-token
+                  key: token
+            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
+              value: "true"
+            - name: XDG_CACHE_HOME  # all three cache dirs live on the /cache emptyDir mount
+              value: /cache
+            - name: HF_HOME
+              value: /cache/huggingface
+            - name: FLASHINFER_CACHE_DIR
+              value: /cache/flashinfer
+          ports:
+            - containerPort: 8000
+              name: http
+              protocol: TCP
+          lifecycle:
+            preStop:
+              sleep:  # 30 s pause before shutdown; fits inside the 130 s termination grace period
+                seconds: 30
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+              scheme: HTTP
+            periodSeconds: 1
+            successThreshold: 1
+            failureThreshold: 5  # restart after 5 consecutive failed 1 s checks
+            timeoutSeconds: 1
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+              scheme: HTTP
+            periodSeconds: 1
+            successThreshold: 1
+            failureThreshold: 1  # a single failed check marks the pod not ready
+            timeoutSeconds: 1
+          startupProbe:
+            httpGet:
+              path: /health
+              port: http
+              scheme: HTTP
+            failureThreshold: 600  # 600 checks x 1 s period: roughly 10 minutes to finish starting
+            initialDelaySeconds: 2
+            periodSeconds: 1
+          resources:
+            limits:
+              nvidia.com/gpu: 1
+            requests:
+              nvidia.com/gpu: 1
+          volumeMounts:
+            - mountPath: /cache
+              name: hf-cache
+            - mountPath: /dev/shm
+              name: shm
+            - mountPath: /adapters
+              name: adapters
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+      initContainers:
+        - name: lora-adapter-syncer
+          tty: true
+          stdin: true
+          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
+          restartPolicy: Always  # init container with restartPolicy Always runs as a sidecar
+          imagePullPolicy: Always
+          env:
+            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
+              value: "/config/configmap.yaml"
+          volumeMounts:
+            - name: config-volume
+              mountPath: /config
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+      restartPolicy: Always
+      enableServiceLinks: false
+      terminationGracePeriodSeconds: 130
+      volumes:
+        - name: hf-cache
+          emptyDir: {}
+        - name: shm
+          emptyDir:
+            medium: Memory
+        - name: adapters  # mounted only by the vllm container; the syncer does not mount it
+          emptyDir: {}
+        - name: config-volume
+          configMap:  # this ConfigMap is not defined in the manifest; it must exist in the namespace
+            name: vllm-llama3-8b-instruct-adapters