
Commit 6af705e

fix: better instructions for GAIE recipe (#4525) (#4554)

Signed-off-by: Anna Tchernych <[email protected]>
Signed-off-by: Dan Gil <[email protected]>
Co-authored-by: atchernych <[email protected]>

Parent: 985ca16

File tree

2 files changed: +16 −6 lines


recipes/README.md

Lines changed: 14 additions & 4 deletions
````diff
@@ -90,9 +90,13 @@ kubectl get storageclass
 **Step 1: Download Model**
 
 ```bash
+cd recipes
 # Update storageClassName in model-cache.yaml first!
 kubectl apply -f <model>/model-cache/ -n ${NAMESPACE}
 
+# Create model cache PVC
+kubectl apply -f <model>/model-cache/model-download.yaml -n ${NAMESPACE}
+
 # Wait for download to complete (may take 10-60 minutes depending on model size)
 kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s
 
````
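The `kubectl wait --for=condition=Complete ... --timeout=6000s` line above blocks until the download Job reports the `Complete` condition or the timeout expires. Its control flow can be sketched as a plain poll loop; `check_complete` below is a local stub standing in for a Job-status query, not part of the recipe:

```shell
#!/bin/sh
# Poll until a check succeeds or a bounded number of polls elapses,
# mirroring what `kubectl wait --for=condition=Complete --timeout=...` does.

attempts=0
check_complete() {
  # Stub for "is the Job's Complete condition true?": succeeds on the 3rd poll.
  attempts=$((attempts + 1))
  [ "$attempts" -ge 3 ]
}

wait_for_complete() {
  max_polls=$1  # bounded polls stand in for a wall-clock timeout
  i=0
  while [ "$i" -lt "$max_polls" ]; do
    if check_complete; then
      echo "condition met"
      return 0
    fi
    i=$((i + 1))
  done
  echo "timed out" >&2
  return 1
}

wait_for_complete 10
```

As with the real command, a non-zero exit on timeout lets a calling script abort before applying the serving manifests against an incomplete model cache.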

````diff
@@ -102,6 +106,8 @@ kubectl logs -f job/model-download -n ${NAMESPACE}
 
 **Step 2: Deploy Service**
 
+Update the image in `<model>/<framework>/<mode>/deploy.yaml`.
+
 ```bash
 kubectl apply -f <model>/<framework>/<mode>/deploy.yaml -n ${NAMESPACE}
 
````
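The added instruction asks you to update the image in `deploy.yaml` before applying it. One way to script that edit is a `sed` substitution; the manifest content, path, and tag below are illustrative placeholders, not values from the recipe:

```shell
#!/bin/sh
# Rewrite the image tag in a deploy manifest with sed.
# The manifest content, path, and tag are illustrative placeholders.
cat > /tmp/deploy-example.yaml <<'EOF'
    containers:
    - name: main
      image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
EOF

NEW_TAG=0.7.0
# Keep everything up to the last colon, swap only the tag.
sed -i.bak "s|\(image: nvcr\.io/nvidia/ai-dynamo/vllm-runtime:\).*|\1${NEW_TAG}|" /tmp/deploy-example.yaml
grep 'image:' /tmp/deploy-example.yaml
```

`sed -i.bak` keeps a backup of the original manifest, which is useful when the same file is edited per release.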

```diff
@@ -162,7 +168,9 @@ kubectl create secret generic hf-token-secret \
   -n ${NAMESPACE}
 
 # Deploy
+cd recipes
 kubectl apply -f llama-3-70b/model-cache/ -n ${NAMESPACE}
+kubectl apply -f llama-3-70b/model-cache/model-download.yaml -n ${NAMESPACE}
 kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s
 kubectl apply -f llama-3-70b/vllm/agg/deploy.yaml -n ${NAMESPACE}
 
```
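Every command in this quick-start interpolates `${NAMESPACE}`; if it is unset, the flag expands to an empty string and `kubectl` falls back to the current context's namespace. A small guard can fail fast instead; `require_namespace` is a hypothetical helper sketched here, not part of the recipe:

```shell
#!/bin/sh
# Abort early if NAMESPACE is unset or empty, instead of letting
# kubectl silently fall through to the current context's namespace.
require_namespace() {
  if [ -z "${NAMESPACE:-}" ]; then
    echo "error: NAMESPACE must be set, e.g. export NAMESPACE=dynamo" >&2
    return 1
  fi
  echo "deploying into ${NAMESPACE}"
}

# Demo value; in real use you would export NAMESPACE yourself.
NAMESPACE=demo
require_namespace
```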

````diff
@@ -174,13 +182,15 @@ kubectl port-forward svc/llama3-70b-agg-frontend 8000:8000 -n ${NAMESPACE}
 
 For Llama-3-70B with vLLM (Aggregated), an example of integration with the Inference Gateway is provided.
 
-Follow to Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE. Then apply manifests.
-Update the containers.epp.image in the deployment file, i.e. llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/deployment.yaml
-This should be the same image you have used for your deployment.
+First, deploy the Dynamo Graph per instructions above.
+
+Then follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE.
+
+Update the containers.epp.image in the deployment file, i.e. llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/deployment.yaml. It should match the release tag and be in the format `nvcr.io/nvidia/ai-dynamo/frontend:<my-tag>` i.e. `nvcr.io/nvstaging/ai-dynamo/dynamo-frontend:0.7.0rc2-amd64`
 
 ```bash
 export DEPLOY_PATH=llama-3-70b/vllm/agg/
-#DEPLOY_PATH=<model>/<framework>/<mode>/
+# DEPLOY_PATH=<model>/<framework>/<mode>/
 kubectl apply -R -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE"
 ```
 
````
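The new wording pins the epp image to the `nvcr.io/nvidia/ai-dynamo/frontend:<my-tag>` format. A quick format check can be scripted; the regex below is an assumption generalized from the two example references shown in the text, not an official naming rule:

```shell
#!/bin/sh
# Sanity-check that an image reference matches the frontend image format.
# The regex is an assumption generalized from the documented examples.
image_ok() {
  printf '%s' "$1" | grep -Eq '^nvcr\.io/[a-z]+/ai-dynamo/(dynamo-)?frontend:[A-Za-z0-9._-]+$'
}

image_ok "nvcr.io/nvstaging/ai-dynamo/dynamo-frontend:0.7.0rc2-amd64" && echo "ok"
image_ok "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag" || echo "rejected: not a frontend image"
```

Running such a check before `kubectl apply` catches the easy mistake of reusing the runtime image (the old default in this file) for the epp container.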

recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/deployment.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -38,7 +38,7 @@ spec:
 
       containers:
       - name: epp
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+        image: nvcr.io/nvidia/ai-dynamo/frontend:<my-tag>
         imagePullPolicy: IfNotPresent
         resources:
           requests:
```
```diff
@@ -76,7 +76,7 @@ spec:
         - name: DYNAMO_NAMESPACE
           value: "$(POD_NAMESPACE)-llama3-70b-agg"
         - name: DYNAMO_MODEL
-          value: "llama3-70b-agg"
+          value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
         - name: DYNAMO_KV_BLOCK_SIZE
           value: "128" # UPDATE to match the --block-size in your deploy.yaml engine command
         - name: USE_STREAMING
```
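The `DYNAMO_KV_BLOCK_SIZE` comment requires the value to match `--block-size` in the engine command of your `deploy.yaml`. A consistency check across the two manifests can be sketched as below; the file contents, paths, and args layout are stand-ins for illustration, not the recipe's actual files:

```shell
#!/bin/sh
# Cross-check DYNAMO_KV_BLOCK_SIZE in the epp manifest against
# --block-size in the engine deploy manifest. Contents are stand-ins.
cat > /tmp/epp-example.yaml <<'EOF'
        - name: DYNAMO_KV_BLOCK_SIZE
          value: "128"
EOF
cat > /tmp/engine-deploy-example.yaml <<'EOF'
        args: ["--model", "some-model", "--block-size", "128"]
EOF

epp_size=$(grep -A1 'DYNAMO_KV_BLOCK_SIZE' /tmp/epp-example.yaml | grep -Eo '[0-9]+')
engine_size=$(grep -Eo -- '--block-size", "[0-9]+' /tmp/engine-deploy-example.yaml | grep -Eo '[0-9]+$')
if [ "$epp_size" = "$engine_size" ]; then
  echo "block sizes match: $epp_size"
else
  echo "mismatch: epp=$epp_size engine=$engine_size" >&2
  exit 1
fi
```

A mismatch here is silent at deploy time but degrades KV-cache-aware routing, so a pre-apply check of this shape is cheap insurance.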
