Upload prefill and decode heavy benchmarking configs

rlakhtakia · rlakhtakia · commit 6cac7b1176a3 · 2025-11-11T19:07:21.000Z
diff --git a/benchmarking/prefix-cache-aware/high-cache-values.yaml b/benchmarking/prefix-cache-aware/high-cache-values.yaml
@@ -23,6 +23,12 @@ logLevel: INFO
 # NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/dataset.json.
 gcsPath: ""
 
+# An S3 bucket path that points to the dataset file.
+# The file will be copied from this path to the local file system
+# at /dataset/s3-dataset.json for use during the run.
+# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/s3-dataset.json.
+s3Path: ""
+
 # hfToken optionally creates a secret with the specified token.
 # Can be set using helm install --set hftoken=<token>
 hfToken: ""
diff --git a/benchmarking/prefix-cache-aware/low-cache-values.yaml b/benchmarking/prefix-cache-aware/low-cache-values.yaml
@@ -23,6 +23,12 @@ logLevel: INFO
 # NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/dataset.json.
 gcsPath: ""
 
+# An S3 bucket path that points to the dataset file.
+# The file will be copied from this path to the local file system
+# at /dataset/s3-dataset.json for use during the run.
+# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/s3-dataset.json.
+s3Path: ""
+
 # hfToken optionally creates a secret with the specified token.
 # Can be set using helm install --set hftoken=<token>
 hfToken: ""
diff --git a/benchmarking/single-workload/decode-heavy-values.yaml b/benchmarking/single-workload/decode-heavy-values.yaml
@@ -0,0 +1,76 @@
+# Decode Heavy Configuration
+job:
+  image:
+    repository: quay.io/inference-perf/inference-perf
+    tag: "0.2.0" # Defaults to .Chart.AppVersion
+  serviceAccountName: ""
+  nodeSelector: {}
+  # Example resources:
+  # resources:
+  #   requests:
+  #     cpu: "1"
+  #     memory: "4Gi"
+  #   limits:
+  #     cpu: "2"
+  #     memory: "8Gi"
+  resources: {}
+
+logLevel: INFO
+
+# A GCS bucket path that points to the dataset file.
+# The file will be copied from this path to the local file system
+# at /dataset/gcs-dataset.json for use during the run.
+# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/gcs-dataset.json.
+gcsPath: ""
+
+# An S3 bucket path that points to the dataset file.
+# The file will be copied from this path to the local file system
+# at /dataset/s3-dataset.json for use during the run.
+# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/s3-dataset.json.
+s3Path: ""
+
+# hfToken optionally creates a secret with the specified token.
+# Can be set using helm install --set hfToken=<token>
+hfToken: ""
+
+config:
+  load:
+    type: constant
+    interval: 15
+    stages:
+    - rate: 200
+      duration: 60
+    - rate: 210
+      duration: 60
+    - rate: 220
+      duration: 60
+    worker_max_concurrency: 1000
+  api:
+    type: completion
+    streaming: true
+  server:
+    type: vllm
+    model_name: meta-llama/Llama-3.1-8B-Instruct
+    base_url: http://0.0.0.0:8000
+    ignore_eos: true
+  tokenizer:
+    pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
+  data:
+    type: infinity_instruct
+    path: ""
+    input_distribution:
+      max: 1024
+    output_distribution:
+      max: 1024
+  metrics:
+    type: prometheus
+    prometheus:
+      google_managed: true
+  report:
+    request_lifecycle:
+      summary: true
+      per_stage: true
+      per_request: true
+    prometheus:
+      summary: true
+      per_stage: true
diff --git a/benchmarking/single-workload/prefill-heavy-values.yaml b/benchmarking/single-workload/prefill-heavy-values.yaml
@@ -0,0 +1,77 @@
+# Prefill Heavy Configuration
+job:
+  image:
+    repository: quay.io/inference-perf/inference-perf
+    tag: "0.2.0" # Defaults to .Chart.AppVersion
+  serviceAccountName: ""
+  nodeSelector: {}
+  # Example resources:
+  # resources:
+  #   requests:
+  #     cpu: "1"
+  #     memory: "4Gi"
+  #   limits:
+  #     cpu: "2"
+  #     memory: "8Gi"
+  resources: {}
+
+logLevel: INFO
+
+# A GCS bucket path that points to the dataset file.
+# The file will be copied from this path to the local file system
+# at /dataset/gcs-dataset.json for use during the run.
+# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/gcs-dataset.json.
+gcsPath: ""
+
+# An S3 bucket path that points to the dataset file.
+# The file will be copied from this path to the local file system
+# at /dataset/s3-dataset.json for use during the run.
+# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/s3-dataset.json.
+s3Path: ""
+
+# hfToken optionally creates a secret with the specified token.
+# Can be set using helm install --set hfToken=<token>
+hfToken: ""
+
+config:
+  load:
+    type: constant
+    interval: 15
+    stages:
+    - rate: 300
+      duration: 30
+    - rate: 310
+      duration: 30
+    - rate: 320
+      duration: 30
+    - rate: 330
+      duration: 30
+  api:
+    type: completion
+    streaming: true
+  server:
+    type: vllm
+    model_name: meta-llama/Llama-3.1-8B-Instruct
+    base_url: http://0.0.0.0:8000
+    ignore_eos: true
+  tokenizer:
+    pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
+  data:
+    type: billsum_conversations
+    path: ""
+    input_distribution:
+      max: 1024
+    output_distribution:
+      max: 1024
+  metrics:
+    type: prometheus
+    prometheus:
+      google_managed: true
+  report:
+    request_lifecycle:
+      summary: true
+      per_stage: true
+      per_request: true
+    prometheus:
+      summary: true
+      per_stage: true
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -84,6 +84,8 @@ nav:
     - Benchmark: performance/benchmark/index.md
     - Advanced Benchmarking Configs: 
       - Prefix Cache Aware: performance/benchmark/advanced-configs/prefix-cache-aware.md
+      - Decode Heavy Workload: performance/benchmark/advanced-configs/decode-heavy-workload.md
+      - Prefill Heavy Workload: performance/benchmark/advanced-configs/prefill-heavy-workload.md
     - Regression Testing: performance/regression-testing/index.md
   - Reference:
     - v1 API Reference: reference/spec.md
diff --git a/site-src/performance/benchmark/advanced-configs/decode-heavy-workload.md b/site-src/performance/benchmark/advanced-configs/decode-heavy-workload.md
@@ -0,0 +1,97 @@
+# Decode Heavy Workload Benchmarking
+This guide shows how to deploy a decode-heavy benchmarking config using inference-perf.
+
+## Prerequisites
+
+Before you begin, ensure you have the following:
+
+*   **Helm 3+**: [Installation Guide](https://helm.sh/docs/intro/install/)
+*   **Kubernetes Cluster**: Access to a Kubernetes cluster
+*   **Hugging Face Token Secret**: A Hugging Face token to pull models.
+*   **Gateway Deployed**: Your inference server/gateway must be deployed and accessible within the cluster.
+
+Follow the [benchmarking guide](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/#benchmark) for more information on how to set up the gateway and how to validate benchmark results.
+
+## Infinity Instruct Dataset Configuration
+
+The chart uses the `infinity_instruct` [dataset type](https://huggingface.co/datasets/BAAI/Infinity-Instruct). 
+
+>NOTE: Currently, the dataset must be downloaded and supplied to inference-perf manually. When deploying with Helm, upload the dataset file to a GCS or S3 bucket and reference it with `gcsPath` or `s3Path`. Alternatively, follow the inference-perf guides to run locally with a local dataset file path.
+
+## Deployment
+
+### 1. Check out the repo.
+
+```bash
+git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
+cd gateway-api-inference-extension/benchmarking/single-workload
+```
+
+### 2. Get the target IP. 
+
+  The examples below show how to get the IP of a gateway or a k8s service.
+
+  ```bash
+  # Get gateway IP
+  GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+  # Get LoadBalancer k8s service IP
+  SVC_IP=$(kubectl get service/vllm-llama3-8b-instruct -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+  echo $GW_IP
+  echo $SVC_IP
+  ```
+
+### 3. Deploying the Decode Heavy Configuration
+
+This configuration generates a decode-heavy workload using the `infinity_instruct` dataset. It uses the `decode-heavy-values.yaml` file.
+
+=== "Google Cloud Storage (GCS)"
+    Use the `gcsPath` field to provide your dataset stored on GCS. The dataset will be downloaded from the bucket and stored locally on the cluster at `/dataset/gcs-dataset.json`. 
+    ```bash
+    export IP='<YOUR_IP>'
+    export PORT='<YOUR_PORT>'
+    export HF_TOKEN='<YOUR_HUGGINGFACE_TOKEN>'
+    helm install decode-heavy ../inference-perf -f decode-heavy-values.yaml \
+      --set hfToken=${HF_TOKEN} \
+      --set "config.server.base_url=http://${IP}:${PORT}" \
+      --set "config.data.path=/dataset/gcs-dataset.json" \
+      --set "gcsPath=<PATH TO DATASET FILE ON GCS BUCKET>"
+    ```
+    **Parameters to customize:**
+    
+    *   `decode-heavy`: A unique name for this deployment.
+    *   `hfToken`: Your Hugging Face token; the chart creates a secret from it.
+    *   `config.server.base_url`: The base URL (IP and port) of your inference server or gateway.
+    *   `gcsPath`: The path to the dataset file hosted on your GCS bucket.
+
+=== "Simple Storage Service (S3)"
+    Use the `s3Path` field to provide your dataset stored on S3. The dataset will be downloaded from the bucket and stored locally on the cluster at `/dataset/s3-dataset.json`. 
+    ```bash
+    export IP='<YOUR_IP>'
+    export PORT='<YOUR_PORT>'
+    export HF_TOKEN='<YOUR_HUGGINGFACE_TOKEN>'
+    helm install decode-heavy ../inference-perf -f decode-heavy-values.yaml \
+      --set hfToken=${HF_TOKEN} \
+      --set "config.server.base_url=http://${IP}:${PORT}" \
+      --set "config.data.path=/dataset/s3-dataset.json" \
+      --set "s3Path=<PATH TO DATASET FILE ON S3 BUCKET>"
+    ```
+    **Parameters to customize:**
+    
+    *   `decode-heavy`: A unique name for this deployment.
+    *   `hfToken`: Your Hugging Face token; the chart creates a secret from it.
+    *   `config.server.base_url`: The base URL (IP and port) of your inference server or gateway.
+    *   `s3Path`: The path to the dataset file hosted on your S3 bucket.
+
+## Clean Up
+
+To uninstall the deployed charts:
+
+```bash
+helm uninstall decode-heavy
+```
+
+## Post Benchmark Analysis
+Follow the benchmarking guide instructions to [compare benchmark results](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/#analyze-the-results).
diff --git a/site-src/performance/benchmark/advanced-configs/prefill-heavy-workload.md b/site-src/performance/benchmark/advanced-configs/prefill-heavy-workload.md
@@ -0,0 +1,98 @@
+# Prefill Heavy Workload Benchmarking
+This guide shows how to deploy a prefill-heavy benchmarking config using inference-perf.
+
+## Prerequisites
+
+Before you begin, ensure you have the following:
+
+*   **Helm 3+**: [Installation Guide](https://helm.sh/docs/intro/install/)
+*   **Kubernetes Cluster**: Access to a Kubernetes cluster
+*   **Hugging Face Token Secret**: A Hugging Face token to pull models.
+*   **Gateway Deployed**: Your inference server/gateway must be deployed and accessible within the cluster.
+
+Follow the [benchmarking guide](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/#benchmark) for more information on how to set up the gateway and how to validate benchmark results.
+
+## Billsum Conversations Dataset Configuration
+
+The chart uses the `billsum_conversations` [dataset type](https://huggingface.co/datasets/FiscalNote/billsum).
+
+>NOTE: Currently, the dataset must be downloaded and supplied to inference-perf manually. When deploying with Helm, upload the dataset file to a GCS or S3 bucket and reference it with `gcsPath` or `s3Path`. Alternatively, follow the inference-perf guides to run locally with a local dataset file path.
+
+## Deployment
+
+### 1. Check out the repo.
+
+```bash
+git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
+cd gateway-api-inference-extension/benchmarking/single-workload
+```
+
+### 2. Get the target IP. 
+
+  The examples below show how to get the IP of a gateway or a k8s service.
+
+  ```bash
+  # Get gateway IP
+  GW_IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+  # Get LoadBalancer k8s service IP
+  SVC_IP=$(kubectl get service/vllm-llama3-8b-instruct -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+  echo $GW_IP
+  echo $SVC_IP
+  ```
+
+### 3. Deploying the Prefill Heavy Configuration
+
+This configuration generates a prefill-heavy workload using the `billsum_conversations` dataset. It uses the `prefill-heavy-values.yaml` file.
+
+=== "Google Cloud Storage (GCS)"
+    Use the `gcsPath` field to provide your dataset stored on GCS. The dataset will be downloaded from the bucket and stored locally on the cluster at `/dataset/gcs-dataset.json`. 
+    ```bash
+    export IP='<YOUR_IP>'
+    export PORT='<YOUR_PORT>'
+    export HF_TOKEN='<YOUR_HUGGINGFACE_TOKEN>'
+    helm install prefill-heavy ../inference-perf -f prefill-heavy-values.yaml \
+      --set hfToken=${HF_TOKEN} \
+      --set "config.server.base_url=http://${IP}:${PORT}" \
+      --set "config.data.path=/dataset/gcs-dataset.json" \
+      --set "gcsPath=<PATH TO DATASET FILE ON GCS BUCKET>"
+    ```
+    **Parameters to customize:**
+    
+    *   `prefill-heavy`: A unique name for this deployment.
+    *   `hfToken`: Your Hugging Face token; the chart creates a secret from it.
+    *   `config.server.base_url`: The base URL (IP and port) of your inference server or gateway.
+    *   `gcsPath`: The path to the dataset file hosted on your GCS bucket.
+
+=== "Simple Storage Service (S3)"
+    Use the `s3Path` field to provide your dataset stored on S3. The dataset will be downloaded from the bucket and stored locally on the cluster at `/dataset/s3-dataset.json`. 
+    ```bash
+    export IP='<YOUR_IP>'
+    export PORT='<YOUR_PORT>'
+    export HF_TOKEN='<YOUR_HUGGINGFACE_TOKEN>'
+    helm install prefill-heavy ../inference-perf -f prefill-heavy-values.yaml \
+      --set hfToken=${HF_TOKEN} \
+      --set "config.server.base_url=http://${IP}:${PORT}" \
+      --set "config.data.path=/dataset/s3-dataset.json" \
+      --set "s3Path=<PATH TO DATASET FILE ON S3 BUCKET>"
+    ```
+
+    **Parameters to customize:**
+    
+    *   `prefill-heavy`: A unique name for this deployment.
+    *   `hfToken`: Your Hugging Face token; the chart creates a secret from it.
+    *   `config.server.base_url`: The base URL (IP and port) of your inference server or gateway.
+    *   `s3Path`: The path to the dataset file hosted on your S3 bucket.
+
+## Clean Up
+
+To uninstall the deployed charts:
+
+```bash
+helm uninstall prefill-heavy
+```
+
+## Post Benchmark Analysis
+Follow the benchmarking guide instructions to [compare benchmark results](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/#analyze-the-results).
diff --git a/site-src/performance/benchmark/index.md b/site-src/performance/benchmark/index.md
@@ -173,9 +173,11 @@ detailed list of configuration knobs.
 
 The following is a list of advanced configurations available.
 
-| Guides | Config | Directory | Config(s)
-| :--- | :--- | :--- | :--- |
-| [Prefix Cache Aware Guide](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/advanced-configs/prefix-cache-aware/#prefix-cache-aware-benchmarking) | `prefix-cache-aware` | `benchamrking/prefix-cache-aware`  | `high-cache-values.yaml` `low-cache-values.yaml` |
+| Guide | Directory | Config(s) |
+| :--- | :--- | :--- |
+| [Prefix Cache Aware Guide](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/advanced-configs/prefix-cache-aware/#prefix-cache-aware-benchmarking) | `benchmarking/prefix-cache-aware` | `high-cache-values.yaml` `low-cache-values.yaml` |
+| [Decode Heavy Guide](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/advanced-configs/decode-heavy-workload/) | `benchmarking/single-workload` | `decode-heavy-values.yaml` |
+| [Prefill Heavy Guide](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/advanced-configs/prefill-heavy-workload/) | `benchmarking/single-workload` | `prefill-heavy-values.yaml` |
 
 ## Analyze the results