diff --git a/.gitignore b/.gitignore
index d4186ed6..5e4d213a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -178,3 +178,6 @@ cython_debug/
 # Project specific files
 *.json
 *.yaml
+
+# Allow specific YAML files
+!docs/guides/k8s/*.yaml
diff --git a/docs/guides/example-analysis/README.md b/docs/guides/example-analysis/README.md
new file mode 100644
index 00000000..5c1970a2
--- /dev/null
+++ b/docs/guides/example-analysis/README.md
@@ -0,0 +1,44 @@
+# GuideLLM Example Analysis
+
+This directory contains an example analysis script for GuideLLM performance testing.
+
+## Running Benchmarks in Kubernetes
+
+To run comprehensive GuideLLM benchmarks in Kubernetes, follow the instructions in [k8s/README.md](../k8s/README.md). This will help you:
+
+- Set up the necessary Kubernetes environment
+- Configure benchmark parameters
+- Execute the benchmarks
+- Collect performance data
+
+## Analyzing Results
+
+### Using the Analysis Script
+
+The [analyze_benchmarks.py](./analyze_benchmarks.py) script processes the benchmark YAML output and generates visualizations and summary statistics. To use it:
+
+1. Install the required dependencies:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+2. Copy the GuideLLM benchmark YAML file from the Kubernetes guidellm-job pod to your local environment:
+
+   ```bash
+   # From the k8s/README.md instructions
+   kubectl cp <pod-name>:/path/to/benchmark.yaml ./llama32-3b.yaml
+   ```
+
+3. Run the analysis script (make sure the YAML file is in the same directory):
+
+   ```bash
+   python analyze_benchmarks.py
+   ```
+
+The script will:
+
+- Process the benchmark YAML file
+- Generate visualizations in the `benchmark_plots` directory
+- Create a CSV file with the processed metrics
+- Print summary statistics
diff --git a/docs/guides/example-analysis/analyze_benchmarks.py b/docs/guides/example-analysis/analyze_benchmarks.py
new file mode 100644
index 00000000..763cb964
--- /dev/null
+++ b/docs/guides/example-analysis/analyze_benchmarks.py
@@ -0,0 +1,140 @@
+import yaml
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pathlib import Path
+
+def process_benchmark_yaml(yaml_file):
+    """Process the benchmark YAML file and return a DataFrame with the data."""
+    with open(yaml_file, 'r') as f:
+        data = yaml.safe_load(f)
+
+    # Extract concurrency levels from the benchmark configuration
+    concurrency_levels = data['benchmarks'][0]['args']['profile']['measured_concurrencies']
+
+    # Process metrics for each concurrency level
+    processed_data = []
+    for i, benchmark in enumerate(data['benchmarks']):
+        if 'metrics' in benchmark:
+            metrics = benchmark['metrics']
+            concurrency = concurrency_levels[i] if i < len(concurrency_levels) else 1.0
+
+            # Extract successful metrics
+            for metric_name, metric_data in metrics.items():
+                if 'successful' in metric_data:
+                    successful = metric_data['successful']
+                    processed_data.append({
+                        'concurrency': concurrency,
+                        'metric': metric_name,
+                        'count': successful.get('count', 0),
+                        'mean': successful.get('mean', 0),
+                        'median': successful.get('median', 0),
+                        'min': successful.get('min', 0),
+                        'max': successful.get('max', 0),
+                        'std_dev': successful.get('std_dev', 0),
+                        'p95': successful.get('percentiles', {}).get('p95', 0),
+                        'p99': successful.get('percentiles', {}).get('p99', 0)
+                    })
+
+    # Convert to DataFrame
+    df = pd.DataFrame(processed_data)
+    return df
+
+def create_visualizations(df):
+    """Create visualizations for the benchmark data."""
+    # Create plots directory if it doesn't exist
+    plot_dir = Path('benchmark_plots')
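+    # The directory is reused across runs (mkdir below is a no-op when it already
+    # exists); each metric produces three PNGs: mean vs median, min-max range, and
+    # percentiles, all plotted against concurrency.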
+    plot_dir.mkdir(exist_ok=True)
+
+    # Set style
+    plt.style.use('default')
+
+    # Sort by concurrency for better visualization
+    df = df.sort_values('concurrency')
+
+    # Create visualizations for each metric
+    metrics_to_plot = [
+        'request_latency',
+        'time_to_first_token_ms',
+        'tokens_per_second',
+        'inter_token_latency_ms'
+    ]
+
+    for metric in metrics_to_plot:
+        metric_df = df[df['metric'] == metric]
+        if not metric_df.empty:
+            # Mean vs Median
+            plt.figure(figsize=(12, 6))
+            plt.plot(metric_df['concurrency'], metric_df['mean'], 'b-', label='Mean')
+            plt.plot(metric_df['concurrency'], metric_df['median'], 'r--', label='Median')
+            plt.title(f'{metric.replace("_", " ").title()} vs Concurrency')
+            plt.xlabel('Concurrency Level')
+            plt.ylabel('Value')
+            plt.legend()
+            plt.grid(True)
+            plt.tight_layout()
+            plt.savefig(plot_dir / f'{metric}_mean_median.png')
+            plt.close()
+
+            # Min-Max Range
+            plt.figure(figsize=(12, 6))
+            plt.fill_between(metric_df['concurrency'],
+                             metric_df['min'],
+                             metric_df['max'],
+                             alpha=0.3,
+                             label='Min-Max Range')
+            plt.plot(metric_df['concurrency'], metric_df['mean'], 'b-', label='Mean')
+            plt.title(f'{metric.replace("_", " ").title()} Range vs Concurrency')
+            plt.xlabel('Concurrency Level')
+            plt.ylabel('Value')
+            plt.legend()
+            plt.grid(True)
+            plt.tight_layout()
+            plt.savefig(plot_dir / f'{metric}_range.png')
+            plt.close()
+
+            # Percentiles
+            plt.figure(figsize=(12, 6))
+            plt.plot(metric_df['concurrency'], metric_df['p95'], 'g--', label='95th Percentile')
+            plt.plot(metric_df['concurrency'], metric_df['p99'], 'r--', label='99th Percentile')
+            plt.plot(metric_df['concurrency'], metric_df['mean'], 'b-', label='Mean')
+            plt.title(f'{metric.replace("_", " ").title()} Percentiles vs Concurrency')
+            plt.xlabel('Concurrency Level')
+            plt.ylabel('Value')
+            plt.legend()
+            plt.grid(True)
+            plt.tight_layout()
+            plt.savefig(plot_dir / f'{metric}_percentiles.png')
+            plt.close()
+
+def main():
+    # Process the YAML file
+    df = process_benchmark_yaml('llama32-3b.yaml')
+
+    # Create visualizations
+    create_visualizations(df)
+
+    # Print summary statistics by concurrency level
+    print("\nSummary Statistics by Concurrency Level:")
+    for concurrency in sorted(df['concurrency'].unique()):
+        print(f"\nConcurrency Level: {concurrency:.2f}")
+        subset = df[df['concurrency'] == concurrency]
+
+        for metric in subset['metric'].unique():
+            metric_data = subset[subset['metric'] == metric]
+            print(f"\n{metric.replace('_', ' ').title()}:")
+            print(f"Count: {metric_data['count'].iloc[0]}")
+            print(f"Mean: {metric_data['mean'].iloc[0]:.2f}")
+            print(f"Median: {metric_data['median'].iloc[0]:.2f}")
+            print(f"Min: {metric_data['min'].iloc[0]:.2f}")
+            print(f"Max: {metric_data['max'].iloc[0]:.2f}")
+            print(f"Std Dev: {metric_data['std_dev'].iloc[0]:.2f}")
+            print(f"95th Percentile: {metric_data['p95'].iloc[0]:.2f}")
+            print(f"99th Percentile: {metric_data['p99'].iloc[0]:.2f}")
+
+    # Save processed data
+    df.to_csv('benchmark_processed_data.csv', index=False)
+    print("\nProcessed data saved to benchmark_processed_data.csv")
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/guides/example-analysis/requirements.txt b/docs/guides/example-analysis/requirements.txt
new file mode 100644
index 00000000..118ce135
--- /dev/null
+++ b/docs/guides/example-analysis/requirements.txt
@@ -0,0 +1,4 @@
+pyyaml>=6.0
+pandas>=2.0.0
+matplotlib>=3.7.0
+seaborn>=0.12.0
diff --git a/docs/guides/k8s/Dockerfile b/docs/guides/k8s/Dockerfile
new file mode 100644
index 00000000..8696c8e7
--- /dev/null
+++ b/docs/guides/k8s/Dockerfile
@@ -0,0 +1,12 @@
+FROM registry.access.redhat.com/ubi9/python-312:9.5-1744198409
+
+RUN pip install --upgrade pip && \
+    pip install git+https://github.com/neuralmagic/guidellm.git@main
+
+# Replace these env vars in the guidellm-job.yaml
+ENV TARGET=http://localhost:8000/v1 \
+    MODEL=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 \
+    DATA_TYPE=emulated \
+    DATA=prompt_tokens=512,generated_tokens=128
+
+ENTRYPOINT ["guidellm"]
diff --git a/docs/guides/k8s/README.md b/docs/guides/k8s/README.md
new file mode 100644
index 00000000..a5a89b31
--- /dev/null
+++ b/docs/guides/k8s/README.md
@@ -0,0 +1,53 @@
+## Run GuideLLM with a Kubernetes Job
+
+Here's an example of running `guidellm` against `meta-llama/Llama-3.2-3B-Instruct` deployed with
+[llm-d-deployer](https://github.com/neuralmagic/llm-d-deployer/blob/main/quickstart/README-minikube.md).
+Replace the `--target` and the references to `Llama-3.2-3B` in [guidellm-job.yaml](./guidellm-job.yaml) to evaluate any served LLM.
+
+### Run evaluation
+
+```bash
+# Update the claimName in accessor-pod.yaml and guidellm-job.yaml if using a different PVC name
+kubectl apply -f pvc.yaml
+kubectl apply -f guidellm-job.yaml
+```
+
+> **📝 NOTE:** [Dockerfile](./Dockerfile) was used to build the image for the guidellm-job pod.
+
+> **📝 NOTE:** The HF_TOKEN is passed to the job, but it is not necessary if you use the same PVC as the one storing your model.
+> GuideLLM uses the model's tokenizer/processor files in its evaluation. You can pass a path instead with `--tokenizer=/path/to/model`,
+> which eliminates the need for GuideLLM to download the files from Hugging Face.
+
+The job logs will show tables that summarize the results, and a large YAML report file is also created. The evaluation for this model
+takes ~20-30 minutes.
+
+### Extract GuideLLM Report
+
+```bash
+kubectl apply -f accessor-pod.yaml
+
+# Wait for the pod to be ready
+kubectl wait --for=condition=Ready pod/guidellm-accessor
+
+# Copy the report archive from the pod (the accessor pod mounts the volume read-only)
+kubectl cp guidellm-accessor:/app/data/guidellm-reports.tgz ./guidellm-reports.tgz
+```
+
+Extract the report:
+
+```bash
+tar -xvf guidellm-reports.tgz
+```
+
+You will now have a local file `./llama32-3b.yaml`.
+
+You can remove the accessor pod with:
+
+```bash
+kubectl delete pod guidellm-accessor
+```
+
+### Gather Insights from GuideLLM Report
+
+You can follow the ["Analyzing Results" section](../example-analysis/README.md#analyzing-results) to gain insights from your LLM
+deployments using the GuideLLM report.
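+
+Before running the full analysis, you can sanity-check the extracted report. This is a minimal
+sketch (assuming Python with `pyyaml` installed, and that the report keeps the top-level
+`benchmarks` list that the example analysis script reads):
+
+```python
+import yaml
+
+# Path of the report extracted above
+with open("llama32-3b.yaml") as f:
+    report = yaml.safe_load(f)
+
+# One entry per benchmark run in the sweep
+print(f"Benchmarks recorded: {len(report['benchmarks'])}")
+```
diff --git a/docs/guides/k8s/accessor-pod.yaml b/docs/guides/k8s/accessor-pod.yaml
new file mode 100644
index 00000000..51fd7ee7
--- /dev/null
+++ b/docs/guides/k8s/accessor-pod.yaml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: guidellm-accessor
+spec:
+  containers:
+  - command:
+    - sleep
+    - "3600"
+    image: registry.access.redhat.com/ubi9/ubi
+    name: accessor
+    volumeMounts:
+    - mountPath: /app/data
+      name: output
+      readOnly: true
+  volumes:
+  - name: output
+    persistentVolumeClaim:
+      claimName: guidellm-output-pvc
+      readOnly: true
diff --git a/docs/guides/k8s/guidellm-job.yaml b/docs/guides/k8s/guidellm-job.yaml
new file mode 100644
index 00000000..8345f129
--- /dev/null
+++ b/docs/guides/k8s/guidellm-job.yaml
@@ -0,0 +1,84 @@
+# This job takes ~25min to complete.
+# This will create a very large yaml file.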
+# To extract the file, run:
+#   kubectl apply -f accessor-pod.yaml
+#   mkdir ./guidellm-reports
+#   kubectl cp guidellm-accessor:/app/data/guidellm-reports.tgz ./guidellm-reports/guidellm-reports.tgz
+# You will now have a local ./guidellm-reports/guidellm-reports.tgz. To extract it, run:
+#   cd guidellm-reports && tar -xvf guidellm-reports.tgz
+# You will now have a local file ./guidellm-reports/llama32-3b.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: run-guidellm
+spec:
+  template:
+    spec:
+      containers:
+      - name: guidellm
+        # TODO: replace this image
+        image: quay.io/sallyom/guidellm:latest
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          runAsNonRoot: true
+          seccompProfile:
+            type: RuntimeDefault
+        args:
+        - benchmark
+        - --target=$(TARGET)
+        - --data=$(DATA)
+        - --rate-type=sweep
+        - --model=$(MODEL)
+        - --output-path=/app/data/llama32-3b.yaml
+        env:
+        # HF_TOKEN is not necessary if you share/use the model PVC. Guidellm needs to access the tokenizer file.
+        # You can provide a path to the tokenizer file by passing `--tokenizer=/path/to/model`. If you do not
+        # pass the tokenizer path, Guidellm will get the tokenizer file(s) from Huggingface.
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              key: HF_TOKEN
+              name: huggingface-secret
+        - name: TARGET
+          value: "http://llm-d-inference-gateway.llm-d.svc.cluster.local:80/v1"
+        - name: DATA_TYPE
+          value: "emulated"
+        - name: DATA
+          value: "prompt_tokens=512,output_tokens=128"
+        - name: MODEL
+          value: "meta-llama/Llama-3.2-3B-Instruct"
+        volumeMounts:
+        - name: output
+          mountPath: /app/data
+      - name: extract
+        image: registry.access.redhat.com/ubi9/ubi
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          runAsNonRoot: true
+          seccompProfile:
+            type: RuntimeDefault
+        command: ["sh", "-c"]
+        args:
+        - |
+          echo "Waiting for guidellm container to complete...";
+          while [ ! -f /app/data/llama32-3b.yaml ]; do
+            sleep 60;
+          done;
+          echo "Guidellm completed, packing reports...";
+          cd /app/data && \
+          tar czf guidellm-reports.tgz *.yaml && \
+          rm /app/data/llama32-3b.yaml
+        volumeMounts:
+        - name: output
+          mountPath: /app/data
+      restartPolicy: Never
+      volumes:
+      - name: output
+        persistentVolumeClaim:
+          claimName: guidellm-output-pvc
diff --git a/docs/guides/k8s/pvc.yaml b/docs/guides/k8s/pvc.yaml
new file mode 100644
index 00000000..93aa8801
--- /dev/null
+++ b/docs/guides/k8s/pvc.yaml
@@ -0,0 +1,11 @@
+# Example PVC - update to match your cluster
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: guidellm-output-pvc
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 2Gi
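+# Note: ReadWriteOnce volumes can be mounted by only a single node, so the guidellm
+# job pod and the guidellm-accessor pod can share this claim only when they are
+# scheduled onto the same node; use a ReadWriteMany-capable storage class if that
+# is a concern in your cluster.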