diff --git a/gkm-codespell.precommit-toml b/gkm-codespell.precommit-toml
index 76f85615..d998123d 100644
--- a/gkm-codespell.precommit-toml
+++ b/gkm-codespell.precommit-toml
@@ -1,3 +1,3 @@
[tool.codespell]
-ignore-words-list = "AfterAll,renderD"
+ignore-words-list = "AfterAll,renderD,ERRO"
skip = './.*,vendor/*,go.sum'
diff --git a/mcv/docs/vllm-binary-cache.md b/mcv/docs/vllm-binary-cache.md
index 9106459b..39ddabd6 100644
--- a/mcv/docs/vllm-binary-cache.md
+++ b/mcv/docs/vllm-binary-cache.md
@@ -2,26 +2,149 @@

## Overview

-MCV supports two vLLM cache formats:
+MCV supports three vLLM cache formats:

1. **vLLM Triton Cache Format** (legacy) - Stores `triton_cache/` and
   `inductor_cache/` inside rank directories
-2. **vLLM Binary Cache Format** (new) - Stores prefix directories
-   (e.g., `backbone/`) inside rank directories
+2. **vLLM Binary Cache Format** (current default) - Stores compiled artifacts in
+   prefix directories with embedded Triton kernels
+3. **vLLM Mega AOT Artifact Format** (PyTorch 2.12+) - Uses
+   `VLLM_USE_MEGA_AOT_ARTIFACT=true` for enhanced AOT serialization

-Both formats share the same top-level structure:
-`torch_compile_cache/{hash}/rank_{rank}_{dp_rank}/`
+**AOT Compile Support**: MCV **supports** the `VLLM_USE_AOT_COMPILE=1` workflow,
+which creates a separate cache structure at
+`torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model`.
+AOT compile caches store ahead-of-time compiled models as single binary files
+rather than multiple compilation artifacts. During preflight checks, AOT cache
+compatibility is validated primarily via the summary label, as the cache metadata
+contains limited hardware information.

-The key differences are **inside the rank directory**:
+Standard cache formats share the structure:
+`torch_compile_cache/{hash}/rank_{rank}_{dp_rank}/{prefix}/`

-- **Triton format**: Contains `triton_cache/` and `inductor_cache/`
-  subdirectories with unpacked artifacts
-- **Binary format**: Contains prefix directories
-  (e.g., `backbone/`, `eagle_head/`) with `cache_key_factors.json`
-  and artifacts that can be either binary files or unpacked directories
+The key differences are **inside the prefix directory**:

-This document describes the **vLLM Binary Cache Format** introduced in recent
-versions of vLLM.
+- **Triton format** (legacy): Contains `triton_cache/` and `inductor_cache/`
+  subdirectories with unpacked Triton kernels
+- **Binary format** (default): Contains `cache_key_factors.json` and multiple
+  `artifact_compile_range_*` files with embedded Triton kernels
+- **Mega AOT format** (PyTorch 2.12+): Same structure as binary format, but uses
+  enhanced AOT serialization (indicated by `VLLM_USE_MEGA_AOT_ARTIFACT: true` in
+  `cache_key_factors.json`)
+
+**Note**: The `VLLM_USE_AOT_COMPILE=1` workflow uses a different structure at
+`torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model` and is
+detected by MCV as the `aot_compile` cache format.
+
+This document describes the **vLLM Binary and Mega AOT Artifact Formats** and how
+torch.compile caching works with MCV.
+
+**Important**: This document covers compilation mode 3 (`VLLM_COMPILE`), which uses
+`~/.cache/vllm/torch_compile_cache/`. There is a separate vLLM feature controlled
+by `VLLM_USE_AOT_COMPILE=1` (enabled by default in PyTorch 2.10+) that creates an
+additional cache at `torch_compile_cache/torch_aot_compile/` with a different
+structure; MCV detects it as the `aot_compile` format and validates its
+compatibility via the summary label.
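+
+To tell which of these layouts a local cache uses, a quick disk check like the
+following works (a sketch; it assumes the default `~/.cache/vllm` location and
+keys off the directory markers described above):
+
+```bash
+CACHE_ROOT=~/.cache/vllm/torch_compile_cache
+
+# AOT compile workflow (VLLM_USE_AOT_COMPILE=1): separate top-level directory
+ls -d "$CACHE_ROOT"/torch_aot_compile/*/rank_*_* 2>/dev/null
+
+# Binary / mega AOT formats: cache_key_factors.json inside each prefix directory
+find "$CACHE_ROOT" -maxdepth 4 -name cache_key_factors.json 2>/dev/null
+
+# Legacy triton format: triton_cache/ and inductor_cache/ under rank directories
+find "$CACHE_ROOT" -maxdepth 3 -type d \( -name triton_cache -o -name inductor_cache \) 2>/dev/null
+```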
+
+## Torch Compile Architecture
+
+### How vLLM Uses torch.compile
+
+When vLLM is configured with compilation mode 3 via
+`--compilation-config '{"mode": 3}'` (not enabled by default), it uses PyTorch's
+`torch.compile` with TorchInductor backend to optimize model execution:
+
+```text
+Model Code → torch.compile → TorchInductor → Triton/CUDA Kernels → GPU Execution
+```
+
+**First Run (Compilation)**:
+
+1. vLLM traces the model with Dynamo
+2. TorchInductor compiles the graph
+3. Triton generates optimized GPU kernels → `/tmp/torchinductor_$USER`
+4. vLLM saves artifacts using `standalone_compile().save(format="binary")`
+5. **PyTorch bundles the Triton kernels into the artifacts**
+6. Complete cache saved to `~/.cache/vllm/torch_compile_cache/`
+
+**Subsequent Runs (Cache Hit)**:
+
+1. vLLM loads artifacts from `~/.cache/vllm/torch_compile_cache/`
+2. **PyTorch extracts embedded Triton kernels → `/tmp/torchinductor_$USER`**
+3. Execution resumes using extracted kernels (~10-20s vs 3-5min compilation)
+
+### Binary vs Mega AOT Serialization
+
+Both binary and mega AOT formats bundle Triton kernels in the artifacts and use
+the same directory structure. They only differ in how the artifact files are
+serialized:
+
+**Binary Serialization** (default):
+
+- Uses PyTorch `standalone_compile().save(format="binary")`
+- Environment: `VLLM_USE_MEGA_AOT_ARTIFACT=false` (default in PyTorch <2.12)
+- Multiple `artifact_compile_range_*` files per prefix
+- Typical size: ~11MB for Qwen3-0.6B model
+
+**Mega AOT Serialization** (PyTorch 2.12+):
+
+- Uses PyTorch `AOTCompiledArtifact.serialize()` with bundled autograd cache
+- Environment: `VLLM_USE_MEGA_AOT_ARTIFACT=true` (default in PyTorch 2.12+)
+- More portable across PyTorch versions
+- Same multi-artifact structure as binary format
+- Typical size: Similar to binary format
+
+**Important**: From MCV's perspective, both formats are **structurally identical**
+and use the same detection and packaging logic:
+
+```text
+~/.cache/vllm/torch_compile_cache/{hash}/rank_{rank}_{dp_rank}/{prefix}/
+```
+
+### VLLM_USE_AOT_COMPILE vs VLLM_USE_MEGA_AOT_ARTIFACT
+
+These are **two different features** that are often confused:
+
+**`VLLM_USE_AOT_COMPILE`** (directory structure change):
+- Enabled by default in PyTorch 2.10+
+- Creates cache at: `torch_compile_cache/torch_aot_compile/{hash}/rank_X_Y/model`
+- Single `model` file (~6.5MB) instead of multiple artifacts
+- **Supported by MCV** (different directory structure, detected as the `aot_compile` format)
+
+**`VLLM_USE_MEGA_AOT_ARTIFACT`** (serialization format):
+- Enabled by default in PyTorch 2.12+
+- Uses regular cache path: `torch_compile_cache/{hash}/rank_X_Y/{prefix}/`
+- Enhanced AOT serialization for better portability
+- **Supported by MCV** (same structure as binary format)
+
+When both are enabled, vLLM creates **both** cache locations.
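+
+For example, after a warmup run with both features enabled, the two locations can
+be listed side by side (a sketch; the hash and rank directory names will vary):
+
+```bash
+# Regular path: binary / mega AOT artifacts (VLLM_USE_MEGA_AOT_ARTIFACT)
+ls ~/.cache/vllm/torch_compile_cache/*/rank_0_0/
+
+# AOT compile path: a single "model" file per rank (VLLM_USE_AOT_COMPILE)
+ls ~/.cache/vllm/torch_compile_cache/torch_aot_compile/*/rank_0_0/
+```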
+
+### The /tmp Cache Directory
+
+During compilation and execution, PyTorch creates temporary files:
+
+```text
+/tmp/torchinductor_$USER/
+├── triton/0/{hash}/
+│   ├── triton_.cubin   # Compiled GPU binary (ELF)
+│   ├── triton_.source  # Triton source code
+│   ├── triton_.ttir    # Triton IR
+│   └── triton_.ptx     # PTX assembly
+├── o7/, dp/, .../      # Python kernel cache
+└── aotautograd/        # AOT autograd cache
+```
+
+**Lifecycle**:
+
+- **First run**: Created during compilation
+- **Cache hit**: Extracted from embedded artifacts
+- **Cleanup**: Cleared on reboot (tmpfs) or manual deletion
+- **Recreation**: Automatic on every vLLM start
+
+**Key Insight**: This directory is **NOT needed for cache portability**.
+The Triton kernels are already embedded in the binary artifacts (verified by
+finding 42 ELF headers in a 5.3MB artifact file).
+
+**MCV does NOT capture `/tmp`** - kernels auto-extract at runtime (~2 seconds).

## Binary Cache Format

@@ -202,21 +325,27 @@ The `manifest.json` file contains comprehensive metadata:

## Hardware Detection

-MCV automatically extracts hardware information from the cache metadata:
+MCV automatically detects hardware information from the system and combines it with cache metadata:

### CUDA

```json
{
  "backend": "cuda",
-  "arch": "sm_12.9",
-  "warp_size": 32
+  "arch": "75",
+  "warp_size": 32,
+  "ptx_version": 590,
+  "cuda_version": "12.9"
}
```

-- **Backend**: Extracted from `VLLM_TARGET_DEVICE`
-- **Arch**: Derived from `VLLM_MAIN_CUDA_VERSION`
+- **Backend**: Extracted from `VLLM_TARGET_DEVICE` environment variable
+- **Arch**: **Detected from actual GPU** on the system as numerical compute capability (e.g., `75` for Tesla T4, `80` for A100, `89` for RTX 4090)
- **Warp Size**: 32 (CUDA default)
+- **PTX Version**: PTX version from NVIDIA driver (e.g., 590 for driver 590.48.01)
+- **CUDA Version**: CUDA toolkit version from `VLLM_MAIN_CUDA_VERSION` (e.g., "12.9")
+
+**Important**: MCV detects the **actual GPU compute capability** from the system, not from environment variables. Compute capability is stored as a numerical value (e.g., `75` = sm_7.5 = Turing architecture). This ensures accurate compatibility checking between cached kernels and the target GPU.

### ROCm/HIP

@@ -228,10 +357,12 @@ MCV automatically extracts hardware information from the cache metadata:
  }
}
```

-- **Backend**: Extracted from `VLLM_TARGET_DEVICE`
-- **Arch**: Detected from ROCm environment variables
+- **Backend**: Extracted from `VLLM_TARGET_DEVICE` environment variable
+- **Arch**: **Detected from actual GPU** on the system (e.g., `gfx90a` for MI250, `gfx942` for MI300)
- **Warp Size**: 64 (AMD wavefront size)

+**Note**: If GPU detection fails, MCV will warn that the cache may not be compatible with the current GPU. Always verify GPU compatibility before deployment.
+
## Format Detection

MCV automatically detects the vLLM cache format by inspecting the
@@ -348,6 +479,7 @@ inspect` or `skopeo inspect` without reading the full manifest.

| vLLM Format | Artifacts | `cacheFormat` | `cache_save_format` | Label |
| ----------- | --------- | ------------- | ------------------- | ----- |
| Binary | Binary files | `"binary"` | `"binary"` | `"binary"` |
+| Mega AOT | Binary files | `"binary"` | `"binary"` | `"binary"` |
| Triton | Unpacked dirs | `"triton"` | N/A | `"unpacked"` |

**Why Three Indicators?**

@@ -361,16 +493,303 @@ inspect` or `skopeo inspect` without reading the full manifest.
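+For a quick check without pulling the image, the format label can be read
+straight from the registry metadata (a sketch; the image name is illustrative
+and `skopeo`/`jq` are assumed to be installed):
+
+```bash
+skopeo inspect docker://quay.io/myorg/vllm-qwen-cache:v1 \
+  | jq -r '.Labels["cache.vllm.image/format"]'
+# Prints "binary" or "unpacked", per the table above
+```
+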
## Comparison: vLLM Binary Cache vs vLLM Triton Cache

-| Aspect | Triton (Legacy) | Binary (New) |
-| ------ | --------------- | ------------ |
-| **Structure** | `{hash}/rank_X_Y/` | `{hash}/rank_X_Y/` |
-| **Inside Rank** | `triton_cache/` + `inductor_cache/` | `{prefix}/` |
-| **Metadata** | Triton JSON | `cache_key_factors.json` |
-| **Storage** | Unpacked | Binary/unpacked |
-| **Multiprocess** | No | Yes (binary) |
-| **Distributed** | Full rank/DP | Full rank/DP |
-| **Manifest** | `"triton"` | `"binary"` |
-| **Label** | `"unpacked"` | `"binary"`/`"unpacked"` |
+| Aspect | Triton (Legacy) | Binary | Mega AOT |
+| ------ | --------------- | ------------ | -------------- |
+| **Structure** | `{hash}/rank_X_Y/` | `{hash}/rank_X_Y/` | `{hash}/rank_X_Y/` |
+| **Inside Rank** | `triton_cache/` + `inductor_cache/` | `{prefix}/` | `{prefix}/` |
+| **Metadata** | Triton JSON | `cache_key_factors.json` | `cache_key_factors.json` |
+| **Serialization** | Unpacked | `standalone_compile().save()` | `AOTCompiledArtifact.serialize()` |
+| **Storage** | Unpacked | Binary/unpacked | Binary |
+| **Multiprocess** | No | Yes (binary) | Yes |
+| **Distributed** | Full rank/DP | Full rank/DP | Full rank/DP |
+| **Manifest** | `"triton"` | `"binary"` | `"binary"` |
+| **Label** | `"unpacked"` | `"binary"`/`"unpacked"` | `"binary"` |
+| **PyTorch Req** | Any | Any | 2.12+ |
+| **Env Var** | - | `VLLM_USE_MEGA_AOT_ARTIFACT=false` | `VLLM_USE_MEGA_AOT_ARTIFACT=true` |
+| **MCV Support** | ✅ Yes | ✅ Yes | ✅ Yes |
+
+**Note**: The `VLLM_USE_AOT_COMPILE=1` workflow is **not** shown in this table
+because it creates a different structure at `torch_compile_cache/torch_aot_compile/`;
+MCV detects it separately as the `aot_compile` cache format.
+
+## Complete Workflow Example
+
+This section demonstrates the complete end-to-end workflow of capturing a vLLM cache, creating an OCI image, and extracting it on another system.
+
+### Prerequisites
+
+- Docker or Podman installed
+- MCV binary built (`make mcv`)
+- Access to a container registry (e.g., quay.io)
+- GPU available on the system (NVIDIA or AMD)
+
+### Step 1: Start vLLM Container
+
+Start a vLLM container with a model. This example uses an NVIDIA GPU with CUDA:
+
+```bash
+# For NVIDIA GPUs with CUDA 13.0
+sudo podman run -d \
+  --name vllm-server \
+  --privileged \
+  --device /dev/nvidia0:/dev/nvidia0 \
+  --device /dev/nvidiactl:/dev/nvidiactl \
+  --device /dev/nvidia-uvm:/dev/nvidia-uvm \
+  --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools \
+  -v /usr/lib64:/usr/lib64:ro \
+  -v /usr/lib64:/usr/local/cuda-13.0/compat:ro \
+  -v /usr/local/cuda:/usr/local/cuda:ro \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  --env 'LD_LIBRARY_PATH=/usr/lib64:/usr/local/cuda/lib64:/usr/local/cuda-13.0/compat' \
+  -e NVIDIA_VISIBLE_DEVICES=all \
+  -e NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+  -p 8000:8000 \
+  --ipc=host \
+  docker.io/vllm/vllm-openai:latest-cu130 \
+  --model Qwen/Qwen3-0.6B
+```
+
+For AMD GPUs with ROCm, adjust the device mounts and environment variables accordingly.
+
+### Step 2: Wait for Cache Generation
+
+The vLLM server compiles kernels during model loading and warmup. 
Wait for the compilation to complete: + +```bash +# Monitor vLLM logs to see compilation progress +sudo podman logs -f vllm-server + +# Look for messages like: +# INFO 04-02 13:08:05 [monitor.py:48] torch.compile took 53.19 s in total +# INFO 04-02 13:08:28 [core.py:281] init engine (profile, create kv cache, warmup model) took 76.50 seconds +# INFO 04-02 13:08:31 [api_server.py:580] Starting vLLM server on http://0.0.0.0:8000 + +# Once you see "Starting vLLM server", the cache has been generated +``` + +The compiled kernels are stored in `/root/.cache/vllm/torch_compile_cache/` inside the container. + +**Optional**: You can also send a test request to verify the server is working: + +```bash +# Send a test request (cache already compiled during startup) +curl -s http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50 + }' | jq -r '.choices[0].message.content' +``` + +### Step 3: Capture Cache from Container + +Copy the generated cache from the running container to your host: + +```bash +# Create directory for cache +mkdir -p ~/vllm-qwen-cache + +# Copy cache from container +sudo podman cp vllm-server:/root/.cache/vllm ~/vllm-qwen-cache/ + +# Fix ownership +sudo chown -R $(whoami):$(whoami) ~/vllm-qwen-cache/ + +# Verify cache was captured +du -sh ~/vllm-qwen-cache/vllm +# Output: ~18M /home/user/vllm-qwen-cache/vllm + +# Inspect cache structure +ls -la ~/vllm-qwen-cache/vllm/torch_compile_cache/ +# Should show hash directories (e.g., fe20897a43/) +``` + +### Step 4: Build Cache Image with MCV + +Create an OCI container image containing the cache: + +```bash +# Install buildah if not already installed +sudo dnf install -y buildah + +# Build cache image +mcv -c \ + -i quay.io/myorg/vllm-qwen-cache:v1 \ + -d ~/vllm-qwen-cache/vllm \ + --builder buildah + +# Output: +# INFO Using buildah to build the image +# INFO Detected cache components: [vllm] +# INFO Detected GPU: backend=cuda, arch=75, warpSize=32, PTX=590 +# INFO Image built! 3cbede0b2cb5... +# INFO OCI image created successfully. +``` + +### Step 5: Inspect Cache Image + +Verify the cache image metadata and labels: + +```bash +# View image in buildah +buildah images | grep vllm-qwen-cache + +# Inspect image labels +buildah inspect quay.io/myorg/vllm-qwen-cache:v1 | \ + jq -r '.OCIv1.config.Labels' + +# Expected output: +# { +# "cache.vllm.image/cache-size-bytes": "18152945", +# "cache.vllm.image/entry-count": "2", +# "cache.vllm.image/format": "binary", +# "cache.vllm.image/summary": "{\"targets\":[{\"backend\":\"cuda\",\"arch\":\"75\",\"warp_size\":32,\"ptx_version\":590,\"cuda_version\":\"12.9\"}]}" +# } +``` + +**Important**: Notice that the `arch` field shows the **actual GPU compute capability** (e.g., `75` for Tesla T4 which is sm_7.5), not the CUDA toolkit version. + +### Step 6: Push to Registry + +Push the cache image to a container registry: + +```bash +# Login to registry +buildah login quay.io + +# Push image +buildah push quay.io/myorg/vllm-qwen-cache:v1 + +# Verify push +buildah images | grep vllm-qwen-cache +``` + +### Step 7: Extract Cache on Target System + +On another system with compatible GPU, extract the cache: + +```bash +# Pull and extract cache +mcv -e -i quay.io/myorg/vllm-qwen-cache:v1 + +# MCV performs preflight checks: +# 1. Fetches image and reads metadata +# 2. Detects local GPU (e.g., Tesla T4 with compute capability 75) +# 3. Compares with cache requirements +# 4. 
Extracts cache to ~/.cache/vllm/ if compatible + +# Expected output on compatible GPU: +# INFO Preflight GPU compatibility check passed. +# INFO Preflight completed matched="[0]" unmatched="[]" +# INFO Extracting cache to directory: /home/user/.cache/vllm +``` + +**Preflight Check Failure**: If the GPU is incompatible, MCV will reject the extraction: + +```bash +# Example: Trying to use A100 (compute capability 80) cache on T4 (75) +mcv -e -i quay.io/myorg/vllm-a100-cache:v1 + +# Output: +# ERRO Preflight check failed: no compatible GPU found +# WARN No compatible GPUs found for the image. +``` + +### Step 8: Verify Cache with GPU Compatibility Check + +Check compatibility without extracting: + +```bash +# Check if current GPU is compatible with cached kernels +mcv --check-compat -i quay.io/myorg/vllm-qwen-cache:v1 + +# On compatible GPU (Tesla T4): +# No output means compatible + +# On incompatible GPU: +# ERRO Preflight check failed: no compatible GPU found +# WARN No compatible GPUs found for the image. +``` + +### Step 9: View Detailed GPU Information + +Get detailed information about system GPUs: + +```bash +# Display GPU fleet information +mcv --gpu-info + +# Output: +# INFO Detected 1 accelerator(s) +# GPU Fleet: +# - GPU Type: TU104GL [Tesla T4] +# Driver Version: 590.48.01 +# IDs: [0] +``` + +### Step 10: Use Cache with vLLM + +Start vLLM with the extracted cache: + +```bash +# The cache is now in ~/.cache/vllm/ +# Start vLLM normally - it will automatically use the cache +podman run -d \ + --name vllm-with-cache \ + ... # same mounts and settings as before + -v ~/.cache/vllm:/root/.cache/vllm \ + docker.io/vllm/vllm-openai:latest-cu130 \ + --model Qwen/Qwen3-0.6B + +# vLLM will skip compilation and use cached kernels +# First request will be much faster! +``` + +### Workflow Summary + +```text +┌─────────────────────────┐ +│ 1. Start vLLM │ +│ Container │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 2. Wait for Kernel │ +│ Compilation │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 3. Copy Cache from │ +│ Container to Host │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 4. Build OCI Image │ +│ with MCV │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 5. Push to Registry │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 6. Pull & Extract on │ +│ Target System │ +│ (Preflight Checks) │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 7. Use Cache with │ +│ vLLM on Target │ +└─────────────────────────┘ +``` ## Usage Examples @@ -428,6 +847,7 @@ Key files in vLLM that implement binary cache: 3. **Include full env in manifest** for cache compatibility checking 4. **Verify hardware match** using image labels before deployment 5. **Check cache_save_format** in manifest when extracting caches +6. **Use AOT artifacts for cross-PyTorch-version portability** (requires PyTorch 2.10+) ## Migration from vLLM Triton Cache to vLLM Binary Cache @@ -439,6 +859,259 @@ To migrate from vLLM triton cache format to vLLM binary cache format: 4. Package new cache with MCV (automatically detected) 5. 
Both vLLM cache formats are supported, no breaking changes +## Practical Guide + +### Generating a Cache + +**Environment Setup**: + +```bash +# For binary format (default): +export VLLM_COMPILE_CACHE_SAVE_FORMAT=binary +export VLLM_USE_MEGA_AOT_ARTIFACT=false # or omit (default) + +# For AOT format (more portable): +export VLLM_COMPILE_CACHE_SAVE_FORMAT=binary +export VLLM_USE_MEGA_AOT_ARTIFACT=true # requires PyTorch 2.10+ +``` + +**Run vLLM Warmup**: + +```bash +# Enable compilation with mode 3 (VLLM_COMPILE) for cache generation +vllm serve my-model \ + --compilation-config '{"mode": 3}' \ + --tensor-parallel-size 1 + +# Alternatively, use the named mode: +# vllm serve my-model --compilation-config '{"mode": "VLLM_COMPILE"}' --tensor-parallel-size 1 + +# Make sample requests to trigger compilation: +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "my-model", "prompt": "Hello", "max_tokens": 100}' +``` + +**Note**: Mode 3 (VLLM_COMPILE) is required for cache generation. Other modes: + +- Mode 0: No compilation (default) +- Mode 1: Standard torch.compile +- Mode 2: Single Dynamo trace + +**Verify Cache**: + +```bash +ls -lh ~/.cache/vllm/torch_compile_cache/ +# Should show a 10-char hash directory (e.g., 8d0a361fbc) + +# Check cache contents: +find ~/.cache/vllm/torch_compile_cache/ -type f | head +``` + +### Packaging with MCV + +**Create Container Image**: + +```bash +mcv -c \ + -d ~/.cache/vllm/torch_compile_cache/{hash} \ + -i quay.io/myorg/my-model-cache:v1 +``` + +**Verify Image Labels**: + +```bash +skopeo inspect containers-storage:quay.io/myorg/my-model-cache:v1 \ + | jq '.Labels' + +# Expected labels: +# { +# "cache.vllm.image/cache-size-bytes": "95000000", +# "cache.vllm.image/entry-count": "1", +# "cache.vllm.image/format": "binary", +# "cache.vllm.image/summary": "{\"targets\":[{\"backend\":\"cuda\",...}]}" +# } +``` + +### Using a Cached Image + +**Extract Cache**: + +```bash +mcv -e -i quay.io/myorg/my-model-cache:v1 + +# MCV extracts to: ~/.cache/vllm/torch_compile_cache/{hash}/ +``` + +**Start vLLM**: + +```bash +# vLLM automatically detects and uses the cache +vllm serve my-model --tensor-parallel-size 1 + +# Look for log message: +# INFO: Directly load the compiled graph(s) from the cache, took X.X s +``` + +### Cache Compatibility + +A cache is compatible if: + +1. **GPU architecture** matches (check: `nvidia-smi --query-gpu=compute_cap`) +2. **CUDA/ROCm version** compatible (check: `nvcc --version` or `rocm-smi`) +3. **PyTorch version** compatible +4. **Model code** unchanged (code hash must match) +5. **vLLM configuration** matches (TP size, compile level, etc.) +6. **Environment variables** match (see `cache_key_factors.json`) + +**Check Compatibility**: + +```bash +# View cache metadata: +cat ~/.cache/vllm/torch_compile_cache/*/rank_0_0/*/cache_key_factors.json \ + | jq '{target: .env.VLLM_TARGET_DEVICE, cuda: .env.VLLM_MAIN_CUDA_VERSION}' + +# Compare with system: +nvidia-smi +# or +rocm-smi +``` + +## Troubleshooting + +### Cache Not Being Used + +**Symptom**: vLLM recompiles on every start despite having a cache + +**Common Causes**: + +See the [Cache Compatibility](#cache-compatibility) section above for requirements. + +**Debug Steps**: + +```bash +# 1. Check if cache exists +ls ~/.cache/vllm/torch_compile_cache/ + +# 2. Enable debug logging +export VLLM_LOGGING_LEVEL=DEBUG + +# 3. Check for hash mismatch in logs +grep "cache" vllm.log | grep -i "hash\|miss" + +# 4. 
Verify GPU compatibility
+python -c "import torch; print(torch.cuda.get_device_capability())"
+```
+
+### Slow Startup with Cache
+
+**Symptom**: vLLM takes 20+ seconds to start with cache
+
+**Normal Behavior**: 10-20 seconds for kernel extraction from artifacts is expected
+
+**If Slower**:
+
+- Check disk I/O performance: `iostat -x 1`
+- Verify `/tmp` is not on slow storage (NFS, etc.)
+- Consider using `tmpfs` for `/tmp`: `df -h /tmp`
+
+### Missing Kernels Error
+
+**Symptom**: Runtime errors about missing Triton kernels
+
+**Causes**:
+
+1. Corrupted artifacts
+2. Incomplete cache (warmup didn't cover all batch sizes)
+3. Disk space issues during generation
+
+**Solutions**:
+
+```bash
+# 1. Delete and regenerate cache
+rm -rf ~/.cache/vllm/torch_compile_cache/*
+
+# 2. Verify disk space
+df -h ~/.cache/vllm/
+
+# 3. Check artifact integrity
+file ~/.cache/vllm/torch_compile_cache/*/rank_0_0/*/artifact_*
+# Should show: "data" (binary format)
+```
+
+### AOT Artifact Serialization Issues
+
+**Symptom**: AOT artifacts fail to load (when using `VLLM_USE_MEGA_AOT_ARTIFACT=true`)
+
+**Requirements**:
+
+- PyTorch 2.10.0 or later
+- `VLLM_USE_MEGA_AOT_ARTIFACT=true`
+- Compilation mode 3 (`--compilation-config '{"mode": 3}'`)
+
+**Verify**:
+
+```bash
+# Check PyTorch version
+python -c "import torch; print(torch.__version__)"
+
+# Verify AOT flag in cache
+grep "VLLM_USE_MEGA_AOT_ARTIFACT" \
+  ~/.cache/vllm/torch_compile_cache/*/rank_0_0/*/cache_key_factors.json
+```
+
+## Advanced Topics
+
+### Multi-GPU Caching
+
+For distributed serving (tensor/pipeline parallelism and data parallelism), vLLM
+creates one directory per rank, named `rank_{rank}_{dp_rank}`:
+
+```text
+torch_compile_cache/{hash}/
+├── rank_0_0/    # Rank 0, data-parallel rank 0
+├── rank_0_1/    # Rank 0, data-parallel rank 1
+├── rank_1_0/    # Rank 1, data-parallel rank 0
+└── rank_1_1/    # Rank 1, data-parallel rank 1
+```
+
+MCV captures all rank directories. Extract the entire hash directory for
+multi-GPU deployments.
+
+### Multiple Model Components
+
+Models with speculative decoding have multiple components:
+
+```text
+rank_0_0/
+├── backbone/      # Main model
+│   └── artifact_*
+└── eagle_head/    # Draft model for speculation
+    └── artifact_*
+```
+
+MCV captures all prefix directories automatically.
+
+### Cache Size Optimization
+
+**Typical Sizes**:
+
+- Small models (< 1B params): 50-100 MB
+- Medium models (1-10B params): 100-500 MB
+- Large models (10B+ params): 500 MB - 2 GB
+
+**Factors Affecting Size**:
+
+- Number of compiled ranges (batch sizes)
+- Number of layers
+- Triton kernel count
+- Autotune configurations
+
+**Reduce Size**:
+
+- Use fewer compile ranges: `VLLM_COMPILE_RANGES=[128,512]` vs default
+- Binary format is smaller than unpacked
+- AOT format is similar to binary
+
## See Also

- [spec-compat.md](./spec-compat.md) - OCI image specification
diff --git a/mcv/pkg/accelerator/devices/amd.go b/mcv/pkg/accelerator/devices/amd.go
index 7e435ab0..0188510d 100644
--- a/mcv/pkg/accelerator/devices/amd.go
+++ b/mcv/pkg/accelerator/devices/amd.go
@@ -47,13 +47,41 @@ func (d *gpuAMD) SetHwType(hwType string) {
}

// SetTritonInfo sets the Triton GPU information for the AMD device.
+// When restoring from cache, this also populates the devices map.
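+// Entries are keyed by GPU ID, so a subsequent SetSummaries call can attach
+// the matching DeviceSummary to each device.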
func (d *gpuAMD) SetTritonInfo(info []TritonGPUInfo) { d.tritonInfo = info + + // Rebuild devices map from cached triton info + if d.devices == nil { + d.devices = make(map[int]GPUDevice) + } + for _, tritonInfo := range info { + d.devices[tritonInfo.ID] = GPUDevice{ + ID: tritonInfo.ID, + TritonInfo: tritonInfo, + // Summary will be set by SetSummaries + } + } } // SetSummaries sets the summaries for the AMD device. +// When restoring from cache, this also updates the Summary field in devices map. func (d *gpuAMD) SetSummaries(summaries []DeviceSummary) { d.summaries = summaries + + // Update Summary in devices map if it exists + if d.devices != nil { + for _, summary := range summaries { + // Parse GPU ID from summary.ID (which is a string like "0", "1", etc.) + var gpuID int + if _, err := fmt.Sscanf(summary.ID, "%d", &gpuID); err == nil { + if dev, exists := d.devices[gpuID]; exists { + dev.Summary = summary + d.devices[gpuID] = dev + } + } + } + } } type AMDGPUInfo struct { diff --git a/mcv/pkg/accelerator/devices/nvml.go b/mcv/pkg/accelerator/devices/nvml.go index 542baa6c..ba9b07dc 100644 --- a/mcv/pkg/accelerator/devices/nvml.go +++ b/mcv/pkg/accelerator/devices/nvml.go @@ -62,13 +62,41 @@ func (d *gpuNvml) SetHwType(hwType string) { } // SetTritonInfo sets the Triton GPU information for the NVML device. +// When restoring from cache, this also populates the devices map. func (d *gpuNvml) SetTritonInfo(info []TritonGPUInfo) { d.tritonInfo = info + + // Rebuild devices map from cached triton info + if d.devices == nil { + d.devices = make(map[int]GPUDevice) + } + for _, tritonInfo := range info { + d.devices[tritonInfo.ID] = GPUDevice{ + ID: tritonInfo.ID, + TritonInfo: tritonInfo, + // Summary will be set by SetSummaries + } + } } // SetSummaries sets the summaries for the NVML device. +// When restoring from cache, this also updates the Summary field in devices map. func (d *gpuNvml) SetSummaries(summaries []DeviceSummary) { d.summaries = summaries + + // Update Summary in devices map if it exists + if d.devices != nil { + for _, summary := range summaries { + // Parse GPU ID from summary.ID (which is a string like "0", "1", etc.) + var gpuID int + if _, err := fmt.Sscanf(summary.ID, "%d", &gpuID); err == nil { + if dev, exists := d.devices[gpuID]; exists { + dev.Summary = summary + d.devices[gpuID] = dev + } + } + } + } } func nvmlCheck(r *Registry) { @@ -222,7 +250,7 @@ func getNVMLTritonGPUInfo(device nvml.Device) (TritonGPUInfo, error) { Name: name, UUID: uuid, ComputeCapability: fmt.Sprintf("%d.%d", major, minor), // Formatting the compute capability - Arch: strconv.Itoa(major*10 + minor), // TODO double check this + Arch: strconv.Itoa(major*10 + minor), // Numeric string for Triton compatibility (e.g., "75") WarpSize: warpSize, MemoryTotalMB: mem.Total / (1024 * 1024), PTXVersion: ptxVersion, diff --git a/mcv/pkg/accelerator/devices/rocm.go b/mcv/pkg/accelerator/devices/rocm.go index 8423f499..f5905fbf 100644 --- a/mcv/pkg/accelerator/devices/rocm.go +++ b/mcv/pkg/accelerator/devices/rocm.go @@ -46,13 +46,41 @@ func (d *gpuROCm) SetHwType(hwType string) { } // SetTritonInfo sets the Triton GPU information for the ROCM device. +// When restoring from cache, this also populates the devices map. 
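+// Entries are keyed by GPU ID, so a subsequent SetSummaries call can attach
+// the matching DeviceSummary to each device.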
func (d *gpuROCm) SetTritonInfo(info []TritonGPUInfo) { d.tritonInfo = info + + // Rebuild devices map from cached triton info + if d.devices == nil { + d.devices = make(map[int]GPUDevice) + } + for _, tritonInfo := range info { + d.devices[tritonInfo.ID] = GPUDevice{ + ID: tritonInfo.ID, + TritonInfo: tritonInfo, + // Summary will be set by SetSummaries + } + } } // SetSummaries sets the summaries for the ROCM device. +// When restoring from cache, this also updates the Summary field in devices map. func (d *gpuROCm) SetSummaries(summaries []DeviceSummary) { d.summaries = summaries + + // Update Summary in devices map if it exists + if d.devices != nil { + for _, summary := range summaries { + // Parse GPU ID from summary.ID (which is a string like "0", "1", etc.) + var gpuID int + if _, err := fmt.Sscanf(summary.ID, "%d", &gpuID); err == nil { + if dev, exists := d.devices[gpuID]; exists { + dev.Summary = summary + d.devices[gpuID] = dev + } + } + } + } } type ROCMGPUInfo struct { diff --git a/mcv/pkg/cache/types.go b/mcv/pkg/cache/types.go index 8810ccd4..7f7d96db 100644 --- a/mcv/pkg/cache/types.go +++ b/mcv/pkg/cache/types.go @@ -1,9 +1,11 @@ package cache type SummaryTargetInfo struct { - Backend string `json:"backend"` - Arch string `json:"arch"` - WarpSize int `json:"warp_size"` + Backend string `json:"backend"` + Arch string `json:"arch"` + WarpSize int `json:"warp_size"` + PTXVersion int `json:"ptx_version,omitempty"` // CUDA PTX version (for CUDA backend) + CUDAVersion string `json:"cuda_version,omitempty"` // CUDA toolkit version (e.g., "12.9") } type Summary struct { @@ -88,3 +90,13 @@ type CacheKeyFactors struct { ConfigHash string `json:"config_hash"` Env map[string]interface{} `json:"env"` } + +// AOTCompileCacheMetadata represents metadata for AOT compile cache artifacts +// These are created when VLLM_USE_AOT_COMPILE=1 and stored at: +// torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model +type AOTCompileCacheMetadata struct { + Hash string `json:"hash"` // Full hash from directory name + Rank string `json:"rank"` // rank_X_Y format + ModelFile string `json:"model_file"` // Always "model" + FileSize int64 `json:"file_size"` // Size of model file in bytes +} diff --git a/mcv/pkg/cache/vllm.go b/mcv/pkg/cache/vllm.go index e782642d..c5ddccd5 100644 --- a/mcv/pkg/cache/vllm.go +++ b/mcv/pkg/cache/vllm.go @@ -11,6 +11,8 @@ import ( "strconv" "strings" + "github.com/redhat-et/GKM/mcv/pkg/accelerator/devices" + "github.com/redhat-et/GKM/mcv/pkg/config" "github.com/redhat-et/GKM/mcv/pkg/constants" logging "github.com/sirupsen/logrus" ) @@ -27,6 +29,9 @@ const ( // Cache format constants BinaryCacheFormat = "binary" CUDABackend = "cuda" + ROCmBackend = "rocm" + HIPBackend = "hip" + UnknownBackend = "UnknownBackend" // torchAOTCompileDirName is the extra directory vLLM introduces above // the per-model hash dir when VLLM_USE_AOT_COMPILE is enabled. 
@@ -46,10 +51,11 @@ type VLLMCache struct { } type VLLMCacheMetadata struct { - VllmHash string `json:"vllmHash"` - CacheFormat string `json:"cacheFormat"` // "triton" or "binary" - TritonCacheEntries []CacheEntry `json:"triton,omitempty"` - BinaryCacheEntries []BinaryCacheMetadata `json:"binary,omitempty"` + VllmHash string `json:"vllmHash"` + CacheFormat string `json:"cacheFormat"` // "triton", "binary", or "aot_compile" + TritonCacheEntries []CacheEntry `json:"triton,omitempty"` + BinaryCacheEntries []BinaryCacheMetadata `json:"binary,omitempty"` + AOTCompileEntries []AOTCompileCacheMetadata `json:"aot_compile,omitempty"` } // DetectVLLMCache walks the given root directory to detect whether VLLM-style cache artifacts exist @@ -165,6 +171,35 @@ func DetectVLLMCache(cacheDir string) *VLLMCache { logging.Debugf("Adding VLLM triton cache metadata: %+v", vllmMetadata) metadata = append(metadata, vllmMetadata) } + + // Check for AOT compile cache at torch_compile_cache/torch_aot_compile/ + aotCompilePath := filepath.Join(torchCompileCachePath, "torch_aot_compile") + if _, err := os.Stat(aotCompilePath); err == nil { + logging.Debugf("Detecting AOT compile cache at: %s", aotCompilePath) + aotCacheData, aotErr := detectAOTCompileCache(aotCompilePath) + if aotErr == nil && len(aotCacheData) > 0 { + logging.Debugf("Detected AOT compile cache format with %d entries", len(aotCacheData)) + // Group AOT cache entries by hash + aotByHash := make(map[string][]AOTCompileCacheMetadata) + for _, aotCache := range aotCacheData { + aotByHash[aotCache.Hash] = append(aotByHash[aotCache.Hash], aotCache) + } + + // Create metadata entries for each hash + for hash, entries := range aotByHash { + vllmMetadata := VLLMCacheMetadata{ + VllmHash: hash, + CacheFormat: "aot_compile", + AOTCompileEntries: entries, + } + logging.Debugf("Adding VLLM AOT compile cache metadata: %+v", vllmMetadata) + metadata = append(metadata, vllmMetadata) + count++ + } + } else if aotErr != nil { + logging.Debugf("No AOT compile cache detected: %v", aotErr) + } + } } } @@ -372,6 +407,73 @@ func detectMegaAOTCache(hashDir string) ([]BinaryCacheMetadata, error) { return out, nil } +// detectAOTCompileCache detects AOT compile cache format +// These are created when VLLM_USE_AOT_COMPILE=1 and stored at: +// torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model +func detectAOTCompileCache(aotPath string) ([]AOTCompileCacheMetadata, error) { + var aotCaches []AOTCompileCacheMetadata + + if _, err := os.Stat(aotPath); os.IsNotExist(err) { + return nil, fmt.Errorf("AOT compile cache path does not exist: %s", aotPath) + } + + // Walk the torch_aot_compile directory looking for {hash}/rank_X_Y/model files + entries, err := os.ReadDir(aotPath) + if err != nil { + return nil, fmt.Errorf("failed to read AOT compile directory: %w", err) + } + + for _, hashEntry := range entries { + if !hashEntry.IsDir() { + continue + } + + hashDir := filepath.Join(aotPath, hashEntry.Name()) + logging.Debugf("Inspecting AOT hash directory: %s", hashDir) + + // Look for rank_X_Y directories + rankEntries, err := os.ReadDir(hashDir) + if err != nil { + logging.Warnf("Failed to read AOT hash directory %s: %v", hashDir, err) + continue + } + + rankDirRegex := regexp.MustCompile(`^rank_\d+_\d+$`) + for _, rankEntry := range rankEntries { + if !rankEntry.IsDir() { + continue + } + if !rankDirRegex.MatchString(rankEntry.Name()) { + continue + } + + // Check for model file + modelPath := filepath.Join(hashDir, rankEntry.Name(), "model") + stat, err := 
os.Stat(modelPath) + if err != nil { + logging.Debugf("No model file found at %s: %v", modelPath, err) + continue + } + + aotCache := AOTCompileCacheMetadata{ + Hash: hashEntry.Name(), + Rank: rankEntry.Name(), + ModelFile: "model", + FileSize: stat.Size(), + } + + logging.Debugf("Found AOT compile cache: %+v", aotCache) + aotCaches = append(aotCaches, aotCache) + } + } + + if len(aotCaches) == 0 { + return nil, fmt.Errorf("no AOT compile cache detected") + } + + return aotCaches, nil +} + func (v *VLLMCache) Name() string { return constants.VLLM } func (v *VLLMCache) EntryCount() int { @@ -426,6 +528,78 @@ func (v *VLLMCache) Summary() string { return string(jsonData) } +// detectActualGPUInfo detects the actual GPU architecture from the current system +// This is called during cache image creation to detect the real hardware, +// regardless of what VLLM_TARGET_DEVICE says in the cache metadata. +// Returns backend, arch, warpSize, and ptxVersion +func detectActualGPUInfo() (backend, arch string, warpSize, ptxVersion int) { + // Initialize config if not already done + if !config.IsInitialized() { + if _, err := config.Initialize(config.ConfDir); err != nil { + logging.WithError(err).Debug("Failed to initialize config for GPU detection") + return UnknownBackend, UnknownBackend, 0, 0 + } + } + + // Get device registry + registry := devices.GetRegistry() + if registry == nil { + logging.Debug("Failed to get device registry") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Try to start GPU device - this will auto-detect CUDA/ROCm + device := devices.Startup(config.GPU, registry) + if device == nil { + logging.Debug("No GPU detected on system") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Initialize the device to ensure GPU info is populated + // This is important when the device was restored from cache + if err := device.Init(); err != nil { + logging.WithError(err).Debug("Failed to initialize device") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Get GPU info for the first GPU (index 0) + gpuInfo, err := device.GetGPUInfo(0) + if err != nil { + logging.WithError(err).Debug("Failed to get GPU info from device") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Determine backend and warp size from the detected GPU + detectedBackend := gpuInfo.Backend + if detectedBackend == "" { + // Fallback: try to infer from device type + switch device.DevType() { + case devices.NVML: + detectedBackend = CUDABackend + case devices.ROCM, devices.AMD: + detectedBackend = ROCmBackend + default: + detectedBackend = UnknownBackend + } + } + + detectedWarpSize := gpuInfo.WarpSize + if detectedWarpSize == 0 { + // Fallback to defaults + switch detectedBackend { + case CUDABackend: + detectedWarpSize = 32 + case ROCmBackend, HIPBackend: + detectedWarpSize = 64 + } + } + + logging.Infof("Detected GPU: backend=%s, arch=%s, warpSize=%d, PTX=%d", + detectedBackend, gpuInfo.Arch, detectedWarpSize, gpuInfo.PTXVersion) + + return detectedBackend, gpuInfo.Arch, detectedWarpSize, gpuInfo.PTXVersion +} + // buildBinaryCacheSummary builds a summary from binary cache metadata func buildBinaryCacheSummary(metadata []VLLMCacheMetadata) (*Summary, error) { targetMap := make(map[string]SummaryTargetInfo) @@ -435,46 +609,85 @@ func buildBinaryCacheSummary(metadata []VLLMCacheMetadata) (*Summary, error) { continue } + // Detect actual GPU from the system once per metadata entry + // NOTE: We detect the actual system GPU rather than trusting VLLM_TARGET_DEVICE + // because caches may be copied 
from other systems + detectedBackend, detectedArch, detectedWarpSize, detectedPTX := detectActualGPUInfo() + for i := range meta.BinaryCacheEntries { binaryCache := &meta.BinaryCacheEntries[i] - // Extract target info from the stored environment variables - backend := binaryCache.TargetDevice - if backend == "" { - backend = CUDABackend // Default if not specified + + // Use detected GPU info from actual system + backend := detectedBackend + arch := detectedArch + warpSize := detectedWarpSize + ptxVersion := detectedPTX + + // For vLLM binary cache, CUDA uses sm_ prefix (e.g., sm_75) + // AMD/ROCm already has gfx prefix (e.g., gfx1151) + if backend == CUDABackend { + arch = fmt.Sprintf("sm_%s", arch) } - // Determine arch and warpSize based on backend and env vars - arch := "unknown" - warpSize := 32 // Default for CUDA + // Extract toolkit versions from cache environment for reference + cudaVersion := "" + rocmVersion := "" + + // Handle special cases where no GPU is detected + if backend == UnknownBackend { + logging.Warn("Could not detect GPU on system, using cache metadata as fallback") + // Fallback to cache metadata if GPU detection failed + backend = binaryCache.TargetDevice + if backend == "" { + backend = CUDABackend // Default if not specified + } + // Set default warp sizes + switch backend { + case ROCmBackend, HIPBackend: + warpSize = 64 + case CUDABackend: + warpSize = 32 + case "tpu": + warpSize = 128 + case "cpu": + warpSize = 1 + } + } + // Extract toolkit version info from environment switch backend { - case "rocm", "hip": - warpSize = 64 // AMD GPUs use 64-wide wavefronts - // Try to extract GPU architecture from env - if env, ok := binaryCache.Env["VLLM_ROCM_CUSTOM_PAGED_ATTN"]; ok && env != nil { - // ROCm is being used - arch = "gfx90a" // Common MI250/MI300 arch, could be extracted more precisely + case CUDABackend: + if cudaVer, ok := binaryCache.Env["VLLM_MAIN_CUDA_VERSION"]; ok { + if ver, ok := cudaVer.(string); ok { + cudaVersion = ver + logging.Debugf("CUDA toolkit version from cache: %s", cudaVersion) + } } - case "cuda": - // Try to extract CUDA architecture - if mainVersion, ok := binaryCache.Env["VLLM_MAIN_CUDA_VERSION"]; ok { - if version, ok := mainVersion.(string); ok { - arch = "sm_" + version + case ROCmBackend, HIPBackend: + if rocmVer, ok := binaryCache.Env["ROCM_VERSION"]; ok { + if ver, ok := rocmVer.(string); ok { + rocmVersion = ver + logging.Debugf("ROCm version from cache: %s", rocmVersion) } } - case "tpu": - warpSize = 128 // TPU uses different parallelism model - case "cpu": - warpSize = 1 // CPU doesn't have warp concept } - key := fmt.Sprintf("%s-%s-%d", backend, arch, warpSize) + // Create unique key including version info for better cache matching + key := fmt.Sprintf("%s-%s-%d-%s-%s", backend, arch, warpSize, cudaVersion, rocmVersion) if _, exists := targetMap[key]; !exists { - targetMap[key] = SummaryTargetInfo{ + targetInfo := SummaryTargetInfo{ Backend: backend, Arch: arch, WarpSize: warpSize, } + // Add version info if available + if ptxVersion > 0 { + targetInfo.PTXVersion = ptxVersion + } + if cudaVersion != "" { + targetInfo.CUDAVersion = cudaVersion + } + targetMap[key] = targetInfo } } } diff --git a/mcv/pkg/preflightcheck/triton.go b/mcv/pkg/preflightcheck/triton.go index b9eaa489..7293efa0 100644 --- a/mcv/pkg/preflightcheck/triton.go +++ b/mcv/pkg/preflightcheck/triton.go @@ -63,7 +63,11 @@ func CompareTritonEntriesToGPU(entries []cache.TritonCacheMetadata, devInfo []de for _, gpuInfo := range devInfo { backendMatches := 
entry.Backend == gpuInfo.Backend - archMatches := entry.Arch == gpuInfo.Arch + // Normalize architectures for comparison (handles "75" vs "sm_75" for CUDA) + entryArchStr := cache.ConvertArchToString(entry.Arch) + normalizedEntryArch := normalizeArchForComparison(entry.Backend, entryArchStr) + normalizedGPUArch := normalizeArchForComparison(gpuInfo.Backend, gpuInfo.Arch) + archMatches := normalizedEntryArch == normalizedGPUArch warpMatches := entry.WarpSize == gpuInfo.WarpSize ptxMatches := true diff --git a/mcv/pkg/preflightcheck/utils.go b/mcv/pkg/preflightcheck/utils.go index 84d20da7..eb5699fe 100644 --- a/mcv/pkg/preflightcheck/utils.go +++ b/mcv/pkg/preflightcheck/utils.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "strings" v1 "github.com/google/go-containerregistry/pkg/v1" "github.com/redhat-et/GKM/mcv/pkg/accelerator" @@ -14,6 +15,15 @@ import ( logging "github.com/sirupsen/logrus" ) +// normalizeArchForComparison normalizes architecture strings for comparison +// Strips sm_ prefix from CUDA architectures to handle both "75" and "sm_75" formats +func normalizeArchForComparison(backend, arch string) string { + if backend == "cuda" { + return strings.TrimPrefix(arch, "sm_") + } + return arch +} + func CompareCacheSummaryLabelToGPU(img v1.Image, labels map[string]string, devInfo []devices.TritonGPUInfo) (matched, unmatched []devices.TritonGPUInfo, err error) { logging.Debug("Starting cache summary label preflight check...") if labels == nil { @@ -40,13 +50,30 @@ func CompareCacheSummaryLabelToGPU(img v1.Image, labels map[string]string, devIn return nil, nil, fmt.Errorf("failed to parse summary label: %w", err) } + logging.Debugf("Preflight check: devInfo has %d GPUs, summary has %d targets", len(devInfo), len(summary.Targets)) + for i, gpu := range devInfo { + logging.Debugf("GPU[%d]: backend=%s, arch=%s, warp=%d", i, gpu.Backend, gpu.Arch, gpu.WarpSize) + } + for i, target := range summary.Targets { + logging.Debugf("Target[%d]: backend=%s, arch=%s, warp=%d", i, target.Backend, target.Arch, target.WarpSize) + } + for _, gpu := range devInfo { isMatch := false for _, target := range summary.Targets { backendMatches := target.Backend == gpu.Backend - archMatches := target.Arch == gpu.Arch + // Normalize architectures for comparison (handles "75" vs "sm_75" for CUDA) + normalizedTargetArch := normalizeArchForComparison(target.Backend, target.Arch) + normalizedGPUArch := normalizeArchForComparison(gpu.Backend, gpu.Arch) + archMatches := normalizedTargetArch == normalizedGPUArch warpMatches := target.WarpSize == gpu.WarpSize + logging.Debugf("Comparing cache target vs GPU: backend=%s vs %s, arch=%s(%s) vs %s(%s), warp=%d vs %d", + target.Backend, gpu.Backend, + target.Arch, normalizedTargetArch, + gpu.Arch, normalizedGPUArch, + target.WarpSize, gpu.WarpSize) + if backendMatches && archMatches && warpMatches { isMatch = true break diff --git a/mcv/pkg/preflightcheck/vllm.go b/mcv/pkg/preflightcheck/vllm.go index b9cfaef3..d23fcfcb 100644 --- a/mcv/pkg/preflightcheck/vllm.go +++ b/mcv/pkg/preflightcheck/vllm.go @@ -7,6 +7,7 @@ import ( "github.com/redhat-et/GKM/mcv/pkg/accelerator/devices" "github.com/redhat-et/GKM/mcv/pkg/cache" + logging "github.com/sirupsen/logrus" ) // CompareVLLMCacheManifestToGPU compares VLLM manifest entries to GPU info @@ -23,79 +24,116 @@ func CompareVLLMCacheManifestToGPU(manifestPath string, devInfo []devices.Triton } for _, entry := range manifest.VLLM { - // Check if this is a binary cache format - if entry.CacheFormat == "binary" && 
len(entry.BinaryCacheEntries) > 0 { - if err := compareBinaryCacheEntriesToGPU(entry.BinaryCacheEntries, devInfo); err != nil { - return err + // Check cache format and validate accordingly + switch entry.CacheFormat { + case "binary": + if len(entry.BinaryCacheEntries) > 0 { + if err := compareBinaryCacheEntriesToGPU(entry.BinaryCacheEntries, devInfo); err != nil { + return err + } } - } else if len(entry.TritonCacheEntries) > 0 { - // Handle triton cache format (legacy) - convertedEntries := make([]cache.TritonCacheMetadata, len(entry.TritonCacheEntries)) - for i, e := range entry.TritonCacheEntries { - if metadata, ok := e.(cache.TritonCacheMetadata); ok { - convertedEntries[i] = metadata - } else { - return fmt.Errorf("failed to assert type cache.TritonCacheMetadata for entry: %v", e) + case "aot_compile": + if len(entry.AOTCompileEntries) > 0 { + if err := compareAOTCompileCacheEntriesToGPU(entry.AOTCompileEntries, devInfo); err != nil { + return err } } - if err := CompareTritonEntriesToGPU(convertedEntries, devInfo); err != nil { - return err + case "triton": + if len(entry.TritonCacheEntries) > 0 { + // Handle triton cache format (legacy) + // TritonCacheEntries contains JSON-unmarshalled map[string]interface{} values, + // so we need to re-marshal and unmarshal to get proper cache.TritonCacheMetadata structs + convertedEntries := make([]cache.TritonCacheMetadata, len(entry.TritonCacheEntries)) + for i, e := range entry.TritonCacheEntries { + // Re-marshal the entry to JSON + jsonData, err := json.Marshal(e) + if err != nil { + return fmt.Errorf("failed to marshal triton cache entry: %w", err) + } + // Unmarshal into proper struct + if err := json.Unmarshal(jsonData, &convertedEntries[i]); err != nil { + return fmt.Errorf("failed to unmarshal triton cache entry: %w", err) + } + } + if err := CompareTritonEntriesToGPU(convertedEntries, devInfo); err != nil { + return err + } } + default: + return fmt.Errorf("unknown cache format: %s", entry.CacheFormat) } } return nil } +// compareAOTCompileCacheEntriesToGPU validates AOT compile cache entries against GPU hardware +// AOT compile caches have limited metadata, so this primarily relies on the summary-based check +func compareAOTCompileCacheEntriesToGPU(entries []cache.AOTCompileCacheMetadata, _ []devices.TritonGPUInfo) error { + // AOT compile cache entries don't contain cache_key_factors.json with env vars, + // so we can't extract detailed hardware requirements from the manifest. + // The summary label (created during image build) contains the actual GPU info + // and is checked by CompareCacheSummaryLabelToGPU. + // + // Here we just verify the entries exist and log for debugging. + if len(entries) == 0 { + return fmt.Errorf("no AOT compile cache entries found") + } + + // Log the AOT cache entries for debugging + for _, entry := range entries { + logging.Debugf("AOT compile cache: hash=%s, rank=%s, size=%d bytes", + entry.Hash, entry.Rank, entry.FileSize) + } + + // Actual hardware compatibility is validated via the summary label + return nil +} + // compareBinaryCacheEntriesToGPU validates binary cache entries against GPU hardware +// Note: Binary cache metadata doesn't directly contain compute capability. +// The Summary label (built during image creation using actual GPU detection) is the +// primary source of truth for hardware compatibility. This function provides a basic +// backend-level check. 
func compareBinaryCacheEntriesToGPU(entries []cache.BinaryCacheMetadata, devInfo []devices.TritonGPUInfo) error { for i := range entries { entry := &entries[i] - // Extract hardware info from the binary cache metadata + // Extract backend from the binary cache metadata backend := entry.TargetDevice if backend == "" { backend = cache.CUDABackend // Default if not specified } - // Determine arch and warpSize based on backend and env vars - arch := "unknown" - warpSize := 32 // Default for CUDA - + // Basic warp size validation based on backend + expectedWarpSize := 32 // Default for CUDA switch backend { - case "rocm", "hip": - warpSize = 64 // AMD GPUs use 64-wide wavefronts - // Try to extract GPU architecture from env - if env, ok := entry.Env["VLLM_ROCM_CUSTOM_PAGED_ATTN"]; ok && env != nil { - arch = "gfx90a" // Common MI250/MI300 arch, could be extracted more precisely - } - case "cuda": - // Try to extract CUDA architecture - if mainVersion, ok := entry.Env["VLLM_MAIN_CUDA_VERSION"]; ok { - if version, ok := mainVersion.(string); ok { - arch = "sm_" + version - } - } + case cache.ROCmBackend, cache.HIPBackend: + expectedWarpSize = 64 // AMD GPUs use 64-wide wavefronts + case cache.CUDABackend: + expectedWarpSize = 32 // NVIDIA GPUs use 32-wide warps case "tpu": - warpSize = 128 // TPU uses different parallelism model + expectedWarpSize = 128 // TPU uses different parallelism model case "cpu": - warpSize = 1 // CPU doesn't have warp concept + expectedWarpSize = 1 // CPU doesn't have warp concept } - // Check if any GPU matches this binary cache entry + // Check if any GPU matches the backend and warp size matched := false for _, gpu := range devInfo { backendMatches := backend == gpu.Backend - archMatches := arch == gpu.Arch - warpMatches := warpSize == gpu.WarpSize + warpMatches := expectedWarpSize == gpu.WarpSize - if backendMatches && archMatches && warpMatches { + if backendMatches && warpMatches { matched = true + // For detailed arch compatibility, rely on Summary label check + logging.Debugf("Binary cache entry matches GPU: backend=%s, warpSize=%d", + backend, expectedWarpSize) break } } if !matched { - return fmt.Errorf("binary cache entry (backend=%s, arch=%s, warpSize=%d) does not match any available GPU", backend, arch, warpSize) + return fmt.Errorf("binary cache entry (backend=%s, warpSize=%d) does not match any available GPU. Use Summary label for precise arch validation", backend, expectedWarpSize) } }