diff --git a/gkm-codespell.precommit-toml b/gkm-codespell.precommit-toml
index 76f85615..d998123d 100644
--- a/gkm-codespell.precommit-toml
+++ b/gkm-codespell.precommit-toml
@@ -1,3 +1,3 @@
[tool.codespell]
-ignore-words-list = "AfterAll,renderD"
+ignore-words-list = "AfterAll,renderD,ERRO"
skip = './.*,vendor/*,go.sum'
diff --git a/mcv/docs/vllm-binary-cache.md b/mcv/docs/vllm-binary-cache.md
index 9106459b..39ddabd6 100644
--- a/mcv/docs/vllm-binary-cache.md
+++ b/mcv/docs/vllm-binary-cache.md
@@ -2,26 +2,149 @@

## Overview

-MCV supports two vLLM cache formats:
+MCV supports three vLLM cache formats:

1. **vLLM Triton Cache Format** (legacy) - Stores `triton_cache/` and
   `inductor_cache/` inside rank directories
-2. **vLLM Binary Cache Format** (new) - Stores prefix directories
-   (e.g., `backbone/`) inside rank directories
+2. **vLLM Binary Cache Format** (current default) - Stores compiled artifacts in
+   prefix directories with embedded Triton kernels
+3. **vLLM Mega AOT Artifact Format** (PyTorch 2.12+) - Uses
+   `VLLM_USE_MEGA_AOT_ARTIFACT=true` for enhanced AOT serialization

-Both formats share the same top-level structure:
-`torch_compile_cache/{hash}/rank_{rank}_{dp_rank}/`
+**AOT Compile Support**: MCV **supports** the `VLLM_USE_AOT_COMPILE=1` workflow,
+which creates a separate cache structure at
+`torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model`.
+AOT compile caches store ahead-of-time compiled models as single binary files
+rather than multiple compilation artifacts. During preflight checks, AOT cache
+compatibility is validated primarily via the summary label, as the cache metadata
+contains limited hardware information.

-The key differences are **inside the rank directory**:
+Standard cache formats share the structure:
+`torch_compile_cache/{hash}/rank_{rank}_{dp_rank}/{prefix}/`

-- **Triton format**: Contains `triton_cache/` and `inductor_cache/`
-  subdirectories with unpacked artifacts
-- **Binary format**: Contains prefix directories
-  (e.g., `backbone/`, `eagle_head/`) with `cache_key_factors.json`
-  and artifacts that can be either binary files or unpacked directories
+The key differences are **inside the prefix directory**:

-This document describes the **vLLM Binary Cache Format** introduced in recent
-versions of vLLM.
+- **Triton format** (legacy): Contains `triton_cache/` and `inductor_cache/`
+  subdirectories with unpacked Triton kernels
+- **Binary format** (default): Contains `cache_key_factors.json` and multiple
+  `artifact_compile_range_*` files with embedded Triton kernels
+- **Mega AOT format** (PyTorch 2.12+): Same structure as binary format, but uses
+  enhanced AOT serialization (indicated by `VLLM_USE_MEGA_AOT_ARTIFACT: true` in
+  `cache_key_factors.json`)
+
+**Note**: The `VLLM_USE_AOT_COMPILE=1` workflow uses a different structure at
+`torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model` and is
+detected by MCV as the `aot_compile` cache format.
+
+This document describes the **vLLM Binary and Mega AOT Artifact Formats** and how
+torch.compile caching works with MCV.
+
+**Important**: This document covers compilation mode 3 (`VLLM_COMPILE`), which uses
+`~/.cache/vllm/torch_compile_cache/`. There is a separate vLLM feature controlled
+by `VLLM_USE_AOT_COMPILE=1` (enabled by default in PyTorch 2.10+) that creates an
+additional cache at `torch_compile_cache/torch_aot_compile/` with a different
+structure; MCV detects it as the `aot_compile` format and validates its
+compatibility via the summary label.
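+
+To tell which of these layouts a local cache uses, a quick disk check like the
+following works (a sketch; it assumes the default `~/.cache/vllm` location and
+keys off the directory markers described above):
+
+```bash
+CACHE_ROOT=~/.cache/vllm/torch_compile_cache
+
+# AOT compile workflow (VLLM_USE_AOT_COMPILE=1): separate top-level directory
+ls -d "$CACHE_ROOT"/torch_aot_compile/*/rank_*_* 2>/dev/null
+
+# Binary / mega AOT formats: cache_key_factors.json inside each prefix directory
+find "$CACHE_ROOT" -maxdepth 4 -name cache_key_factors.json 2>/dev/null
+
+# Legacy triton format: triton_cache/ and inductor_cache/ under rank directories
+find "$CACHE_ROOT" -maxdepth 3 -type d \( -name triton_cache -o -name inductor_cache \) 2>/dev/null
+```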
+
+## Torch Compile Architecture
+
+### How vLLM Uses torch.compile
+
+When vLLM is configured with compilation mode 3 via
+`--compilation-config '{"mode": 3}'` (not enabled by default), it uses PyTorch's
+`torch.compile` with TorchInductor backend to optimize model execution:
+
+```text
+Model Code → torch.compile → TorchInductor → Triton/CUDA Kernels → GPU Execution
+```
+
+**First Run (Compilation)**:
+
+1. vLLM traces the model with Dynamo
+2. TorchInductor compiles the graph
+3. Triton generates optimized GPU kernels → `/tmp/torchinductor_$USER`
+4. vLLM saves artifacts using `standalone_compile().save(format="binary")`
+5. **PyTorch bundles the Triton kernels into the artifacts**
+6. Complete cache saved to `~/.cache/vllm/torch_compile_cache/`
+
+**Subsequent Runs (Cache Hit)**:
+
+1. vLLM loads artifacts from `~/.cache/vllm/torch_compile_cache/`
+2. **PyTorch extracts embedded Triton kernels → `/tmp/torchinductor_$USER`**
+3. Execution resumes using extracted kernels (~10-20s vs 3-5min compilation)
+
+### Binary vs Mega AOT Serialization
+
+Both binary and mega AOT formats bundle Triton kernels in the artifacts and use
+the same directory structure. They only differ in how the artifact files are
+serialized:
+
+**Binary Serialization** (default):
+
+- Uses PyTorch `standalone_compile().save(format="binary")`
+- Environment: `VLLM_USE_MEGA_AOT_ARTIFACT=false` (default in PyTorch <2.12)
+- Multiple `artifact_compile_range_*` files per prefix
+- Typical size: ~11MB for Qwen3-0.6B model
+
+**Mega AOT Serialization** (PyTorch 2.12+):
+
+- Uses PyTorch `AOTCompiledArtifact.serialize()` with bundled autograd cache
+- Environment: `VLLM_USE_MEGA_AOT_ARTIFACT=true` (default in PyTorch 2.12+)
+- More portable across PyTorch versions
+- Same multi-artifact structure as binary format
+- Typical size: Similar to binary format
+
+**Important**: From MCV's perspective, both formats are **structurally identical**
+and use the same detection and packaging logic:
+
+```text
+~/.cache/vllm/torch_compile_cache/{hash}/rank_{rank}_{dp_rank}/{prefix}/
+```
+
+### VLLM_USE_AOT_COMPILE vs VLLM_USE_MEGA_AOT_ARTIFACT
+
+These are **two different features** that are often confused:
+
+**`VLLM_USE_AOT_COMPILE`** (directory structure change):
+- Enabled by default in PyTorch 2.10+
+- Creates cache at: `torch_compile_cache/torch_aot_compile/{hash}/rank_X_Y/model`
+- Single `model` file (~6.5MB) instead of multiple artifacts
+- **Supported by MCV** (different directory structure, detected as the `aot_compile` format)
+
+**`VLLM_USE_MEGA_AOT_ARTIFACT`** (serialization format):
+- Enabled by default in PyTorch 2.12+
+- Uses regular cache path: `torch_compile_cache/{hash}/rank_X_Y/{prefix}/`
+- Enhanced AOT serialization for better portability
+- **Supported by MCV** (same structure as binary format)
+
+When both are enabled, vLLM creates **both** cache locations.
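+
+For example, after a warmup run with both features enabled, the two locations can
+be listed side by side (a sketch; the hash and rank directory names will vary):
+
+```bash
+# Regular path: binary / mega AOT artifacts (VLLM_USE_MEGA_AOT_ARTIFACT)
+ls ~/.cache/vllm/torch_compile_cache/*/rank_0_0/
+
+# AOT compile path: a single "model" file per rank (VLLM_USE_AOT_COMPILE)
+ls ~/.cache/vllm/torch_compile_cache/torch_aot_compile/*/rank_0_0/
+```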
+
+### The /tmp Cache Directory
+
+During compilation and execution, PyTorch creates temporary files:
+
+```text
+/tmp/torchinductor_$USER/
+├── triton/0/{hash}/
+│   ├── triton_.cubin   # Compiled GPU binary (ELF)
+│   ├── triton_.source  # Triton source code
+│   ├── triton_.ttir    # Triton IR
+│   └── triton_.ptx     # PTX assembly
+├── o7/, dp/, .../      # Python kernel cache
+└── aotautograd/        # AOT autograd cache
+```
+
+**Lifecycle**:
+
+- **First run**: Created during compilation
+- **Cache hit**: Extracted from embedded artifacts
+- **Cleanup**: Cleared on reboot (tmpfs) or manual deletion
+- **Recreation**: Automatic on every vLLM start
+
+**Key Insight**: This directory is **NOT needed for cache portability**.
+The Triton kernels are already embedded in the binary artifacts (verified by
+finding 42 ELF headers in a 5.3MB artifact file).
+
+**MCV does NOT capture `/tmp`** - kernels auto-extract at runtime (~2 seconds).

## Binary Cache Format

@@ -202,21 +325,27 @@ The `manifest.json` file contains comprehensive metadata:

## Hardware Detection

-MCV automatically extracts hardware information from the cache metadata:
+MCV automatically detects hardware information from the system and combines it with cache metadata:

### CUDA

```json
{
  "backend": "cuda",
-  "arch": "sm_12.9",
-  "warp_size": 32
+  "arch": "75",
+  "warp_size": 32,
+  "ptx_version": 590,
+  "cuda_version": "12.9"
}
```

-- **Backend**: Extracted from `VLLM_TARGET_DEVICE`
-- **Arch**: Derived from `VLLM_MAIN_CUDA_VERSION`
+- **Backend**: Extracted from `VLLM_TARGET_DEVICE` environment variable
+- **Arch**: **Detected from actual GPU** on the system as numerical compute capability (e.g., `75` for Tesla T4, `80` for A100, `89` for RTX 4090)
- **Warp Size**: 32 (CUDA default)
+- **PTX Version**: PTX version from NVIDIA driver (e.g., 590 for driver 590.48.01)
+- **CUDA Version**: CUDA toolkit version from `VLLM_MAIN_CUDA_VERSION` (e.g., "12.9")
+
+**Important**: MCV detects the **actual GPU compute capability** from the system, not from environment variables. Compute capability is stored as a numerical value (e.g., `75` = sm_7.5 = Turing architecture). This ensures accurate compatibility checking between cached kernels and the target GPU.

### ROCm/HIP

@@ -228,10 +357,12 @@ MCV automatically extracts hardware information from the cache metadata:
  }
}
```

-- **Backend**: Extracted from `VLLM_TARGET_DEVICE`
-- **Arch**: Detected from ROCm environment variables
+- **Backend**: Extracted from `VLLM_TARGET_DEVICE` environment variable
+- **Arch**: **Detected from actual GPU** on the system (e.g., `gfx90a` for MI250, `gfx942` for MI300)
- **Warp Size**: 64 (AMD wavefront size)

+**Note**: If GPU detection fails, MCV will warn that the cache may not be compatible with the current GPU. Always verify GPU compatibility before deployment.
+
## Format Detection

MCV automatically detects the vLLM cache format by inspecting the
@@ -348,6 +479,7 @@ inspect` or `skopeo inspect` without reading the full manifest.

| vLLM Format | Artifacts | `cacheFormat` | `cache_save_format` | Label |
| ----------- | --------- | ------------- | ------------------- | ----- |
| Binary | Binary files | `"binary"` | `"binary"` | `"binary"` |
+| Mega AOT | Binary files | `"binary"` | `"binary"` | `"binary"` |
| Triton | Unpacked dirs | `"triton"` | N/A | `"unpacked"` |

**Why Three Indicators?**

@@ -361,16 +493,303 @@ inspect` or `skopeo inspect` without reading the full manifest.
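+For a quick check without pulling the image, the format label can be read
+straight from the registry metadata (a sketch; the image name is illustrative
+and `skopeo`/`jq` are assumed to be installed):
+
+```bash
+skopeo inspect docker://quay.io/myorg/vllm-qwen-cache:v1 \
+  | jq -r '.Labels["cache.vllm.image/format"]'
+# Prints "binary" or "unpacked", per the table above
+```
+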
## Comparison: vLLM Binary Cache vs vLLM Triton Cache

-| Aspect | Triton (Legacy) | Binary (New) |
-| ------ | --------------- | ------------ |
-| **Structure** | `{hash}/rank_X_Y/` | `{hash}/rank_X_Y/` |
-| **Inside Rank** | `triton_cache/` + `inductor_cache/` | `{prefix}/` |
-| **Metadata** | Triton JSON | `cache_key_factors.json` |
-| **Storage** | Unpacked | Binary/unpacked |
-| **Multiprocess** | No | Yes (binary) |
-| **Distributed** | Full rank/DP | Full rank/DP |
-| **Manifest** | `"triton"` | `"binary"` |
-| **Label** | `"unpacked"` | `"binary"`/`"unpacked"` |
+| Aspect | Triton (Legacy) | Binary | Mega AOT |
+| ------ | --------------- | ------------ | -------------- |
+| **Structure** | `{hash}/rank_X_Y/` | `{hash}/rank_X_Y/` | `{hash}/rank_X_Y/` |
+| **Inside Rank** | `triton_cache/` + `inductor_cache/` | `{prefix}/` | `{prefix}/` |
+| **Metadata** | Triton JSON | `cache_key_factors.json` | `cache_key_factors.json` |
+| **Serialization** | Unpacked | `standalone_compile().save()` | `AOTCompiledArtifact.serialize()` |
+| **Storage** | Unpacked | Binary/unpacked | Binary |
+| **Multiprocess** | No | Yes (binary) | Yes |
+| **Distributed** | Full rank/DP | Full rank/DP | Full rank/DP |
+| **Manifest** | `"triton"` | `"binary"` | `"binary"` |
+| **Label** | `"unpacked"` | `"binary"`/`"unpacked"` | `"binary"` |
+| **PyTorch Req** | Any | Any | 2.12+ |
+| **Env Var** | - | `VLLM_USE_MEGA_AOT_ARTIFACT=false` | `VLLM_USE_MEGA_AOT_ARTIFACT=true` |
+| **MCV Support** | ✅ Yes | ✅ Yes | ✅ Yes |
+
+**Note**: The `VLLM_USE_AOT_COMPILE=1` workflow is **not** shown in this table
+because it creates a different structure at `torch_compile_cache/torch_aot_compile/`;
+MCV detects it separately as the `aot_compile` cache format.
+
+## Complete Workflow Example
+
+This section demonstrates the complete end-to-end workflow of capturing a vLLM cache, creating an OCI image, and extracting it on another system.
+
+### Prerequisites
+
+- Docker or Podman installed
+- MCV binary built (`make mcv`)
+- Access to a container registry (e.g., quay.io)
+- GPU available on the system (NVIDIA or AMD)
+
+### Step 1: Start vLLM Container
+
+Start a vLLM container with a model. This example uses an NVIDIA GPU with CUDA:
+
+```bash
+# For NVIDIA GPUs with CUDA 13.0
+sudo podman run -d \
+  --name vllm-server \
+  --privileged \
+  --device /dev/nvidia0:/dev/nvidia0 \
+  --device /dev/nvidiactl:/dev/nvidiactl \
+  --device /dev/nvidia-uvm:/dev/nvidia-uvm \
+  --device /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools \
+  -v /usr/lib64:/usr/lib64:ro \
+  -v /usr/lib64:/usr/local/cuda-13.0/compat:ro \
+  -v /usr/local/cuda:/usr/local/cuda:ro \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  --env 'LD_LIBRARY_PATH=/usr/lib64:/usr/local/cuda/lib64:/usr/local/cuda-13.0/compat' \
+  -e NVIDIA_VISIBLE_DEVICES=all \
+  -e NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+  -p 8000:8000 \
+  --ipc=host \
+  docker.io/vllm/vllm-openai:latest-cu130 \
+  --model Qwen/Qwen3-0.6B
+```
+
+For AMD GPUs with ROCm, adjust the device mounts and environment variables accordingly.
+
+### Step 2: Wait for Cache Generation
+
+The vLLM server compiles kernels during model loading and warmup. 
Wait for the compilation to complete: + +```bash +# Monitor vLLM logs to see compilation progress +sudo podman logs -f vllm-server + +# Look for messages like: +# INFO 04-02 13:08:05 [monitor.py:48] torch.compile took 53.19 s in total +# INFO 04-02 13:08:28 [core.py:281] init engine (profile, create kv cache, warmup model) took 76.50 seconds +# INFO 04-02 13:08:31 [api_server.py:580] Starting vLLM server on http://0.0.0.0:8000 + +# Once you see "Starting vLLM server", the cache has been generated +``` + +The compiled kernels are stored in `/root/.cache/vllm/torch_compile_cache/` inside the container. + +**Optional**: You can also send a test request to verify the server is working: + +```bash +# Send a test request (cache already compiled during startup) +curl -s http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50 + }' | jq -r '.choices[0].message.content' +``` + +### Step 3: Capture Cache from Container + +Copy the generated cache from the running container to your host: + +```bash +# Create directory for cache +mkdir -p ~/vllm-qwen-cache + +# Copy cache from container +sudo podman cp vllm-server:/root/.cache/vllm ~/vllm-qwen-cache/ + +# Fix ownership +sudo chown -R $(whoami):$(whoami) ~/vllm-qwen-cache/ + +# Verify cache was captured +du -sh ~/vllm-qwen-cache/vllm +# Output: ~18M /home/user/vllm-qwen-cache/vllm + +# Inspect cache structure +ls -la ~/vllm-qwen-cache/vllm/torch_compile_cache/ +# Should show hash directories (e.g., fe20897a43/) +``` + +### Step 4: Build Cache Image with MCV + +Create an OCI container image containing the cache: + +```bash +# Install buildah if not already installed +sudo dnf install -y buildah + +# Build cache image +mcv -c \ + -i quay.io/myorg/vllm-qwen-cache:v1 \ + -d ~/vllm-qwen-cache/vllm \ + --builder buildah + +# Output: +# INFO Using buildah to build the image +# INFO Detected cache components: [vllm] +# INFO Detected GPU: backend=cuda, arch=75, warpSize=32, PTX=590 +# INFO Image built! 3cbede0b2cb5... +# INFO OCI image created successfully. +``` + +### Step 5: Inspect Cache Image + +Verify the cache image metadata and labels: + +```bash +# View image in buildah +buildah images | grep vllm-qwen-cache + +# Inspect image labels +buildah inspect quay.io/myorg/vllm-qwen-cache:v1 | \ + jq -r '.OCIv1.config.Labels' + +# Expected output: +# { +# "cache.vllm.image/cache-size-bytes": "18152945", +# "cache.vllm.image/entry-count": "2", +# "cache.vllm.image/format": "binary", +# "cache.vllm.image/summary": "{\"targets\":[{\"backend\":\"cuda\",\"arch\":\"75\",\"warp_size\":32,\"ptx_version\":590,\"cuda_version\":\"12.9\"}]}" +# } +``` + +**Important**: Notice that the `arch` field shows the **actual GPU compute capability** (e.g., `75` for Tesla T4 which is sm_7.5), not the CUDA toolkit version. + +### Step 6: Push to Registry + +Push the cache image to a container registry: + +```bash +# Login to registry +buildah login quay.io + +# Push image +buildah push quay.io/myorg/vllm-qwen-cache:v1 + +# Verify push +buildah images | grep vllm-qwen-cache +``` + +### Step 7: Extract Cache on Target System + +On another system with compatible GPU, extract the cache: + +```bash +# Pull and extract cache +mcv -e -i quay.io/myorg/vllm-qwen-cache:v1 + +# MCV performs preflight checks: +# 1. Fetches image and reads metadata +# 2. Detects local GPU (e.g., Tesla T4 with compute capability 75) +# 3. Compares with cache requirements +# 4. 
Extracts cache to ~/.cache/vllm/ if compatible + +# Expected output on compatible GPU: +# INFO Preflight GPU compatibility check passed. +# INFO Preflight completed matched="[0]" unmatched="[]" +# INFO Extracting cache to directory: /home/user/.cache/vllm +``` + +**Preflight Check Failure**: If the GPU is incompatible, MCV will reject the extraction: + +```bash +# Example: Trying to use A100 (compute capability 80) cache on T4 (75) +mcv -e -i quay.io/myorg/vllm-a100-cache:v1 + +# Output: +# ERRO Preflight check failed: no compatible GPU found +# WARN No compatible GPUs found for the image. +``` + +### Step 8: Verify Cache with GPU Compatibility Check + +Check compatibility without extracting: + +```bash +# Check if current GPU is compatible with cached kernels +mcv --check-compat -i quay.io/myorg/vllm-qwen-cache:v1 + +# On compatible GPU (Tesla T4): +# No output means compatible + +# On incompatible GPU: +# ERRO Preflight check failed: no compatible GPU found +# WARN No compatible GPUs found for the image. +``` + +### Step 9: View Detailed GPU Information + +Get detailed information about system GPUs: + +```bash +# Display GPU fleet information +mcv --gpu-info + +# Output: +# INFO Detected 1 accelerator(s) +# GPU Fleet: +# - GPU Type: TU104GL [Tesla T4] +# Driver Version: 590.48.01 +# IDs: [0] +``` + +### Step 10: Use Cache with vLLM + +Start vLLM with the extracted cache: + +```bash +# The cache is now in ~/.cache/vllm/ +# Start vLLM normally - it will automatically use the cache +podman run -d \ + --name vllm-with-cache \ + ... # same mounts and settings as before + -v ~/.cache/vllm:/root/.cache/vllm \ + docker.io/vllm/vllm-openai:latest-cu130 \ + --model Qwen/Qwen3-0.6B + +# vLLM will skip compilation and use cached kernels +# First request will be much faster! +``` + +### Workflow Summary + +```text +┌─────────────────────────┐ +│ 1. Start vLLM │ +│ Container │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 2. Wait for Kernel │ +│ Compilation │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 3. Copy Cache from │ +│ Container to Host │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 4. Build OCI Image │ +│ with MCV │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 5. Push to Registry │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 6. Pull & Extract on │ +│ Target System │ +│ (Preflight Checks) │ +└───────────┬─────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ 7. Use Cache with │ +│ vLLM on Target │ +└─────────────────────────┘ +``` ## Usage Examples @@ -428,6 +847,7 @@ Key files in vLLM that implement binary cache: 3. **Include full env in manifest** for cache compatibility checking 4. **Verify hardware match** using image labels before deployment 5. **Check cache_save_format** in manifest when extracting caches +6. **Use AOT artifacts for cross-PyTorch-version portability** (requires PyTorch 2.10+) ## Migration from vLLM Triton Cache to vLLM Binary Cache @@ -439,6 +859,259 @@ To migrate from vLLM triton cache format to vLLM binary cache format: 4. Package new cache with MCV (automatically detected) 5. 
Both vLLM cache formats are supported, no breaking changes +## Practical Guide + +### Generating a Cache + +**Environment Setup**: + +```bash +# For binary format (default): +export VLLM_COMPILE_CACHE_SAVE_FORMAT=binary +export VLLM_USE_MEGA_AOT_ARTIFACT=false # or omit (default) + +# For AOT format (more portable): +export VLLM_COMPILE_CACHE_SAVE_FORMAT=binary +export VLLM_USE_MEGA_AOT_ARTIFACT=true # requires PyTorch 2.10+ +``` + +**Run vLLM Warmup**: + +```bash +# Enable compilation with mode 3 (VLLM_COMPILE) for cache generation +vllm serve my-model \ + --compilation-config '{"mode": 3}' \ + --tensor-parallel-size 1 + +# Alternatively, use the named mode: +# vllm serve my-model --compilation-config '{"mode": "VLLM_COMPILE"}' --tensor-parallel-size 1 + +# Make sample requests to trigger compilation: +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "my-model", "prompt": "Hello", "max_tokens": 100}' +``` + +**Note**: Mode 3 (VLLM_COMPILE) is required for cache generation. Other modes: + +- Mode 0: No compilation (default) +- Mode 1: Standard torch.compile +- Mode 2: Single Dynamo trace + +**Verify Cache**: + +```bash +ls -lh ~/.cache/vllm/torch_compile_cache/ +# Should show a 10-char hash directory (e.g., 8d0a361fbc) + +# Check cache contents: +find ~/.cache/vllm/torch_compile_cache/ -type f | head +``` + +### Packaging with MCV + +**Create Container Image**: + +```bash +mcv -c \ + -d ~/.cache/vllm/torch_compile_cache/{hash} \ + -i quay.io/myorg/my-model-cache:v1 +``` + +**Verify Image Labels**: + +```bash +skopeo inspect containers-storage:quay.io/myorg/my-model-cache:v1 \ + | jq '.Labels' + +# Expected labels: +# { +# "cache.vllm.image/cache-size-bytes": "95000000", +# "cache.vllm.image/entry-count": "1", +# "cache.vllm.image/format": "binary", +# "cache.vllm.image/summary": "{\"targets\":[{\"backend\":\"cuda\",...}]}" +# } +``` + +### Using a Cached Image + +**Extract Cache**: + +```bash +mcv -e -i quay.io/myorg/my-model-cache:v1 + +# MCV extracts to: ~/.cache/vllm/torch_compile_cache/{hash}/ +``` + +**Start vLLM**: + +```bash +# vLLM automatically detects and uses the cache +vllm serve my-model --tensor-parallel-size 1 + +# Look for log message: +# INFO: Directly load the compiled graph(s) from the cache, took X.X s +``` + +### Cache Compatibility + +A cache is compatible if: + +1. **GPU architecture** matches (check: `nvidia-smi --query-gpu=compute_cap`) +2. **CUDA/ROCm version** compatible (check: `nvcc --version` or `rocm-smi`) +3. **PyTorch version** compatible +4. **Model code** unchanged (code hash must match) +5. **vLLM configuration** matches (TP size, compile level, etc.) +6. **Environment variables** match (see `cache_key_factors.json`) + +**Check Compatibility**: + +```bash +# View cache metadata: +cat ~/.cache/vllm/torch_compile_cache/*/rank_0_0/*/cache_key_factors.json \ + | jq '{target: .env.VLLM_TARGET_DEVICE, cuda: .env.VLLM_MAIN_CUDA_VERSION}' + +# Compare with system: +nvidia-smi +# or +rocm-smi +``` + +## Troubleshooting + +### Cache Not Being Used + +**Symptom**: vLLM recompiles on every start despite having a cache + +**Common Causes**: + +See the [Cache Compatibility](#cache-compatibility) section above for requirements. + +**Debug Steps**: + +```bash +# 1. Check if cache exists +ls ~/.cache/vllm/torch_compile_cache/ + +# 2. Enable debug logging +export VLLM_LOGGING_LEVEL=DEBUG + +# 3. Check for hash mismatch in logs +grep "cache" vllm.log | grep -i "hash\|miss" + +# 4. 
Verify GPU compatibility
+python -c "import torch; print(torch.cuda.get_device_capability())"
+```
+
+### Slow Startup with Cache
+
+**Symptom**: vLLM takes 20+ seconds to start with cache
+
+**Normal Behavior**: 10-20 seconds for kernel extraction from artifacts is expected
+
+**If Slower**:
+
+- Check disk I/O performance: `iostat -x 1`
+- Verify `/tmp` is not on slow storage (NFS, etc.)
+- Consider using `tmpfs` for `/tmp`: `df -h /tmp`
+
+### Missing Kernels Error
+
+**Symptom**: Runtime errors about missing Triton kernels
+
+**Causes**:
+
+1. Corrupted artifacts
+2. Incomplete cache (warmup didn't cover all batch sizes)
+3. Disk space issues during generation
+
+**Solutions**:
+
+```bash
+# 1. Delete and regenerate cache
+rm -rf ~/.cache/vllm/torch_compile_cache/*
+
+# 2. Verify disk space
+df -h ~/.cache/vllm/
+
+# 3. Check artifact integrity
+file ~/.cache/vllm/torch_compile_cache/*/rank_0_0/*/artifact_*
+# Should show: "data" (binary format)
+```
+
+### AOT Artifact Serialization Issues
+
+**Symptom**: AOT artifacts fail to load (when using `VLLM_USE_MEGA_AOT_ARTIFACT=true`)
+
+**Requirements**:
+
+- PyTorch 2.10.0 or later
+- `VLLM_USE_MEGA_AOT_ARTIFACT=true`
+- Compilation mode 3 (`--compilation-config '{"mode": 3}'`)
+
+**Verify**:
+
+```bash
+# Check PyTorch version
+python -c "import torch; print(torch.__version__)"
+
+# Verify AOT flag in cache
+grep "VLLM_USE_MEGA_AOT_ARTIFACT" \
+  ~/.cache/vllm/torch_compile_cache/*/rank_0_0/*/cache_key_factors.json
+```
+
+## Advanced Topics
+
+### Multi-GPU Caching
+
+For distributed serving (tensor/pipeline parallelism and data parallelism), vLLM
+creates one directory per rank, named `rank_{rank}_{dp_rank}`:
+
+```text
+torch_compile_cache/{hash}/
+├── rank_0_0/    # Rank 0, data-parallel rank 0
+├── rank_0_1/    # Rank 0, data-parallel rank 1
+├── rank_1_0/    # Rank 1, data-parallel rank 0
+└── rank_1_1/    # Rank 1, data-parallel rank 1
+```
+
+MCV captures all rank directories. Extract the entire hash directory for
+multi-GPU deployments.
+
+### Multiple Model Components
+
+Models with speculative decoding have multiple components:
+
+```text
+rank_0_0/
+├── backbone/      # Main model
+│   └── artifact_*
+└── eagle_head/    # Draft model for speculation
+    └── artifact_*
+```
+
+MCV captures all prefix directories automatically.
+
+### Cache Size Optimization
+
+**Typical Sizes**:
+
+- Small models (< 1B params): 50-100 MB
+- Medium models (1-10B params): 100-500 MB
+- Large models (10B+ params): 500 MB - 2 GB
+
+**Factors Affecting Size**:
+
+- Number of compiled ranges (batch sizes)
+- Number of layers
+- Triton kernel count
+- Autotune configurations
+
+**Reduce Size**:
+
+- Use fewer compile ranges: `VLLM_COMPILE_RANGES=[128,512]` vs default
+- Binary format is smaller than unpacked
+- AOT format is similar to binary
+
## See Also

- [spec-compat.md](./spec-compat.md) - OCI image specification
diff --git a/mcv/pkg/accelerator/devices/amd.go b/mcv/pkg/accelerator/devices/amd.go
index 7e435ab0..0188510d 100644
--- a/mcv/pkg/accelerator/devices/amd.go
+++ b/mcv/pkg/accelerator/devices/amd.go
@@ -47,13 +47,41 @@ func (d *gpuAMD) SetHwType(hwType string) {
}

// SetTritonInfo sets the Triton GPU information for the AMD device.
+// When restoring from cache, this also populates the devices map.
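+// Entries are keyed by GPU ID, so a subsequent SetSummaries call can attach
+// the matching DeviceSummary to each device.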
func (d *gpuAMD) SetTritonInfo(info []TritonGPUInfo) { d.tritonInfo = info + + // Rebuild devices map from cached triton info + if d.devices == nil { + d.devices = make(map[int]GPUDevice) + } + for _, tritonInfo := range info { + d.devices[tritonInfo.ID] = GPUDevice{ + ID: tritonInfo.ID, + TritonInfo: tritonInfo, + // Summary will be set by SetSummaries + } + } } // SetSummaries sets the summaries for the AMD device. +// When restoring from cache, this also updates the Summary field in devices map. func (d *gpuAMD) SetSummaries(summaries []DeviceSummary) { d.summaries = summaries + + // Update Summary in devices map if it exists + if d.devices != nil { + for _, summary := range summaries { + // Parse GPU ID from summary.ID (which is a string like "0", "1", etc.) + var gpuID int + if _, err := fmt.Sscanf(summary.ID, "%d", &gpuID); err == nil { + if dev, exists := d.devices[gpuID]; exists { + dev.Summary = summary + d.devices[gpuID] = dev + } + } + } + } } type AMDGPUInfo struct { diff --git a/mcv/pkg/accelerator/devices/nvml.go b/mcv/pkg/accelerator/devices/nvml.go index 542baa6c..ba9b07dc 100644 --- a/mcv/pkg/accelerator/devices/nvml.go +++ b/mcv/pkg/accelerator/devices/nvml.go @@ -62,13 +62,41 @@ func (d *gpuNvml) SetHwType(hwType string) { } // SetTritonInfo sets the Triton GPU information for the NVML device. +// When restoring from cache, this also populates the devices map. func (d *gpuNvml) SetTritonInfo(info []TritonGPUInfo) { d.tritonInfo = info + + // Rebuild devices map from cached triton info + if d.devices == nil { + d.devices = make(map[int]GPUDevice) + } + for _, tritonInfo := range info { + d.devices[tritonInfo.ID] = GPUDevice{ + ID: tritonInfo.ID, + TritonInfo: tritonInfo, + // Summary will be set by SetSummaries + } + } } // SetSummaries sets the summaries for the NVML device. +// When restoring from cache, this also updates the Summary field in devices map. func (d *gpuNvml) SetSummaries(summaries []DeviceSummary) { d.summaries = summaries + + // Update Summary in devices map if it exists + if d.devices != nil { + for _, summary := range summaries { + // Parse GPU ID from summary.ID (which is a string like "0", "1", etc.) + var gpuID int + if _, err := fmt.Sscanf(summary.ID, "%d", &gpuID); err == nil { + if dev, exists := d.devices[gpuID]; exists { + dev.Summary = summary + d.devices[gpuID] = dev + } + } + } + } } func nvmlCheck(r *Registry) { @@ -222,7 +250,7 @@ func getNVMLTritonGPUInfo(device nvml.Device) (TritonGPUInfo, error) { Name: name, UUID: uuid, ComputeCapability: fmt.Sprintf("%d.%d", major, minor), // Formatting the compute capability - Arch: strconv.Itoa(major*10 + minor), // TODO double check this + Arch: strconv.Itoa(major*10 + minor), // Numeric string for Triton compatibility (e.g., "75") WarpSize: warpSize, MemoryTotalMB: mem.Total / (1024 * 1024), PTXVersion: ptxVersion, diff --git a/mcv/pkg/accelerator/devices/rocm.go b/mcv/pkg/accelerator/devices/rocm.go index 8423f499..f5905fbf 100644 --- a/mcv/pkg/accelerator/devices/rocm.go +++ b/mcv/pkg/accelerator/devices/rocm.go @@ -46,13 +46,41 @@ func (d *gpuROCm) SetHwType(hwType string) { } // SetTritonInfo sets the Triton GPU information for the ROCM device. +// When restoring from cache, this also populates the devices map. 
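+// Entries are keyed by GPU ID, so a subsequent SetSummaries call can attach
+// the matching DeviceSummary to each device.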
func (d *gpuROCm) SetTritonInfo(info []TritonGPUInfo) { d.tritonInfo = info + + // Rebuild devices map from cached triton info + if d.devices == nil { + d.devices = make(map[int]GPUDevice) + } + for _, tritonInfo := range info { + d.devices[tritonInfo.ID] = GPUDevice{ + ID: tritonInfo.ID, + TritonInfo: tritonInfo, + // Summary will be set by SetSummaries + } + } } // SetSummaries sets the summaries for the ROCM device. +// When restoring from cache, this also updates the Summary field in devices map. func (d *gpuROCm) SetSummaries(summaries []DeviceSummary) { d.summaries = summaries + + // Update Summary in devices map if it exists + if d.devices != nil { + for _, summary := range summaries { + // Parse GPU ID from summary.ID (which is a string like "0", "1", etc.) + var gpuID int + if _, err := fmt.Sscanf(summary.ID, "%d", &gpuID); err == nil { + if dev, exists := d.devices[gpuID]; exists { + dev.Summary = summary + d.devices[gpuID] = dev + } + } + } + } } type ROCMGPUInfo struct { diff --git a/mcv/pkg/cache/types.go b/mcv/pkg/cache/types.go index 8810ccd4..7f7d96db 100644 --- a/mcv/pkg/cache/types.go +++ b/mcv/pkg/cache/types.go @@ -1,9 +1,11 @@ package cache type SummaryTargetInfo struct { - Backend string `json:"backend"` - Arch string `json:"arch"` - WarpSize int `json:"warp_size"` + Backend string `json:"backend"` + Arch string `json:"arch"` + WarpSize int `json:"warp_size"` + PTXVersion int `json:"ptx_version,omitempty"` // CUDA PTX version (for CUDA backend) + CUDAVersion string `json:"cuda_version,omitempty"` // CUDA toolkit version (e.g., "12.9") } type Summary struct { @@ -88,3 +90,13 @@ type CacheKeyFactors struct { ConfigHash string `json:"config_hash"` Env map[string]interface{} `json:"env"` } + +// AOTCompileCacheMetadata represents metadata for AOT compile cache artifacts +// These are created when VLLM_USE_AOT_COMPILE=1 and stored at: +// torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model +type AOTCompileCacheMetadata struct { + Hash string `json:"hash"` // Full hash from directory name + Rank string `json:"rank"` // rank_X_Y format + ModelFile string `json:"model_file"` // Always "model" + FileSize int64 `json:"file_size"` // Size of model file in bytes +} diff --git a/mcv/pkg/cache/vllm.go b/mcv/pkg/cache/vllm.go index e782642d..c5ddccd5 100644 --- a/mcv/pkg/cache/vllm.go +++ b/mcv/pkg/cache/vllm.go @@ -11,6 +11,8 @@ import ( "strconv" "strings" + "github.com/redhat-et/GKM/mcv/pkg/accelerator/devices" + "github.com/redhat-et/GKM/mcv/pkg/config" "github.com/redhat-et/GKM/mcv/pkg/constants" logging "github.com/sirupsen/logrus" ) @@ -27,6 +29,9 @@ const ( // Cache format constants BinaryCacheFormat = "binary" CUDABackend = "cuda" + ROCmBackend = "rocm" + HIPBackend = "hip" + UnknownBackend = "UnknownBackend" // torchAOTCompileDirName is the extra directory vLLM introduces above // the per-model hash dir when VLLM_USE_AOT_COMPILE is enabled. 
@@ -46,10 +51,11 @@ type VLLMCache struct { } type VLLMCacheMetadata struct { - VllmHash string `json:"vllmHash"` - CacheFormat string `json:"cacheFormat"` // "triton" or "binary" - TritonCacheEntries []CacheEntry `json:"triton,omitempty"` - BinaryCacheEntries []BinaryCacheMetadata `json:"binary,omitempty"` + VllmHash string `json:"vllmHash"` + CacheFormat string `json:"cacheFormat"` // "triton", "binary", or "aot_compile" + TritonCacheEntries []CacheEntry `json:"triton,omitempty"` + BinaryCacheEntries []BinaryCacheMetadata `json:"binary,omitempty"` + AOTCompileEntries []AOTCompileCacheMetadata `json:"aot_compile,omitempty"` } // DetectVLLMCache walks the given root directory to detect whether VLLM-style cache artifacts exist @@ -165,6 +171,35 @@ func DetectVLLMCache(cacheDir string) *VLLMCache { logging.Debugf("Adding VLLM triton cache metadata: %+v", vllmMetadata) metadata = append(metadata, vllmMetadata) } + + // Check for AOT compile cache at torch_compile_cache/torch_aot_compile/ + aotCompilePath := filepath.Join(torchCompileCachePath, "torch_aot_compile") + if _, err := os.Stat(aotCompilePath); err == nil { + logging.Debugf("Detecting AOT compile cache at: %s", aotCompilePath) + aotCacheData, aotErr := detectAOTCompileCache(aotCompilePath) + if aotErr == nil && len(aotCacheData) > 0 { + logging.Debugf("Detected AOT compile cache format with %d entries", len(aotCacheData)) + // Group AOT cache entries by hash + aotByHash := make(map[string][]AOTCompileCacheMetadata) + for _, aotCache := range aotCacheData { + aotByHash[aotCache.Hash] = append(aotByHash[aotCache.Hash], aotCache) + } + + // Create metadata entries for each hash + for hash, entries := range aotByHash { + vllmMetadata := VLLMCacheMetadata{ + VllmHash: hash, + CacheFormat: "aot_compile", + AOTCompileEntries: entries, + } + logging.Debugf("Adding VLLM AOT compile cache metadata: %+v", vllmMetadata) + metadata = append(metadata, vllmMetadata) + count++ + } + } else if aotErr != nil { + logging.Debugf("No AOT compile cache detected: %v", aotErr) + } + } } } @@ -372,6 +407,73 @@ func detectMegaAOTCache(hashDir string) ([]BinaryCacheMetadata, error) { return out, nil } +// detectAOTCompileCache detects AOT compile cache format +// These are created when VLLM_USE_AOT_COMPILE=1 and stored at: +// torch_compile_cache/torch_aot_compile/{hash}/rank_{rank}_{dp_rank}/model +func detectAOTCompileCache(aotPath string) ([]AOTCompileCacheMetadata, error) { + var aotCaches []AOTCompileCacheMetadata + + if _, err := os.Stat(aotPath); os.IsNotExist(err) { + return nil, fmt.Errorf("AOT compile cache path does not exist: %s", aotPath) + } + + // Walk the torch_aot_compile directory looking for {hash}/rank_X_Y/model files + entries, err := os.ReadDir(aotPath) + if err != nil { + return nil, fmt.Errorf("failed to read AOT compile directory: %w", err) + } + + for _, hashEntry := range entries { + if !hashEntry.IsDir() { + continue + } + + hashDir := filepath.Join(aotPath, hashEntry.Name()) + logging.Debugf("Inspecting AOT hash directory: %s", hashDir) + + // Look for rank_X_Y directories + rankEntries, err := os.ReadDir(hashDir) + if err != nil { + logging.Warnf("Failed to read AOT hash directory %s: %v", hashDir, err) + continue + } + + rankDirRegex := regexp.MustCompile(`^rank_\d+_\d+$`) + for _, rankEntry := range rankEntries { + if !rankEntry.IsDir() { + continue + } + if !rankDirRegex.MatchString(rankEntry.Name()) { + continue + } + + // Check for model file + modelPath := filepath.Join(hashDir, rankEntry.Name(), "model") + stat, err := 
os.Stat(modelPath) + if err != nil { + logging.Debugf("No model file found at %s: %v", modelPath, err) + continue + } + + aotCache := AOTCompileCacheMetadata{ + Hash: hashEntry.Name(), + Rank: rankEntry.Name(), + ModelFile: "model", + FileSize: stat.Size(), + } + + logging.Debugf("Found AOT compile cache: %+v", aotCache) + aotCaches = append(aotCaches, aotCache) + } + } + + if len(aotCaches) == 0 { + return nil, fmt.Errorf("no AOT compile cache detected") + } + + return aotCaches, nil +} + func (v *VLLMCache) Name() string { return constants.VLLM } func (v *VLLMCache) EntryCount() int { @@ -426,6 +528,78 @@ func (v *VLLMCache) Summary() string { return string(jsonData) } +// detectActualGPUInfo detects the actual GPU architecture from the current system +// This is called during cache image creation to detect the real hardware, +// regardless of what VLLM_TARGET_DEVICE says in the cache metadata. +// Returns backend, arch, warpSize, and ptxVersion +func detectActualGPUInfo() (backend, arch string, warpSize, ptxVersion int) { + // Initialize config if not already done + if !config.IsInitialized() { + if _, err := config.Initialize(config.ConfDir); err != nil { + logging.WithError(err).Debug("Failed to initialize config for GPU detection") + return UnknownBackend, UnknownBackend, 0, 0 + } + } + + // Get device registry + registry := devices.GetRegistry() + if registry == nil { + logging.Debug("Failed to get device registry") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Try to start GPU device - this will auto-detect CUDA/ROCm + device := devices.Startup(config.GPU, registry) + if device == nil { + logging.Debug("No GPU detected on system") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Initialize the device to ensure GPU info is populated + // This is important when the device was restored from cache + if err := device.Init(); err != nil { + logging.WithError(err).Debug("Failed to initialize device") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Get GPU info for the first GPU (index 0) + gpuInfo, err := device.GetGPUInfo(0) + if err != nil { + logging.WithError(err).Debug("Failed to get GPU info from device") + return UnknownBackend, UnknownBackend, 0, 0 + } + + // Determine backend and warp size from the detected GPU + detectedBackend := gpuInfo.Backend + if detectedBackend == "" { + // Fallback: try to infer from device type + switch device.DevType() { + case devices.NVML: + detectedBackend = CUDABackend + case devices.ROCM, devices.AMD: + detectedBackend = ROCmBackend + default: + detectedBackend = UnknownBackend + } + } + + detectedWarpSize := gpuInfo.WarpSize + if detectedWarpSize == 0 { + // Fallback to defaults + switch detectedBackend { + case CUDABackend: + detectedWarpSize = 32 + case ROCmBackend, HIPBackend: + detectedWarpSize = 64 + } + } + + logging.Infof("Detected GPU: backend=%s, arch=%s, warpSize=%d, PTX=%d", + detectedBackend, gpuInfo.Arch, detectedWarpSize, gpuInfo.PTXVersion) + + return detectedBackend, gpuInfo.Arch, detectedWarpSize, gpuInfo.PTXVersion +} + // buildBinaryCacheSummary builds a summary from binary cache metadata func buildBinaryCacheSummary(metadata []VLLMCacheMetadata) (*Summary, error) { targetMap := make(map[string]SummaryTargetInfo) @@ -435,46 +609,85 @@ func buildBinaryCacheSummary(metadata []VLLMCacheMetadata) (*Summary, error) { continue } + // Detect actual GPU from the system once per metadata entry + // NOTE: We detect the actual system GPU rather than trusting VLLM_TARGET_DEVICE + // because caches may be copied 
from other systems + detectedBackend, detectedArch, detectedWarpSize, detectedPTX := detectActualGPUInfo() + for i := range meta.BinaryCacheEntries { binaryCache := &meta.BinaryCacheEntries[i] - // Extract target info from the stored environment variables - backend := binaryCache.TargetDevice - if backend == "" { - backend = CUDABackend // Default if not specified + + // Use detected GPU info from actual system + backend := detectedBackend + arch := detectedArch + warpSize := detectedWarpSize + ptxVersion := detectedPTX + + // For vLLM binary cache, CUDA uses sm_ prefix (e.g., sm_75) + // AMD/ROCm already has gfx prefix (e.g., gfx1151) + if backend == CUDABackend { + arch = fmt.Sprintf("sm_%s", arch) } - // Determine arch and warpSize based on backend and env vars - arch := "unknown" - warpSize := 32 // Default for CUDA + // Extract toolkit versions from cache environment for reference + cudaVersion := "" + rocmVersion := "" + + // Handle special cases where no GPU is detected + if backend == UnknownBackend { + logging.Warn("Could not detect GPU on system, using cache metadata as fallback") + // Fallback to cache metadata if GPU detection failed + backend = binaryCache.TargetDevice + if backend == "" { + backend = CUDABackend // Default if not specified + } + // Set default warp sizes + switch backend { + case ROCmBackend, HIPBackend: + warpSize = 64 + case CUDABackend: + warpSize = 32 + case "tpu": + warpSize = 128 + case "cpu": + warpSize = 1 + } + } + // Extract toolkit version info from environment switch backend { - case "rocm", "hip": - warpSize = 64 // AMD GPUs use 64-wide wavefronts - // Try to extract GPU architecture from env - if env, ok := binaryCache.Env["VLLM_ROCM_CUSTOM_PAGED_ATTN"]; ok && env != nil { - // ROCm is being used - arch = "gfx90a" // Common MI250/MI300 arch, could be extracted more precisely + case CUDABackend: + if cudaVer, ok := binaryCache.Env["VLLM_MAIN_CUDA_VERSION"]; ok { + if ver, ok := cudaVer.(string); ok { + cudaVersion = ver + logging.Debugf("CUDA toolkit version from cache: %s", cudaVersion) + } } - case "cuda": - // Try to extract CUDA architecture - if mainVersion, ok := binaryCache.Env["VLLM_MAIN_CUDA_VERSION"]; ok { - if version, ok := mainVersion.(string); ok { - arch = "sm_" + version + case ROCmBackend, HIPBackend: + if rocmVer, ok := binaryCache.Env["ROCM_VERSION"]; ok { + if ver, ok := rocmVer.(string); ok { + rocmVersion = ver + logging.Debugf("ROCm version from cache: %s", rocmVersion) } } - case "tpu": - warpSize = 128 // TPU uses different parallelism model - case "cpu": - warpSize = 1 // CPU doesn't have warp concept } - key := fmt.Sprintf("%s-%s-%d", backend, arch, warpSize) + // Create unique key including version info for better cache matching + key := fmt.Sprintf("%s-%s-%d-%s-%s", backend, arch, warpSize, cudaVersion, rocmVersion) if _, exists := targetMap[key]; !exists { - targetMap[key] = SummaryTargetInfo{ + targetInfo := SummaryTargetInfo{ Backend: backend, Arch: arch, WarpSize: warpSize, } + // Add version info if available + if ptxVersion > 0 { + targetInfo.PTXVersion = ptxVersion + } + if cudaVersion != "" { + targetInfo.CUDAVersion = cudaVersion + } + targetMap[key] = targetInfo } } } diff --git a/mcv/pkg/preflightcheck/triton.go b/mcv/pkg/preflightcheck/triton.go index b9eaa489..7293efa0 100644 --- a/mcv/pkg/preflightcheck/triton.go +++ b/mcv/pkg/preflightcheck/triton.go @@ -63,7 +63,11 @@ func CompareTritonEntriesToGPU(entries []cache.TritonCacheMetadata, devInfo []de for _, gpuInfo := range devInfo { backendMatches := 
entry.Backend == gpuInfo.Backend - archMatches := entry.Arch == gpuInfo.Arch + // Normalize architectures for comparison (handles "75" vs "sm_75" for CUDA) + entryArchStr := cache.ConvertArchToString(entry.Arch) + normalizedEntryArch := normalizeArchForComparison(entry.Backend, entryArchStr) + normalizedGPUArch := normalizeArchForComparison(gpuInfo.Backend, gpuInfo.Arch) + archMatches := normalizedEntryArch == normalizedGPUArch warpMatches := entry.WarpSize == gpuInfo.WarpSize ptxMatches := true diff --git a/mcv/pkg/preflightcheck/utils.go b/mcv/pkg/preflightcheck/utils.go index 84d20da7..eb5699fe 100644 --- a/mcv/pkg/preflightcheck/utils.go +++ b/mcv/pkg/preflightcheck/utils.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "strings" v1 "github.com/google/go-containerregistry/pkg/v1" "github.com/redhat-et/GKM/mcv/pkg/accelerator" @@ -14,6 +15,15 @@ import ( logging "github.com/sirupsen/logrus" ) +// normalizeArchForComparison normalizes architecture strings for comparison +// Strips sm_ prefix from CUDA architectures to handle both "75" and "sm_75" formats +func normalizeArchForComparison(backend, arch string) string { + if backend == "cuda" { + return strings.TrimPrefix(arch, "sm_") + } + return arch +} + func CompareCacheSummaryLabelToGPU(img v1.Image, labels map[string]string, devInfo []devices.TritonGPUInfo) (matched, unmatched []devices.TritonGPUInfo, err error) { logging.Debug("Starting cache summary label preflight check...") if labels == nil { @@ -40,13 +50,30 @@ func CompareCacheSummaryLabelToGPU(img v1.Image, labels map[string]string, devIn return nil, nil, fmt.Errorf("failed to parse summary label: %w", err) } + logging.Debugf("Preflight check: devInfo has %d GPUs, summary has %d targets", len(devInfo), len(summary.Targets)) + for i, gpu := range devInfo { + logging.Debugf("GPU[%d]: backend=%s, arch=%s, warp=%d", i, gpu.Backend, gpu.Arch, gpu.WarpSize) + } + for i, target := range summary.Targets { + logging.Debugf("Target[%d]: backend=%s, arch=%s, warp=%d", i, target.Backend, target.Arch, target.WarpSize) + } + for _, gpu := range devInfo { isMatch := false for _, target := range summary.Targets { backendMatches := target.Backend == gpu.Backend - archMatches := target.Arch == gpu.Arch + // Normalize architectures for comparison (handles "75" vs "sm_75" for CUDA) + normalizedTargetArch := normalizeArchForComparison(target.Backend, target.Arch) + normalizedGPUArch := normalizeArchForComparison(gpu.Backend, gpu.Arch) + archMatches := normalizedTargetArch == normalizedGPUArch warpMatches := target.WarpSize == gpu.WarpSize + logging.Debugf("Comparing cache target vs GPU: backend=%s vs %s, arch=%s(%s) vs %s(%s), warp=%d vs %d", + target.Backend, gpu.Backend, + target.Arch, normalizedTargetArch, + gpu.Arch, normalizedGPUArch, + target.WarpSize, gpu.WarpSize) + if backendMatches && archMatches && warpMatches { isMatch = true break diff --git a/mcv/pkg/preflightcheck/vllm.go b/mcv/pkg/preflightcheck/vllm.go index b9cfaef3..d23fcfcb 100644 --- a/mcv/pkg/preflightcheck/vllm.go +++ b/mcv/pkg/preflightcheck/vllm.go @@ -7,6 +7,7 @@ import ( "github.com/redhat-et/GKM/mcv/pkg/accelerator/devices" "github.com/redhat-et/GKM/mcv/pkg/cache" + logging "github.com/sirupsen/logrus" ) // CompareVLLMCacheManifestToGPU compares VLLM manifest entries to GPU info @@ -23,79 +24,116 @@ func CompareVLLMCacheManifestToGPU(manifestPath string, devInfo []devices.Triton } for _, entry := range manifest.VLLM { - // Check if this is a binary cache format - if entry.CacheFormat == "binary" && 
len(entry.BinaryCacheEntries) > 0 { - if err := compareBinaryCacheEntriesToGPU(entry.BinaryCacheEntries, devInfo); err != nil { - return err + // Check cache format and validate accordingly + switch entry.CacheFormat { + case "binary": + if len(entry.BinaryCacheEntries) > 0 { + if err := compareBinaryCacheEntriesToGPU(entry.BinaryCacheEntries, devInfo); err != nil { + return err + } } - } else if len(entry.TritonCacheEntries) > 0 { - // Handle triton cache format (legacy) - convertedEntries := make([]cache.TritonCacheMetadata, len(entry.TritonCacheEntries)) - for i, e := range entry.TritonCacheEntries { - if metadata, ok := e.(cache.TritonCacheMetadata); ok { - convertedEntries[i] = metadata - } else { - return fmt.Errorf("failed to assert type cache.TritonCacheMetadata for entry: %v", e) + case "aot_compile": + if len(entry.AOTCompileEntries) > 0 { + if err := compareAOTCompileCacheEntriesToGPU(entry.AOTCompileEntries, devInfo); err != nil { + return err } } - if err := CompareTritonEntriesToGPU(convertedEntries, devInfo); err != nil { - return err + case "triton": + if len(entry.TritonCacheEntries) > 0 { + // Handle triton cache format (legacy) + // TritonCacheEntries contains JSON-unmarshalled map[string]interface{} values, + // so we need to re-marshal and unmarshal to get proper cache.TritonCacheMetadata structs + convertedEntries := make([]cache.TritonCacheMetadata, len(entry.TritonCacheEntries)) + for i, e := range entry.TritonCacheEntries { + // Re-marshal the entry to JSON + jsonData, err := json.Marshal(e) + if err != nil { + return fmt.Errorf("failed to marshal triton cache entry: %w", err) + } + // Unmarshal into proper struct + if err := json.Unmarshal(jsonData, &convertedEntries[i]); err != nil { + return fmt.Errorf("failed to unmarshal triton cache entry: %w", err) + } + } + if err := CompareTritonEntriesToGPU(convertedEntries, devInfo); err != nil { + return err + } } + default: + return fmt.Errorf("unknown cache format: %s", entry.CacheFormat) } } return nil } +// compareAOTCompileCacheEntriesToGPU validates AOT compile cache entries against GPU hardware +// AOT compile caches have limited metadata, so this primarily relies on the summary-based check +func compareAOTCompileCacheEntriesToGPU(entries []cache.AOTCompileCacheMetadata, _ []devices.TritonGPUInfo) error { + // AOT compile cache entries don't contain cache_key_factors.json with env vars, + // so we can't extract detailed hardware requirements from the manifest. + // The summary label (created during image build) contains the actual GPU info + // and is checked by CompareCacheSummaryLabelToGPU. + // + // Here we just verify the entries exist and log for debugging. + if len(entries) == 0 { + return fmt.Errorf("no AOT compile cache entries found") + } + + // Log the AOT cache entries for debugging + for _, entry := range entries { + logging.Debugf("AOT compile cache: hash=%s, rank=%s, size=%d bytes", + entry.Hash, entry.Rank, entry.FileSize) + } + + // Actual hardware compatibility is validated via the summary label + return nil +} + // compareBinaryCacheEntriesToGPU validates binary cache entries against GPU hardware +// Note: Binary cache metadata doesn't directly contain compute capability. +// The Summary label (built during image creation using actual GPU detection) is the +// primary source of truth for hardware compatibility. This function provides a basic +// backend-level check. 
func compareBinaryCacheEntriesToGPU(entries []cache.BinaryCacheMetadata, devInfo []devices.TritonGPUInfo) error { for i := range entries { entry := &entries[i] - // Extract hardware info from the binary cache metadata + // Extract backend from the binary cache metadata backend := entry.TargetDevice if backend == "" { backend = cache.CUDABackend // Default if not specified } - // Determine arch and warpSize based on backend and env vars - arch := "unknown" - warpSize := 32 // Default for CUDA - + // Basic warp size validation based on backend + expectedWarpSize := 32 // Default for CUDA switch backend { - case "rocm", "hip": - warpSize = 64 // AMD GPUs use 64-wide wavefronts - // Try to extract GPU architecture from env - if env, ok := entry.Env["VLLM_ROCM_CUSTOM_PAGED_ATTN"]; ok && env != nil { - arch = "gfx90a" // Common MI250/MI300 arch, could be extracted more precisely - } - case "cuda": - // Try to extract CUDA architecture - if mainVersion, ok := entry.Env["VLLM_MAIN_CUDA_VERSION"]; ok { - if version, ok := mainVersion.(string); ok { - arch = "sm_" + version - } - } + case cache.ROCmBackend, cache.HIPBackend: + expectedWarpSize = 64 // AMD GPUs use 64-wide wavefronts + case cache.CUDABackend: + expectedWarpSize = 32 // NVIDIA GPUs use 32-wide warps case "tpu": - warpSize = 128 // TPU uses different parallelism model + expectedWarpSize = 128 // TPU uses different parallelism model case "cpu": - warpSize = 1 // CPU doesn't have warp concept + expectedWarpSize = 1 // CPU doesn't have warp concept } - // Check if any GPU matches this binary cache entry + // Check if any GPU matches the backend and warp size matched := false for _, gpu := range devInfo { backendMatches := backend == gpu.Backend - archMatches := arch == gpu.Arch - warpMatches := warpSize == gpu.WarpSize + warpMatches := expectedWarpSize == gpu.WarpSize - if backendMatches && archMatches && warpMatches { + if backendMatches && warpMatches { matched = true + // For detailed arch compatibility, rely on Summary label check + logging.Debugf("Binary cache entry matches GPU: backend=%s, warpSize=%d", + backend, expectedWarpSize) break } } if !matched { - return fmt.Errorf("binary cache entry (backend=%s, arch=%s, warpSize=%d) does not match any available GPU", backend, arch, warpSize) + return fmt.Errorf("binary cache entry (backend=%s, warpSize=%d) does not match any available GPU. Use Summary label for precise arch validation", backend, expectedWarpSize) } }