
Commit dd892c7

Author: Andrew Briand
Merge branch 'main' into abriand_eplb_nvfp4_2
2 parents: 381bbfd + be493e0

118 files changed: +2477 / -1007 lines changed

Some content is hidden: large commits have some file diffs collapsed by default.


.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

Lines changed: 12 additions & 14 deletions
@@ -7,58 +7,56 @@ set -ex
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-0-16}
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
-NUMA_NODE=${NUMA_NODE:-0}
 
-export CMAKE_BUILD_PARALLEL_LEVEL=32
+export CMAKE_BUILD_PARALLEL_LEVEL=16
 
 # Setup cleanup
 remove_docker_container() {
   set -e;
-  docker rm -f cpu-test-"$NUMA_NODE" || true;
+  docker rm -f cpu-test || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
 
-# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
 
 function cpu_tests() {
   set -e
-  export NUMA_NODE=$2
 
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
     set -e
     pip list"
 
   # offline inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
     set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
 
   # basic online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
+  docker exec cpu-test bash -c '
    set -e
-   VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+   VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
     --backend vllm \
     --dataset-name random \
-    --model meta-llama/Llama-3.2-3B-Instruct \
+    --model Qwen/Qwen3-0.6B \
     --num-prompts 20 \
     --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c cpu_tests

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -632,6 +632,7 @@ steps:
     # we can only upgrade after this is resolved
     # TODO(jerryzh168): resolve the above comment
     - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+    - uv pip install --system conch-triton-kernels
     - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min

.github/workflows/issue_autolabel.yml

Lines changed: 25 additions & 0 deletions
@@ -105,6 +105,31 @@ jobs:
           }
         ],
       },
+      cpu: {
+        // Keyword search - matches whole words only (with word boundaries)
+        keywords: [
+          {
+            term: "CPU Backend",
+            searchIn: "title"
+          },
+          {
+            term: "x86",
+            searchIn: "title"
+          },
+          {
+            term: "ARM",
+            searchIn: "title"
+          },
+          {
+            term: "Apple Silicon",
+            searchIn: "title"
+          },
+          {
+            term: "IBM Z",
+            searchIn: "title"
+          },
+        ],
+      },
       // Add more label configurations here as needed
       // example: {
       //   keywords: [...],
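
The comment on the new `cpu` block says the keyword search matches whole words only, using word boundaries. Below is a minimal Python sketch of that matching behavior; it is illustrative only (the real logic lives in the workflow's inline JavaScript), and the helper name and case-insensitivity are assumptions.

```python
import re

# Hypothetical helper mirroring "whole words only (with word boundaries)"
# from the workflow comment above. Case-insensitivity is an assumption.
CPU_TERMS = ["CPU Backend", "x86", "ARM", "Apple Silicon", "IBM Z"]

def title_matches(title: str, terms: list[str]) -> bool:
    for term in terms:
        # \b word boundaries keep "ARM" from matching inside "alarm" or "farm".
        if re.search(rf"\b{re.escape(term)}\b", title, flags=re.IGNORECASE):
            return True
    return False

print(title_matches("[Bug] ARM build fails on aarch64", CPU_TERMS))  # True
print(title_matches("Set an alarm for the nightly job", CPU_TERMS))  # False
```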

README.md

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
 - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 10 additions & 8 deletions
@@ -489,14 +489,16 @@ __global__ void Marlin(
 #pragma unroll
       for (int i = 0; i < 4; i++) {
         int idx = tid4 * 4 + i;
-        idx = idx < block_num_valid_tokens ? idx : 0;
-        if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
-          sh_block_topk_weights[idx] = __hmul2(
-              global_scale, Dtype::num2num2(Dtype::float2num(
-                                topk_weights_ptr[sh_block_sorted_ids[idx]])));
-        } else {
-          sh_block_topk_weights[idx] = Dtype::num2num2(
-              Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
+        if (idx < block_num_valid_tokens) {
+          if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+            sh_block_topk_weights[idx] =
+                __hmul2(global_scale,
+                        Dtype::num2num2(Dtype::float2num(
+                            topk_weights_ptr[sh_block_sorted_ids[idx]])));
+          } else {
+            sh_block_topk_weights[idx] = Dtype::num2num2(
+                Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
+          }
         }
       }
     }

docker/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.2 \
-    && uv pip install --system flashinfer-jit-cache==0.5.2 \
+    uv pip install --system flashinfer-cubin==0.5.3 \
+    && uv pip install --system flashinfer-jit-cache==0.5.3 \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config
 

docs/community/meetups.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro
 
 Below you'll find slides and recordings from our previous meetups:
 
+- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
 - [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)

docs/design/optimization_levels.md

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+<!-- markdownlint-disable -->
+
+# Optimization Levels
+
+## Overview
+
+vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance: higher levels deliver better performance at the cost of longer startup time. Each level comes with associated defaults to help users get the desired out-of-the-box performance. Importantly, the defaults set by an optimization level are purely defaults; explicit user settings are never overwritten.
+
+## Level Summaries and Usage Examples
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=0
+)
+```
+
+#### `-O1`: Quick Optimizations
+- **Startup**: Moderate startup time
+- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
+- **Use case**: Balance for most development scenarios
+
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=1
+)
+```
+
+#### `-O2`: Full Optimizations (Default)
+- **Startup**: Longer startup time
+- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
+- **Use case**: Production workloads where performance is important. This is the default level. It is also very similar to the previous default; the primary difference is that the noop & fusion flags are enabled.
+
+```bash
+# CLI usage (default, so optional)
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=2  # This is the default
+)
+```
+
+#### `-O3`: Full Optimization
+Still in development. The infrastructure was added now so that the API does not
+need to change in a future release. Currently behaves the same as `-O2`.
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
+2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
+3. **Performance Issues**: Ensure you are using `-O2` for production
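
The overview in this new document stresses that level defaults are only defaults and never overwrite explicit user settings. Below is a minimal, generic Python sketch of that precedence rule; it is not vLLM's actual implementation, the per-level default values are made up for illustration, and only the `-O1`/`-O2` cudagraph modes come from the text above.

```python
# Generic sketch of "level defaults never override explicit settings".
# The -O0 entry is an assumption; this is not vLLM's real configuration code.
LEVEL_DEFAULTS = {
    0: {"use_inductor": False, "cudagraph_mode": "NONE"},
    1: {"use_inductor": True, "cudagraph_mode": "PIECEWISE"},
    2: {"use_inductor": True, "cudagraph_mode": "FULL_AND_PIECEWISE"},
}

def resolve_config(optimization_level: int = 2, **explicit) -> dict:
    """Start from the level's defaults, then let explicit settings win."""
    config = dict(LEVEL_DEFAULTS[optimization_level])
    config.update(explicit)  # explicit user settings take precedence
    return config

print(resolve_config())                               # pure -O2 defaults
print(resolve_config(2, cudagraph_mode="PIECEWISE"))  # explicit value is kept
```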

docs/models/hardware_supported_models/cpu.md

Lines changed: 17 additions & 9 deletions
@@ -1,25 +1,33 @@
 # CPU - Intel® Xeon®
 
+## Validated Hardware
+
+| Hardware |
+| ----------------------------------------- |
+| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) |
+| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) |
+
 ## Supported Models
 
 ### Text-only Language Models
 
 | Model | Architecture | Supported |
 |--------------------------------------|-------------------------------------------|-----------|
-| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM ||
-| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration ||
-| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration ||
-| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM ||
-| Qwen/Qwen3 | Qwen3ForCausalLM ||
-| zai-org/GLM-4.5 | GLMForCausalLM ||
-| google/gemma | GemmaForCausalLM ||
+| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM ||
+| meta-llama/Llama-3.2-3B-Instruct | LlamaForCausalLM ||
+| ibm-granite/granite-3.2-2b-instruct | GraniteForCausalLM ||
+| Qwen/Qwen3-1.7B | Qwen3ForCausalLM ||
+| Qwen/Qwen3-4B | Qwen3ForCausalLM ||
+| Qwen/Qwen3-8B | Qwen3ForCausalLM ||
+| zai-org/glm-4-9b-hf | GLMForCausalLM ||
+| google/gemma-7b | GemmaForCausalLM ||
 
 ### Multimodal Language Models
 
 | Model | Architecture | Supported |
 |--------------------------------------|-------------------------------------------|-----------|
-| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration ||
-| openai/whisper | WhisperForConditionalGeneration ||
+| Qwen/Qwen2.5-VL-7B-Instruct | Qwen2VLForConditionalGeneration ||
+| openai/whisper-large-v3 | WhisperForConditionalGeneration ||
 
 ✅ Runs and optimized.
 🟨 Runs and correct but not optimized to green yet.
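
For a quick smoke test of one of the validated text-only models listed in the table above, the offline Python API already shown in this commit's docs can be used. A minimal sketch follows; it assumes a vLLM installation built with the CPU backend, and the prompt, context length, and sampling settings are arbitrary.

```python
# Offline-inference smoke test for a model from the CPU support table above.
# Assumes vLLM was installed/built for the CPU backend.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen3-8B", max_model_len=2048)
sampling = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["Summarize what vLLM does in one sentence."], sampling)
for out in outputs:
    print(out.outputs[0].text)
```
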
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+# XPU - Intel® GPUs
+
+## Validated Hardware
+
+| Hardware |
+| ----------------------------------------- |
+| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) |
+
+## Supported Models
+
+### Text-only Language Models
+
+| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
+| ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- |
+| openai/gpt-oss-20b | GPTForCausalLM | | ||
+| openai/gpt-oss-120b | GPTForCausalLM | | ||
+| deepseek-ai/DeepSeek-R1-Distill-Llama-8B | LlamaForCausalLM ||| |
+| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | QwenForCausalLM ||| |
+| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | QwenForCausalLM ||| |
+| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM ||| |
+| Qwen/Qwen2.5-72B-Instruct | Qwen2ForCausalLM ||| |
+| Qwen/Qwen3-14B | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-32B | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-30B-A3B | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-30B-A3B-GPTQ-Int4 | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-coder-30B-A3B-Instruct | Qwen3ForCausalLM ||| |
+| Qwen/QwQ-32B | QwenForCausalLM ||| |
+| deepseek-ai/DeepSeek-V2-Lite | DeepSeekForCausalLM ||| |
+| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM ||| |
+| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM ||| |
+| THUDM/GLM-4-9B-chat | GLMForCausalLM ||| |
+| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM ||| |
+| chuhac/TeleChat2-35B | LlamaForCausalLM (TeleChat2 based on Llama arch) ||| |
+| 01-ai/Yi1.5-34B-Chat | YiForCausalLM ||| |
+| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM ||| |
+| deepseek-ai/DeepSeek-Coder-33B-base | DeepSeekCoderForCausalLM ||| |
+| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM ||| |
+| meta-llama/Llama-2-13b-chat-hf | LlamaForCausalLM ||| |
+| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM ||| |
+| Qwen/Qwen1.5-14B-Chat | QwenForCausalLM ||| |
+| Qwen/Qwen1.5-32B-Chat | QwenForCausalLM ||| |
+
+### Multimodal Language Models
+
+| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
+| ---------------------------- | -------------------------------- | ---- | ----------- | ----- |
+| OpenGVLab/InternVL3_5-8B | InternVLForConditionalGeneration ||| |
+| OpenGVLab/InternVL3_5-14B | InternVLForConditionalGeneration ||| |
+| OpenGVLab/InternVL3_5-38B | InternVLForConditionalGeneration ||| |
+| Qwen/Qwen2-VL-7B-Instruct | Qwen2VLForConditionalGeneration ||| |
+| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration ||| |
+| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration ||| |
+| THUDM/GLM-4v-9B | GLM4vForConditionalGeneration ||| |
+| openbmb/MiniCPM-V-4 | MiniCPMVForConditionalGeneration ||| |
+
+### Embedding and Reranker Language Models
+
+| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
+| ----------------------- | ------------------------------ | ---- | ----------- | ----- |
+| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding ||| |
+| Qwen/Qwen3-Reranker-8B | Qwen3ForSequenceClassification ||| |
+
+✅ Runs and optimized.
+🟨 Runs and correct but not optimized to green yet.
+❌ Does not pass accuracy test or does not run.
