
Commit dd892c7

Author: Andrew Briand
Merge branch 'main' into abriand_eplb_nvfp4_2
2 parents: 381bbfd + be493e0

118 files changed: +2477 / -1007 lines changed

Some content is hidden: large commits have some file diffs collapsed by default.


.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh

Lines changed: 12 additions & 14 deletions
@@ -7,58 +7,56 @@ set -ex
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-0-16}
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
-NUMA_NODE=${NUMA_NODE:-0}
 
-export CMAKE_BUILD_PARALLEL_LEVEL=32
+export CMAKE_BUILD_PARALLEL_LEVEL=16
 
 # Setup cleanup
 remove_docker_container() {
   set -e;
-  docker rm -f cpu-test-"$NUMA_NODE" || true;
+  docker rm -f cpu-test || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
 
-# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
 
 function cpu_tests() {
   set -e
-  export NUMA_NODE=$2
 
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
     set -e
     pip list"
 
   # offline inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
     set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run kernel tests
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
 
   # basic online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c '
+  docker exec cpu-test bash -c '
    set -e
-   VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+   VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
     --backend vllm \
     --dataset-name random \
-    --model meta-llama/Llama-3.2-3B-Instruct \
+    --model Qwen/Qwen3-0.6B \
     --num-prompts 20 \
     --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c cpu_tests

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -632,6 +632,7 @@ steps:
     # we can only upgrade after this is resolved
     # TODO(jerryzh168): resolve the above comment
     - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+    - uv pip install --system conch-triton-kernels
     - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min

.github/workflows/issue_autolabel.yml

Lines changed: 25 additions & 0 deletions
@@ -105,6 +105,31 @@ jobs:
           }
         ],
       },
+      cpu: {
+        // Keyword search - matches whole words only (with word boundaries)
+        keywords: [
+          {
+            term: "CPU Backend",
+            searchIn: "title"
+          },
+          {
+            term: "x86",
+            searchIn: "title"
+          },
+          {
+            term: "ARM",
+            searchIn: "title"
+          },
+          {
+            term: "Apple Silicon",
+            searchIn: "title"
+          },
+          {
+            term: "IBM Z",
+            searchIn: "title"
+          },
+        ],
+      },
       // Add more label configurations here as needed
       // example: {
       //   keywords: [...],
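
The comment on the new `cpu` block says the keyword search matches whole words only, using word boundaries. Below is a minimal Python sketch of that matching behavior; it is illustrative only (the real logic lives in the workflow's inline JavaScript), and the helper name and case-insensitivity are assumptions.

```python
import re

# Hypothetical helper mirroring "whole words only (with word boundaries)"
# from the workflow comment above. Case-insensitivity is an assumption.
CPU_TERMS = ["CPU Backend", "x86", "ARM", "Apple Silicon", "IBM Z"]

def title_matches(title: str, terms: list[str]) -> bool:
    for term in terms:
        # \b word boundaries keep "ARM" from matching inside "alarm" or "farm".
        if re.search(rf"\b{re.escape(term)}\b", title, flags=re.IGNORECASE):
            return True
    return False

print(title_matches("[Bug] ARM build fails on aarch64", CPU_TERMS))  # True
print(title_matches("Set an alarm for the nightly job", CPU_TERMS))  # False
```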

README.md

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
 - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
 - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 10 additions & 8 deletions
@@ -489,14 +489,16 @@ __global__ void Marlin(
 #pragma unroll
       for (int i = 0; i < 4; i++) {
         int idx = tid4 * 4 + i;
-        idx = idx < block_num_valid_tokens ? idx : 0;
-        if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
-          sh_block_topk_weights[idx] = __hmul2(
-              global_scale, Dtype::num2num2(Dtype::float2num(
-                                topk_weights_ptr[sh_block_sorted_ids[idx]])));
-        } else {
-          sh_block_topk_weights[idx] = Dtype::num2num2(
-              Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
+        if (idx < block_num_valid_tokens) {
+          if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+            sh_block_topk_weights[idx] =
+                __hmul2(global_scale,
+                        Dtype::num2num2(Dtype::float2num(
+                            topk_weights_ptr[sh_block_sorted_ids[idx]])));
+          } else {
+            sh_block_topk_weights[idx] = Dtype::num2num2(
+                Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
+          }
         }
       }
     }

docker/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.2 \
-    && uv pip install --system flashinfer-jit-cache==0.5.2 \
+    uv pip install --system flashinfer-cubin==0.5.3 \
+    && uv pip install --system flashinfer-jit-cache==0.5.3 \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config
 

docs/community/meetups.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro
 
 Below you'll find slides and recordings from our previous meetups:
 
+- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
 - [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
 - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)

docs/design/optimization_levels.md

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+<!-- markdownlint-disable -->
+
+# Optimization Levels
+
+## Overview
+
+vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance: higher levels deliver better performance at the cost of longer startup time. Each level comes with associated defaults to help users get the desired out-of-the-box performance. Importantly, the defaults set by an optimization level are purely defaults; explicit user settings are never overwritten.
+
+## Level Summaries and Usage Examples
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=0
+)
+```
+
+#### `-O1`: Quick Optimizations
+- **Startup**: Moderate startup time
+- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
+- **Use case**: Balance for most development scenarios
+
+```bash
+# CLI usage
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=1
+)
+```
+
+#### `-O2`: Full Optimizations (Default)
+- **Startup**: Longer startup time
+- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
+- **Use case**: Production workloads where performance is important. This is the default level. It is also very similar to the previous default; the primary difference is that the noop & fusion flags are enabled.
+
+```bash
+# CLI usage (default, so optional)
+python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
+
+# Python API usage
+from vllm.entrypoints.llm import LLM
+
+llm = LLM(
+    model="RedHatAI/Llama-3.2-1B-FP8",
+    optimization_level=2  # This is the default
+)
+```
+
+#### `-O3`: Full Optimization
+Still in development. The infrastructure was added now so that the API does not
+need to change in a future release. Currently behaves the same as `-O2`.
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
+2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
+3. **Performance Issues**: Ensure you are using `-O2` for production
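
The overview in this new document stresses that level defaults are only defaults and never overwrite explicit user settings. Below is a minimal, generic Python sketch of that precedence rule; it is not vLLM's actual implementation, the per-level default values are made up for illustration, and only the `-O1`/`-O2` cudagraph modes come from the text above.

```python
# Generic sketch of "level defaults never override explicit settings".
# The -O0 entry is an assumption; this is not vLLM's real configuration code.
LEVEL_DEFAULTS = {
    0: {"use_inductor": False, "cudagraph_mode": "NONE"},
    1: {"use_inductor": True, "cudagraph_mode": "PIECEWISE"},
    2: {"use_inductor": True, "cudagraph_mode": "FULL_AND_PIECEWISE"},
}

def resolve_config(optimization_level: int = 2, **explicit) -> dict:
    """Start from the level's defaults, then let explicit settings win."""
    config = dict(LEVEL_DEFAULTS[optimization_level])
    config.update(explicit)  # explicit user settings take precedence
    return config

print(resolve_config())                               # pure -O2 defaults
print(resolve_config(2, cudagraph_mode="PIECEWISE"))  # explicit value is kept
```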

docs/models/hardware_supported_models/cpu.md

Lines changed: 17 additions & 9 deletions
@@ -1,25 +1,33 @@
 # CPU - Intel® Xeon®
 
+## Validated Hardware
+
+| Hardware |
+| ----------------------------------------- |
+| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) |
+| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) |
+
 ## Supported Models
 
 ### Text-only Language Models
 
 | Model | Architecture | Supported |
 |--------------------------------------|-------------------------------------------|-----------|
-| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM ||
-| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration ||
-| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration ||
-| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM ||
-| Qwen/Qwen3 | Qwen3ForCausalLM ||
-| zai-org/GLM-4.5 | GLMForCausalLM ||
-| google/gemma | GemmaForCausalLM ||
+| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM ||
+| meta-llama/Llama-3.2-3B-Instruct | LlamaForCausalLM ||
+| ibm-granite/granite-3.2-2b-instruct | GraniteForCausalLM ||
+| Qwen/Qwen3-1.7B | Qwen3ForCausalLM ||
+| Qwen/Qwen3-4B | Qwen3ForCausalLM ||
+| Qwen/Qwen3-8B | Qwen3ForCausalLM ||
+| zai-org/glm-4-9b-hf | GLMForCausalLM ||
+| google/gemma-7b | GemmaForCausalLM ||
 
 ### Multimodal Language Models
 
 | Model | Architecture | Supported |
 |--------------------------------------|-------------------------------------------|-----------|
-| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration ||
-| openai/whisper | WhisperForConditionalGeneration ||
+| Qwen/Qwen2.5-VL-7B-Instruct | Qwen2VLForConditionalGeneration ||
+| openai/whisper-large-v3 | WhisperForConditionalGeneration ||
 
 ✅ Runs and optimized.
 🟨 Runs and correct but not optimized to green yet.
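
For a quick smoke test of one of the validated text-only models listed in the table above, the offline Python API already shown in this commit's docs can be used. A minimal sketch follows; it assumes a vLLM installation built with the CPU backend, and the prompt, context length, and sampling settings are arbitrary.

```python
# Offline-inference smoke test for a model from the CPU support table above.
# Assumes vLLM was installed/built for the CPU backend.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen3-8B", max_model_len=2048)
sampling = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["Summarize what vLLM does in one sentence."], sampling)
for out in outputs:
    print(out.outputs[0].text)
```
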
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+# XPU - Intel® GPUs
+
+## Validated Hardware
+
+| Hardware |
+| ----------------------------------------- |
+| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) |
+
+## Supported Models
+
+### Text-only Language Models
+
+| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
+| ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- |
+| openai/gpt-oss-20b | GPTForCausalLM | | ||
+| openai/gpt-oss-120b | GPTForCausalLM | | ||
+| deepseek-ai/DeepSeek-R1-Distill-Llama-8B | LlamaForCausalLM ||| |
+| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | QwenForCausalLM ||| |
+| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | QwenForCausalLM ||| |
+| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM ||| |
+| Qwen/Qwen2.5-72B-Instruct | Qwen2ForCausalLM ||| |
+| Qwen/Qwen3-14B | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-32B | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-30B-A3B | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-30B-A3B-GPTQ-Int4 | Qwen3ForCausalLM ||| |
+| Qwen/Qwen3-coder-30B-A3B-Instruct | Qwen3ForCausalLM ||| |
+| Qwen/QwQ-32B | QwenForCausalLM ||| |
+| deepseek-ai/DeepSeek-V2-Lite | DeepSeekForCausalLM ||| |
+| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM ||| |
+| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM ||| |
+| THUDM/GLM-4-9B-chat | GLMForCausalLM ||| |
+| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM ||| |
+| chuhac/TeleChat2-35B | LlamaForCausalLM (TeleChat2 based on Llama arch) ||| |
+| 01-ai/Yi1.5-34B-Chat | YiForCausalLM ||| |
+| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM ||| |
+| deepseek-ai/DeepSeek-Coder-33B-base | DeepSeekCoderForCausalLM ||| |
+| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM ||| |
+| meta-llama/Llama-2-13b-chat-hf | LlamaForCausalLM ||| |
+| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM ||| |
+| Qwen/Qwen1.5-14B-Chat | QwenForCausalLM ||| |
+| Qwen/Qwen1.5-32B-Chat | QwenForCausalLM ||| |
+
+### Multimodal Language Models
+
+| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
+| ---------------------------- | -------------------------------- | ---- | ----------- | ----- |
+| OpenGVLab/InternVL3_5-8B | InternVLForConditionalGeneration ||| |
+| OpenGVLab/InternVL3_5-14B | InternVLForConditionalGeneration ||| |
+| OpenGVLab/InternVL3_5-38B | InternVLForConditionalGeneration ||| |
+| Qwen/Qwen2-VL-7B-Instruct | Qwen2VLForConditionalGeneration ||| |
+| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration ||| |
+| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration ||| |
+| THUDM/GLM-4v-9B | GLM4vForConditionalGeneration ||| |
+| openbmb/MiniCPM-V-4 | MiniCPMVForConditionalGeneration ||| |
+
+### Embedding and Reranker Language Models
+
+| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
+| ----------------------- | ------------------------------ | ---- | ----------- | ----- |
+| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding ||| |
+| Qwen/Qwen3-Reranker-8B | Qwen3ForSequenceClassification ||| |
+
+✅ Runs and optimized.
+🟨 Runs and correct but not optimized to green yet.
+❌ Does not pass accuracy test or does not run.
