Commit 089bbd0

[Model] Support Qwen3 models with enable_thinking field (#686)
- This PR adds the following Qwen3 models to WebLLM's prebuilt models:
  - Qwen3-0.6B: `q0f16, q0f32, q4f16_1, q4f32_1`
  - Other Qwen3: `{1.7B, 4B, 8B} x {q4f16_1, q4f32_1}`
- In addition, we add the `extra_body` field and the `extra_body.enable_thinking` field to support switching between thinking and non-thinking modes.
- We also bumped web-tokenizer to 0.1.6, which resolves newly converted MLC models throwing a Rust-related error.
1 parent 632d347 commit 089bbd0

14 files changed: +751 −222 lines changed
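
For quick reference, here is a minimal, non-streaming sketch of the new `extra_body.enable_thinking` switch. The model id is the one used in the `examples/qwen3` demo added below, which remains the canonical (streaming) usage; the non-streaming call is assumed to follow the usual OpenAI-style API:

```ts
import * as webllm from "@mlc-ai/web-llm";

async function demo() {
  // Model id taken from the example below; any of the newly added Qwen3 ids should work.
  const engine = await webllm.CreateMLCEngine("Qwen3-4B-q4f16_1-MLC");

  // Disable thinking for this request via the new extra_body field.
  const reply = await engine.chat.completions.create({
    messages: [
      { role: "user", content: "How many r's are there in the word strawberry?" },
    ],
    extra_body: { enable_thinking: false },
  });
  console.log(reply.choices[0].message.content);
}

demo();
```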

examples/qwen3/README.md

+8
@@ -0,0 +1,8 @@
### OpenAI API Demos w/ Qwen3

Run `npm install` first, followed by `npm start`.

Note: if you would like to hack the WebLLM core package, you can change the web-llm
dependency to `"file:../.."` and follow the build-from-source instructions in the
project to build WebLLM locally. This option is only recommended if you intend to
modify the WebLLM core package.

examples/qwen3/package.json

+20
@@ -0,0 +1,20 @@
{
  "name": "qwen3_example",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "start": "parcel src/qwen3_example.html --port 8883",
    "build": "parcel build src/qwen3_example.html --dist-dir lib"
  },
  "devDependencies": {
    "buffer": "^5.7.1",
    "parcel": "^2.8.3",
    "process": "^0.11.10",
    "tslib": "^2.3.1",
    "typescript": "^4.9.5",
    "url": "^0.11.3"
  },
  "dependencies": {
    "@mlc-ai/web-llm": "^0.2.78"
  }
}

examples/qwen3/src/qwen3_example.html

+17
@@ -0,0 +1,17 @@
<!doctype html>
<html>
  <script>
    webLLMGlobal = {};
  </script>

  <body>
    <h2>WebLLM Test Page</h2>
    Open console to see output
    <br />
    <br />
    <label id="init-label"> </label>
    <h3>Response</h3>
    <label id="generate-label"> </label>
    <script type="module" src="./qwen3_example.ts"></script>
  </body>
</html>

examples/qwen3/src/qwen3_example.ts

+147
@@ -0,0 +1,147 @@
import * as webllm from "@mlc-ai/web-llm";

function setLabel(id: string, text: string) {
  const label = document.getElementById(id);
  if (label == null) {
    throw Error("Cannot find label " + id);
  }
  label.innerText = text;
}

// Helper method to stream responses from the engine
async function streamResponse(
  engine: webllm.MLCEngineInterface,
  request: webllm.ChatCompletionRequestStreaming,
): Promise<void> {
  console.log("Requesting chat completion with request:", request);
  const asyncChunkGenerator = await engine.chat.completions.create(request);
  let message = "";
  for await (const chunk of asyncChunkGenerator) {
    message += chunk.choices[0]?.delta?.content || "";
    setLabel("generate-label", message);
    if (chunk.usage) {
      console.log(chunk.usage); // only last chunk has usage
    }
    // engine.interruptGenerate(); // works with interrupt as well
  }
  console.log("Final message:\n", await engine.getMessage()); // the concatenated message
}

/**
 * We demonstrate how Qwen3's best practices can be followed in WebLLM. For more, see
 * https://huggingface.co/Qwen/Qwen3-8B#best-practices.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Qwen3-4B-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback },
  );

  /**
   * 1. Default behavior: enable thinking
   */
  let request: webllm.ChatCompletionRequest = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry?",
      },
    ],
    // Specifying `enable_thinking` is optional, as it defaults to thinking.
    // extra_body: {
    //   enable_thinking: true,
    // }
  };
  await streamResponse(engine, request);

  /**
   * 2. Disable thinking with `enable_thinking: false`.
   */
  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry?",
      },
    ],
    extra_body: {
      enable_thinking: false,
    },
  };
  await streamResponse(engine, request);

  /**
   * 3. Disable thinking with soft switch /no_think,
   * or enable thinking with soft switch /think.
   * Using soft switch: "When enable_thinking=True, regardless of whether the user
   * uses /think or /no_think, the model will always output a block wrapped in
   * <think>...</think>. However, the content inside this block may be empty if
   * thinking is disabled. When enable_thinking=False, the soft switches are not
   * valid. Regardless of any /think or /no_think tags input by the user, the
   * model will not generate think content and will not include a <think>...</think> block."
   */
  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry? /no_think",
        // content: "How many r's are there in the word strawberry? /think",
      },
    ],
  };
  await streamResponse(engine, request);

  /**
   * 4. For multi-turn messages, it is recommended to
   * parse out the thinking content in the history
   * messages as described in the Best Practices section.
   */
  const history: webllm.ChatCompletionMessageParam[] = [
    {
      role: "user",
      content: "How many r's are there in the word strawberry? /think",
    },
    {
      role: "assistant",
      content:
        "<think>Dummy thinking content here...</think>\n\nThe answer is 3.",
    },
  ];
  // Preprocess history to remove thinking content
  const preprocessedHistory = history.map((msg) => {
    if (msg.role === "assistant") {
      // Remove the <think>...</think> block at the start of assistant messages,
      // which may be followed by two \n\n line breaks.
      const thinkRegex = /<think>.*?<\/think>\n?\n?/s; // Match <think>...</think> with optional \n\n
      const contentWithoutThink = msg.content!.replace(thinkRegex, "").trim();
      return { ...msg, content: contentWithoutThink };
    }
    return msg; // User messages remain unchanged
  });
  console.log("Preprocessed history:", preprocessedHistory);

  // Now use the preprocessed history in the request
  const newMessage: webllm.ChatCompletionMessageParam = {
    role: "user",
    content: "What about blueberries?",
  };

  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [...preprocessedHistory, newMessage],
  };
  await streamResponse(engine, request);
}

main();
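
If the history-preprocessing step above is needed in more than one place, it can be factored into a small helper. A minimal sketch using the same regex as the example; the `stripThinking` name is illustrative and not part of this PR:

```ts
import * as webllm from "@mlc-ai/web-llm";

// Strip the leading <think>...</think> block (and optional trailing newlines)
// from assistant turns, mirroring the inline preprocessing in the example above.
function stripThinking(
  history: webllm.ChatCompletionMessageParam[],
): webllm.ChatCompletionMessageParam[] {
  const thinkRegex = /<think>.*?<\/think>\n?\n?/s;
  return history.map((msg) =>
    msg.role === "assistant" && typeof msg.content === "string"
      ? { ...msg, content: msg.content.replace(thinkRegex, "").trim() }
      : msg,
  );
}
```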

examples/simple-chat-ts/src/simple_chat.ts

+6
@@ -303,6 +303,12 @@ class ChatUI {
 stream: true,
 messages: this.chatHistory,
 stream_options: { include_usage: true },
+// if model starts with "Qwen3", disable thinking.
+extra_body: this.selectedModel.startsWith("Qwen3")
+  ? {
+      enable_thinking: false,
+    }
+  : undefined,
 });
 // TODO(Charlie): Processing of � requires changes
 for await (const chunk of completion) {
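
The same prefix check can be pulled into a tiny helper if more call sites need it. A sketch mirroring the conditional added above; `extraBodyFor` is a hypothetical name, not part of this commit:

```ts
// Hypothetical helper: disable thinking for Qwen3 models, leave others untouched.
function extraBodyFor(modelId: string): { enable_thinking: boolean } | undefined {
  return modelId.startsWith("Qwen3") ? { enable_thinking: false } : undefined;
}

// Usage mirroring the diff above:
//   extra_body: extraBodyFor(this.selectedModel),
```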
