
Commit 911ea9b

Use prefill for term consistency (#104)
This PR updates the term name encoding -> prefill so that it is consistent with mlc-llm.
1 parent 3dffc15 commit 911ea9b

File tree

2 files changed: +14, -14 lines changed

README.md
web/llm_chat.js

README.md

Lines changed: 1 addition & 1 deletion
@@ -114,7 +114,7 @@ All these are made possible by the open-source ecosystem that we leverage. Speci

 TVM unity also provides an easy way to compose new solutions in the ecosystem. We will continue to bring further optimizations such as fused quantization kernels, and bring them to more platforms.

-One key characteristic of LLM models is the dynamic nature of the model. As the decoding and encoding process depends on computations that grow with the size of tokens, we leverage the first-class dynamic shape support in TVM unity that represents sequence dimensions through symbolic integers. This allows us to plan ahead to statically allocate all the memory needed for the sequence window of interest without padding.
+One key characteristic of LLM models is the dynamic nature of the model. As the decoding and prefill process depends on computations that grow with the size of tokens, we leverage the first-class dynamic shape support in TVM unity that represents sequence dimensions through symbolic integers. This allows us to plan ahead to statically allocate all the memory needed for the sequence window of interest without padding.

 We also leveraged the integration of tensor expressions to quickly express partial-tensor computations such as rotary embedding directly without materializing them into full-tensor matrix computations.

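
The changed sentence above is the one place the README still used "encoding" for the prompt-processing pass. As a rough illustration of the prefill/decoding distinction it describes, the sketch below mirrors the call pattern in web/llm_chat.js later in this commit: prefill takes the whole prompt as a [1, n] tensor whose length varies per request, decoding takes a single [1, 1] token, and the current position is passed as the sequence-length shape so KV-cache memory for the window can be planned once, without padding. The runPrefill/runDecode wrappers are illustrative names only, and the decoding call is assumed to take the same arguments as prefill.

// Illustrative sketch only; runPrefill/runDecode are hypothetical wrappers around
// the prefill/decoding functions renamed in this commit (see web/llm_chat.js below).
function runPrefill(pipeline, promptTokens, curPos) {
  // Whole prompt in one pass: input shape is [1, n], where n varies per request.
  const input = pipeline.tvm.empty([1, promptTokens.length], "int32", pipeline.device);
  input.copyFrom(promptTokens);
  const seqLenShape = pipeline.tvm.makeShapeTuple([curPos]); // position in the window
  return pipeline.prefill(input, seqLenShape, pipeline.kvCache, pipeline.params);
}

function runDecode(pipeline, lastToken, curPos) {
  // One generated token per step: input shape is always [1, 1].
  // Assumes decoding takes the same argument list as prefill.
  const input = pipeline.tvm.empty([1, 1], "int32", pipeline.device);
  input.copyFrom([lastToken]);
  const seqLenShape = pipeline.tvm.makeShapeTuple([curPos]);
  return pipeline.decoding(input, seqLenShape, pipeline.kvCache, pipeline.params);
}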

web/llm_chat.js

Lines changed: 13 additions & 13 deletions
@@ -176,15 +176,15 @@ class LLMChatPipeline {

     this.decodingTotalTime = 0;
     this.decodingTotalTokens = 0;
-    this.encodingTotalTime = 0;
-    this.encodingTotalTokens = 0;
+    this.prefillTotalTime = 0;
+    this.prefillTotalTokens = 0;
     this.conversation = getConversation(config.conv_template);
     this.device = this.tvm.webgpu();
     this.vm = this.tvm.detachFromCurrentScope(
       this.tvm.createVirtualMachine(this.device)
     );
-    this.encoding = this.tvm.detachFromCurrentScope(
-      this.vm.getFunction("encoding")
+    this.prefill = this.tvm.detachFromCurrentScope(
+      this.vm.getFunction("prefill")
     );
     this.decoding = this.tvm.detachFromCurrentScope(
       this.vm.getFunction("decoding")
@@ -218,7 +218,7 @@ class LLMChatPipeline {
     // note: tvm instance is not owned by this class
     this.params.dispose();
     this.decoding.dispose();
-    this.encoding.dispose();
+    this.prefill.dispose();
     this.vm.dispose();
     this.kvCache.dispose();
     this.fclearKVCaches.dispose();
@@ -237,7 +237,7 @@ class LLMChatPipeline {
     var retValue;
     const seqLenShape = this.tvm.makeShapeTuple([curPos]);
     if (inputs.shape[1] > 1) {
-      retValue = this.encoding(
+      retValue = this.prefill(
         inputs, seqLenShape, this.kvCache, this.params
       );
     } else {
@@ -349,9 +349,9 @@ class LLMChatPipeline {
     this.conversation.reset();
     this.#clearKVCache();
     this.decodingTotalTime = 0;
-    this.encodingTotalTime = 0;
+    this.prefillTotalTime = 0;
     this.decodingTotalTokens = 0;
-    this.encodingTotalTokens = 0;
+    this.prefillTotalTokens = 0;
   }

   async generate(inputPrompt, callbackUpdateResponse) {
@@ -413,8 +413,8 @@ class LLMChatPipeline {
       this.decodingTotalTokens += 1;
       this.decodingTotalTime += (tend - tstart) / 1000;
     } else {
-      this.encodingTotalTime += (tend - tstart) / 1000;
-      this.encodingTotalTokens += inputTokenLength;
+      this.prefillTotalTime += (tend - tstart) / 1000;
+      this.prefillTotalTokens += inputTokenLength;
     }

     if (step % this.streamInterval == 0) {
@@ -441,7 +441,7 @@ class LLMChatPipeline {
     this.tvm.beginScope();
     const inputData = this.tvm.empty([1, tokens.length], "int32", this.device);
     inputData.copyFrom(tokens);
-    const encodingStart = performance.now();
+    const prefillStart = performance.now();
     this.#forward(inputData, tokens.length);
     this.tvm.endScope();
     await this.device.sync();
@@ -456,7 +456,7 @@ class LLMChatPipeline {

     const decodingEnd = performance.now();
     const msg = (
-      `encoding-time=${((decodingStart - encodingStart) / 1000).toFixed(4)} sec` +
+      `prefill-time=${((decodingStart - prefillStart) / 1000).toFixed(4)} sec` +
       `decoding-time=${((decodingEnd - decodingStart) / 1000).toFixed(4)} sec`
     );

@@ -475,7 +475,7 @@ class LLMChatPipeline {

   runtimeStatsText() {
     return (
-      `encoding: ${(this.encodingTotalTokens / this.encodingTotalTime).toFixed(4)} tokens/sec, ` +
+      `prefill: ${(this.prefillTotalTokens / this.prefillTotalTime).toFixed(4)} tokens/sec, ` +
       `decoding: ${(this.decodingTotalTokens / this.decodingTotalTime).toFixed(4)} tokens/sec`
     )
   }
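
For readers skimming the diff, the renamed counters feed a simple throughput report: prefill throughput divides all prompt tokens by the time of the single prefill pass, while decoding throughput divides the number of generated tokens by the accumulated per-step time. A minimal sketch of that accounting using the renamed fields (the standalone reportThroughput helper and the sample numbers are illustrative, not part of the file):

// Minimal sketch of the throughput report after the rename; reportThroughput and
// the sample values are hypothetical. The real logic is runtimeStatsText() above.
function reportThroughput(stats) {
  const prefillRate = stats.prefillTotalTokens / stats.prefillTotalTime;
  const decodingRate = stats.decodingTotalTokens / stats.decodingTotalTime;
  return (
    `prefill: ${prefillRate.toFixed(4)} tokens/sec, ` +
    `decoding: ${decodingRate.toFixed(4)} tokens/sec`
  );
}

// Example: a 64-token prompt prefilled in 0.5 s, then 32 tokens decoded in 2 s.
console.log(reportThroughput({
  prefillTotalTokens: 64, prefillTotalTime: 0.5,   // prefill: 128.0000 tokens/sec
  decodingTotalTokens: 32, decodingTotalTime: 2.0, // decoding: 16.0000 tokens/sec
}));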
