@@ -57,16 +57,18 @@ pub struct PrefillRouter {
5757 cancel_token : CancellationToken ,
5858 router_mode : RouterMode ,
5959 enforce_disagg : bool ,
60+ block_size : u32 ,
6061}
6162
6263impl PrefillRouter {
6364 /// Create a disabled prefill router that will never activate (passthrough only)
/// The returned router has an unset `prefill_router` cell and a fresh cancellation token;
/// `router_mode`, `enforce_disagg` and `block_size` are stored exactly as passed in.
/// `block_size` is later multiplied by the worker-reported `overlap_blocks` to estimate
/// cached tokens when the prefill result carries no `cached_tokens` value.
64- pub fn disabled ( router_mode : RouterMode , enforce_disagg : bool ) -> Arc < Self > {
65+ pub fn disabled ( router_mode : RouterMode , enforce_disagg : bool , block_size : u32 ) -> Arc < Self > {
6566 Arc :: new ( Self {
6667 prefill_router : OnceLock :: new ( ) ,
6768 cancel_token : CancellationToken :: new ( ) ,
6869 router_mode,
6970 enforce_disagg,
71+ block_size,
7072 } )
7173 }
7274
@@ -86,6 +88,7 @@ impl PrefillRouter {
8688 cancel_token : cancel_token. clone ( ) ,
8789 router_mode,
8890 enforce_disagg,
91+ block_size : kv_cache_block_size,
8992 } ) ;
9093
9194 // Spawn background task to wait for activation
@@ -180,7 +183,8 @@ impl PrefillRouter {
180183 async fn call_prefill (
181184 & self ,
182185 request : SingleIn < PreprocessedRequest > ,
183- ) -> Result < ( PrefillResult , Option < u64 > ) , PrefillError > {
186+ _block_size : u32 ,
187+ ) -> Result < ( PrefillResult , Option < u64 > , u32 ) , PrefillError > {
184188 // Get the prefill router, error if not activated
185189 let Some ( prefill_router) = self . prefill_router . get ( ) else {
186190 return Err ( PrefillError :: NotActivated ) ;
@@ -247,12 +251,22 @@ impl PrefillRouter {
247251 . get ( "prefill_worker_id" )
248252 . and_then ( |v| v. as_u64 ( ) )
249253 } ) ;
254+
255+ // Extract overlap_blocks from the response (set by prefill worker)
256+ let overlap_blocks = output
257+ . disaggregated_params
258+ . as_ref ( )
259+ . and_then ( |params| params. get ( "overlap_blocks" ) )
260+ . and_then ( |v| v. as_u64 ( ) )
261+ . unwrap_or ( 0 ) as u32 ;
262+
250263 Ok ( (
251264 PrefillResult {
252265 disaggregated_params,
253266 prompt_tokens_details,
254267 } ,
255268 prefill_worker_id,
269+ overlap_blocks,
256270 ) )
257271 }
258272}
297311 let prefill_request = prefill_context;
298312
299313 // Attempt prefill
300- let prefill_result = self . call_prefill ( prefill_request) . await ;
314+ let prefill_result = self . call_prefill ( prefill_request, self . block_size ) . await ;
301315
302316 // Abort if cancelled during prefill
303317 if engine_ctx. is_stopped ( ) || engine_ctx. is_killed ( ) {
@@ -310,8 +324,28 @@ impl
310324
311325 // Handle prefill result
312326 match prefill_result {
313- Ok ( ( prefill_result, prefill_worker_id) ) => {
314- tracing:: debug!( "Prefill succeeded, using disaggregated params for decode" ) ;
327+ Ok ( ( mut prefill_result, prefill_worker_id, overlap_blocks) ) => {
328+ // Prefer vLLM's actual cached_tokens over the router's estimate.
329+ // vLLM queries the actual KV cache on the prefill worker (ground truth).
330+ // The router's overlap is just a prediction based on its global state; it is used (as overlap_blocks * block_size) only when vLLM reports no cached_tokens.
331+ let vllm_cached_tokens = prefill_result
332+ . prompt_tokens_details
333+ . as_ref ( )
334+ . and_then ( |d| d. cached_tokens ) ;
335+ let final_cached_tokens = if let Some ( vllm_value) = vllm_cached_tokens {
336+ vllm_value
337+ } else {
338+ overlap_blocks * self . block_size
339+ } ;
340+
341+ prefill_result. prompt_tokens_details =
342+ Some ( dynamo_async_openai:: types:: PromptTokensDetails {
343+ cached_tokens : Some ( final_cached_tokens) ,
344+ audio_tokens : prefill_result
345+ . prompt_tokens_details
346+ . as_ref ( )
347+ . and_then ( |d| d. audio_tokens ) ,
348+ } ) ;
315349
316350 let mut decode_req = req;
317351 // Update request with prefill result
0 commit comments