@@ -362,3 +362,95 @@ def test_request_cancellation_trtllm_prefill_cancel(
362362 logger .info (
363363 "Completion request cancellation during prefill phase detected successfully"
364364 )
365+
366+
@pytest.mark.trtllm_marker
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
def test_request_cancellation_trtllm_kv_transfer_cancel(
    request, runtime_services, predownload_models
):
    """
    End-to-end test for request cancellation during prefill to decode KV transfer phase.

    The test starts a frontend, a prefill worker and a decode worker, sends a
    completion request with a long prompt, and cancels it from the client side
    once the prefill worker logs that it has started sending the KV cache.

    It then checks that:
      * the decode worker logs "Aborted Request ID: <request id>",
      * the frontend logs "issued control message Kill to sender",
      * a follow-up streaming chat completion reaches the decode worker and
        yields the expected number of streamed responses, i.e. the workers are
        still usable after the cancellation.

    Args:
        request: Passed through to DynamoFrontendProcess / DynamoWorkerProcess.
        runtime_services: Not referenced in the body; presumably requested only
            for its setup side effects -- TODO confirm against the fixture.
        predownload_models: Not referenced in the body; presumably requested
            only for its setup side effects -- TODO confirm against the fixture.
    """

    # Step 1: Start the frontend
    with DynamoFrontendProcess(request) as frontend:
        logger.info("Frontend started successfully")

        # Step 2: Start the prefill worker
        with DynamoWorkerProcess(request, mode="prefill") as prefill_worker:
            logger.info(f"Prefill Worker PID: {prefill_worker.get_pid()}")

            # Step 3: Start the decode worker
            with DynamoWorkerProcess(request, mode="decode") as decode_worker:
                logger.info(f"Decode Worker PID: {decode_worker.get_pid()}")

                # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
                time.sleep(2)

                # Step 4: Test request cancellation during KV transfer phase
                logger.info(
                    "Testing completion request cancellation during KV transfer phase..."
                )

                # Send request with long prompt
                cancellable_req = send_cancellable_request(
                    "completion", use_long_prompt=True
                )

                # Poll for "Prefill Request ID" pattern in prefill worker.
                # The first returned value is used below as the request ID;
                # the second is the log offset to resume polling from.
                request_id, prefill_log_offset = poll_for_pattern(
                    process=prefill_worker,
                    pattern="Prefill Request ID: ",
                    match_type="contains",
                )

                # Poll for start sending KV cache pattern, resuming from the
                # previous offset. The short poll interval is presumably there
                # so the cancel lands while the transfer is still in flight --
                # TODO confirm. The result is not needed afterwards.
                poll_for_pattern(
                    process=prefill_worker,
                    pattern="Start sending KV cache for request ID: ",
                    log_offset=prefill_log_offset,
                    poll_interval_ms=2,
                    match_type="contains",
                )

                # Cancel during KV transfer phase
                cancellable_req.cancel()
                logger.info(
                    f"Cancelled request ID: {request_id} during KV transfer phase"
                )

                # Poll for "Aborted Request ID" in decode worker; keep the
                # offset so the follow-up check only sees newer log lines.
                _, decode_log_offset = poll_for_pattern(
                    process=decode_worker,
                    pattern=f"Aborted Request ID: {request_id}",
                )

                # Verify frontend log has kill message
                poll_for_pattern(
                    process=frontend,
                    pattern="issued control message Kill to sender",
                )

                logger.info(
                    "Completion request cancellation during KV transfer phase detected successfully"
                )

                # Verify the workers are still functional
                cancellable_req = send_cancellable_request("chat_completion_stream")
                poll_for_pattern(
                    process=decode_worker,
                    pattern="Decode Request ID: ",
                    log_offset=decode_log_offset,
                    match_type="contains",
                )
                read_streaming_responses(cancellable_req, expected_count=5)

                logger.info(
                    "Workers are functional after cancellation during KV transfer"
                )
0 commit comments