|
1 | 1 | import asyncio |
2 | | -from concurrent.futures import ThreadPoolExecutor |
3 | 2 | from pathlib import Path |
4 | 3 | from queue import Queue |
5 | 4 | from threading import Event |
@@ -88,9 +87,9 @@ def __init__( |
88 | 87 | self._response_queue = Queue() |
89 | 88 | self.set_result_queue(self._response_queue) |
90 | 89 |
|
91 | | - # Create a thread pool for the fetch_responses_loop_async task to avoid |
92 | | - # being interfered by other tasks such as submit(). |
93 | | - self._fetch_responses_loop_executor = ThreadPoolExecutor(max_workers=1) |
 | 90 | + # Note: we no longer create a persistent ThreadPoolExecutor here,
 | 91 | + # which could leak its worker thread. asyncio.to_thread() instead
 | 92 | + # runs blocking calls on the event loop's default executor.
94 | 93 |
|
95 | 94 | def submit(self, request: GenerationRequest): |
96 | 95 | """ Submits a request to the worker. """ |
@@ -128,11 +127,10 @@ def fetch_responses(self, timeout: Optional[float] = None) -> list: |
128 | 127 |
|
129 | 128 | async def fetch_responses_async(self, |
130 | 129 | timeout: Optional[float] = None) -> list: |
131 | | - # First, await any pending responses without blocking the event loop |
132 | | - loop = asyncio.get_event_loop() |
133 | | - responses = await loop.run_in_executor( |
134 | | - self._fetch_responses_loop_executor, |
135 | | - lambda: self.fetch_responses(timeout=timeout)) |
 | 130 | + # Use asyncio.to_thread so the blocking fetch does not stall the event loop.
 | 131 | + # This mirrors fetch_stats_async and fetch_kv_cache_events_async.
| 132 | + responses = await asyncio.to_thread(self.fetch_responses, |
| 133 | + timeout=timeout) |
136 | 134 | return responses |
137 | 135 |
|
138 | 136 | async def fetch_stats_async(self, timeout: Optional[float] = None) -> list: |
|
0 commit comments