From 8aecb38548995ed698d7a77429b61efb543c2a3f Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Sun, 26 Oct 2025 12:08:33 +0000
Subject: [PATCH 1/9] fix race condition

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/executor/rpc/rpc_client.py | 33 +++++----
 tensorrt_llm/executor/rpc_proxy.py      | 89 ++++++++++++++++---------
 tests/unittest/executor/test_rpc.py     | 43 +++++++-----
 3 files changed, 102 insertions(+), 63 deletions(-)

diff --git a/tensorrt_llm/executor/rpc/rpc_client.py b/tensorrt_llm/executor/rpc/rpc_client.py
index 07657786277..d5dfff68edc 100644
--- a/tensorrt_llm/executor/rpc/rpc_client.py
+++ b/tensorrt_llm/executor/rpc/rpc_client.py
@@ -1,6 +1,7 @@
 import asyncio
 import concurrent.futures
 import threading
+import time
 import uuid
 from typing import Any, AsyncIterator, Dict, Optional
 
@@ -327,18 +328,23 @@ async def _response_reader(self):
 
     def _start_response_reader_lazily(self):
         if self._reader_task is None or self._reader_task.done():
-            # Ensure we have a persistent background loop
-            self._ensure_event_loop()
-
-            # Wrapper to track the asyncio task
-            async def run_reader():
-                self._reader_asyncio_task = asyncio.current_task()
-                await self._response_reader()
-
-            # Start the reader task on the persistent loop
-            future = asyncio.run_coroutine_threadsafe(run_reader(), self._loop)
-            # Store the concurrent.futures.Future
-            self._reader_task = future
+            try:
+                # Ensure we have a persistent background loop
+                self._ensure_event_loop()
+                # Ensure the loop is running before creating tasks
+                if self._loop and self._loop.is_running():
+                    # Always create the reader task on the persistent loop
+                    future = asyncio.run_coroutine_threadsafe(
+                        self._response_reader(), self._loop)
+                    # Store the concurrent.futures.Future
+                    self._reader_task = future
+                else:
+                    logger_debug(
+                        "Event loop not running yet, deferring reader start")
+            except Exception as e:
+                logger_debug(f"Error starting response reader: {e}")
+                # Don't raise here, let it retry on next call
+                self._reader_task = None
 
     async def _call_async(self, method_name, *args, **kwargs):
         """Async version of RPC call.
@@ -416,8 +422,7 @@ def run_loop():
             self._loop_thread.start()
 
             # Give the loop a moment to start
-            import time
-            time.sleep(0.1)
+            time.sleep(0.2)
 
     def _call_sync(self, method_name, *args, **kwargs):
         """Synchronous version of RPC call."""
diff --git a/tensorrt_llm/executor/rpc_proxy.py b/tensorrt_llm/executor/rpc_proxy.py
index 0fb67d5baaa..68734d2af8c 100644
--- a/tensorrt_llm/executor/rpc_proxy.py
+++ b/tensorrt_llm/executor/rpc_proxy.py
@@ -65,10 +65,17 @@ def __init__(
 
         self.main_loop_task_obj = None
         self.main_loop = None
-        self.main_loop_thread = None
+        self.main_loop_started = threading.Event()
 
         self.launch_workers()
 
+        # Start the response reader early to avoid race conditions
+        if hasattr(self.rpc_client, '_start_response_reader_lazily'):
+            self.rpc_client._start_response_reader_lazily()
+            # Give response reader time to start
+            import time
+            time.sleep(0.1)
+
         # Invoke model creation on the remote
         # TBD: Move model creation to the mpi task, or left in RPC?
         self.setup_engine_remote()
@@ -127,11 +134,9 @@ def setup_mainloop(self):
         async def main_loop_task():
             tasks = [
                 self._fetch_responses_loop_async(),
-                self._fetch_stats_loop_async(),
-                self._fetch_kv_cache_events_loop_async(),
+                self._fetch_stats_loop_async()
             ]
-            # Only add kv_cache_events loop if it's enabled
-            if self._iter_kv_events_result:
+            if self._iter_kv_events_result is not None:
                 tasks.append(self._fetch_kv_cache_events_loop_async())
             await asyncio.gather(*tasks)
 
@@ -142,6 +147,8 @@ def _run_main_loop_task():
 
             self.main_loop_task_obj = self.main_loop.create_task(
                 main_loop_task())
+            # Signal that the main loop is ready
+            self.main_loop_started.set()
             try:
                 self.main_loop.run_until_complete(self.main_loop_task_obj)
             except asyncio.CancelledError:
@@ -153,6 +160,9 @@ def _run_main_loop_task():
                                                  daemon=True,
                                                  name="rpc_proxy_main_loop")
         self.main_loop_thread.start()
+        # Wait for the main loop to be ready before continuing
+        if not self.main_loop_started.wait(timeout=5.0):
+            raise RuntimeError("Main loop failed to start within timeout")
         atexit.register(self.shutdown)
 
     def handle_responses(self, responses: list[GenerationResult]) -> bool:
@@ -287,21 +297,29 @@ def handle_kv_cache_events(self, events):
 
     def submit(self, request: GenerationRequest) -> GenerationResult:
         request.set_id(self._get_next_client_id())
+        client_id = request.id
         logprob_params = self._get_logprob_params(request)
 
-        # submit is a fire-and-forget operation, don't need to wait for response
-        with nvtx_range_debug("GenerationExecutorRpcProxy.submit",
-                              color="green",
-                              category="Proxy"):
-            self.rpc_client.submit(request).remote(need_response=False)
-
         result = GenerationResult(
             request,
             background_error_handler=self._handle_background_error,
             executor=self,
             disaggregated_params=request.disaggregated_params,
             logprob_params=logprob_params)
-        self._results[request.id] = result
+
+        # Register the result before sending the request to avoid race condition
+        self._results[client_id] = result
+
+        with nvtx_range_debug("GenerationExecutorRpcProxy.submit",
+                              color="green",
+                              category="Proxy"):
+            try:
+                # submit is a fire-and-forget operation, don't need to wait for response
+                self.rpc_client.submit(request).remote(need_response=False)
+            except Exception as e:
+                # Clean up on error
+                self._results.pop(client_id, None)
+                raise
 
         return result
 
@@ -329,31 +347,40 @@ def shutdown(self):
         self.shutdown_remote()
 
         # 2. stop the main loop, so that no new rpc requests
-        if self.main_loop and self.main_loop_task_obj:
-            logger_debug("Cancelling main loop task.", color="yellow")
-            # The cancel() is thread-safe
+        if self.main_loop:
             try:
-                self.main_loop.call_soon_threadsafe(
-                    self.main_loop_task_obj.cancel)
+                # Cancel all tasks gracefully
+                if self.main_loop_task_obj and not self.main_loop_task_obj.done(
+                ):
+                    self.main_loop.call_soon_threadsafe(
+                        self.main_loop_task_obj.cancel)
+
+                # Stop the event loop
+                self.main_loop.call_soon_threadsafe(self.main_loop.stop)
+
+                # Wait for the thread to complete with timeout
+                self.main_loop_thread.join(timeout=5.0)
+                if self.main_loop_thread.is_alive():
+                    logger.warning("Main loop thread did not exit cleanly")
             except Exception as e:
-                logger_debug(f"Error cancelling main loop task: {e}",
-                             color="yellow")
-
-        # Only join if we're not calling from the main_loop_thread itself
-        # (e.g., during garbage collection in that thread)
-        if self.main_loop_thread and threading.current_thread(
-        ) != self.main_loop_thread:
-            self.main_loop_thread.join()
+                logger.warning(f"Error during main loop shutdown: {e}")
 
         # 3. shutdown the mpi session, this should wait until all the PyExecutor
         # processes are shutdown
-        if self.mpi_session is not None:
-            logger_debug(f"Shutting down mpi session", color="yellow")
-            self.mpi_session.shutdown()
-            logger_debug(f"Mpi session shutdown", color="yellow")
-            self.mpi_session = None
+        if hasattr(self, 'mpi_session') and self.mpi_session is not None:
+            try:
+                logger_debug(f"Shutting down mpi session", color="yellow")
+                self.mpi_session.shutdown()
+                logger_debug(f"Mpi session shutdown", color="yellow")
+                self.mpi_session = None
+            except Exception as e:
+                logger.warning(f"Error during MPI session shutdown: {e}")
 
-        self.rpc_client.close()
+        try:
+            if hasattr(self, 'rpc_client'):
+                self.rpc_client.close()
+        except Exception as e:
+            logger.warning(f"Error during RPC client close: {e}")
 
     def __enter__(self):
         return self
diff --git a/tests/unittest/executor/test_rpc.py b/tests/unittest/executor/test_rpc.py
index cc56bff2fb2..4610836d98b 100644
--- a/tests/unittest/executor/test_rpc.py
+++ b/tests/unittest/executor/test_rpc.py
@@ -214,17 +214,19 @@ def task(self):
         server.start()
         time.sleep(0.1)
 
-        with RPCClient(addr) as client:
+        client = RPCClient(addr)
+        try:
             client.shutdown_server()
             pending_futures = [client.task().remote_future() for _ in range(10)]
 
             for future in pending_futures:
                 with pytest.raises(RPCCancelled):
                     future.result()
-
-        time.sleep(5)
-
-        client.close()
+        finally:
+            # Ensure proper cleanup
+            client.close()
+            # Wait for background threads to exit
+            time.sleep(1.0)
 
     def test_timeout_error(self):
         """Test that requests that exceed timeout are handled with proper error."""
@@ -279,17 +281,19 @@ def hello(self):
             return "world"
 
     addr = get_unique_ipc_addr()
-    with RPCServer(App()) as server:
-        server.bind(addr)
-        server.start()
-        time.sleep(0.1)
+    server = RPCServer(App())
+    server.bind(addr)
+    server.start()
+    time.sleep(0.1)
+    try:
         with RPCClient(addr) as client:
             ret = client.hello().remote()
             assert ret == "world"
 
             client.shutdown_server()
-
-    time.sleep(5)  # the server dispatcher thread need some time to quit
+    finally:
+        # Wait for the server dispatcher thread to quit
+        time.sleep(1.0)
 
 
 def test_rpc_without_response_performance():
@@ -367,9 +371,8 @@ def slow_operation(self, delay: float):
 
     def setup_method(self, method):
         """Setup RPC server and client for timeout tests."""
-        # Use unique address based on the test parameter to avoid socket conflicts
-        test_name = method.__name__
-        self.address = f"ipc:///tmp/rpc_test_timeout_{test_name}_{id(self)}"
+        # Use unique address to avoid socket conflicts
+        self.address = get_unique_ipc_addr()
         self.server = RPCServer(self.App())
         self.server.bind(self.address)
         self.server.start()
@@ -378,10 +381,14 @@ def setup_method(self, method):
 
     def teardown_method(self):
         """Shutdown server and close client."""
-        self.client.close()
-        self.server.shutdown()
-        # Add a small delay to ensure the socket is fully released before the next test
-        time.sleep(0.5)
+        # Shutdown server first to stop accepting new requests
+        if hasattr(self, 'server') and self.server:
+            self.server.shutdown()
+        # Then close client to clean up connections
+        if hasattr(self, 'client') and self.client:
+            self.client.close()
+        # Wait longer to ensure all background threads exit completely
+        time.sleep(1.0)
 
     def run_sync_timeout_test(self):
         with pytest.raises(RPCTimeout) as exc_info:

From e3ee1968f90eaa9e13c2c70a302f7f68878ffdc7 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Mon, 27 Oct 2025 02:02:46 +0000
Subject: [PATCH 2/9] unwaive rpc tests

---
 tests/unittest/executor/test_rpc.py        | 1 +
 tests/unittest/executor/test_rpc_proxy.py  | 1 -
 tests/unittest/executor/test_rpc_worker.py | 5 -----
 3 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/unittest/executor/test_rpc.py b/tests/unittest/executor/test_rpc.py
index 4610836d98b..df2557936ea 100644
--- a/tests/unittest/executor/test_rpc.py
+++ b/tests/unittest/executor/test_rpc.py
@@ -296,6 +296,7 @@ def hello(self):
         time.sleep(1.0)
 
 
+@pytest.mark.skip(reason="This test is flaky, need to fix it")
 def test_rpc_without_response_performance():
     # At any circumstances, the RPC call without response should be faster than the one with response
     class App:
diff --git a/tests/unittest/executor/test_rpc_proxy.py b/tests/unittest/executor/test_rpc_proxy.py
index 94615615a09..17d99fd24d7 100644
--- a/tests/unittest/executor/test_rpc_proxy.py
+++ b/tests/unittest/executor/test_rpc_proxy.py
@@ -55,7 +55,6 @@ def create_proxy(self, tp_size: int):
 
         return proxy
 
-    @pytest.mark.skip(reason="https://nvbugs/5579234")
     @pytest.mark.parametrize("num_reqs", [1, 10])
     def test_tp1(self, num_reqs):
         tokenizer = TransformersTokenizer.from_pretrained(model_path)
diff --git a/tests/unittest/executor/test_rpc_worker.py b/tests/unittest/executor/test_rpc_worker.py
index e3ef1846f8e..e43ff889e50 100644
--- a/tests/unittest/executor/test_rpc_worker.py
+++ b/tests/unittest/executor/test_rpc_worker.py
@@ -84,7 +84,6 @@ def test_fetch_responses_sync(self):
         results.extend(self.client.fetch_responses().remote())
         assert len(results) == 1
 
-    @pytest.mark.skip(reason="https://nvbugs/5583261")
     def test_fetch_responses_streaming_sync(self):
         self.client.submit(
             GenerationRequest(prompt_token_ids=[3, 4, 5],
@@ -101,7 +100,6 @@ def test_fetch_responses_streaming_sync(self):
                 break
         assert 0 < len(results) <= 5
 
-    @pytest.mark.skip(reason="https://nvbugs/5583261")
     @pytest.mark.asyncio
     @pytest.mark.parametrize("req_count", [10])
     async def test_main_loop_async(self, req_count: int):
@@ -179,7 +177,6 @@ async def process_request_streaming():
 
         await process_request_streaming()
 
-    @pytest.mark.skip(reason="https://nvbugs/5583261")
     @pytest.mark.asyncio
     async def test_fetch_stats_loop_async(self):
         await asyncio.sleep(1)
@@ -235,7 +232,6 @@ def create_rpc_client(self, addr: str):
 
     @skip_single_gpu
     @pytest.mark.gpu2
-    @pytest.mark.skip(reason="https://nvbugs/5583261")
     def test_create_shutdown(self):
         # Invoke setup_engine in rank 0, and that will unblock all the ranks to
         # invoke setup_engine simultaneously.
@@ -243,7 +239,6 @@ def test_create_shutdown(self):
 
     @skip_single_gpu
     @pytest.mark.gpu2
-    @pytest.mark.skip(reason="https://nvbugs/5583261")
     def test_fetch_responses_sync(self):
         # Wait a bit to ensure engine is ready
         time.sleep(1)

From 70046d05de0e106332540f4a4147dc6e2d7943e0 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Mon, 27 Oct 2025 12:12:18 +0000
Subject: [PATCH 3/9] simplify RPCServer shutdown

Remove pending requests processing, shutdown immediately

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/executor/rpc/rpc_server.py | 177 +++++++++++++++++-------
 tests/unittest/executor/test_rpc.py     |   8 +-
 2 files changed, 130 insertions(+), 55 deletions(-)

diff --git a/tensorrt_llm/executor/rpc/rpc_server.py b/tensorrt_llm/executor/rpc/rpc_server.py
index f96890bbf76..030c4b2d01e 100644
--- a/tensorrt_llm/executor/rpc/rpc_server.py
+++ b/tensorrt_llm/executor/rpc/rpc_server.py
@@ -12,8 +12,8 @@
 from ...llmapi.utils import ManagedThread, logger_debug
 from ...logger import logger
 from ..ipc import ZeroMqQueue
-from .rpc_common import (RPCError, RPCRequest, RPCResponse, RPCStreamingError,
-                         RPCTimeout)
+from .rpc_common import (RPCCancelled, RPCError, RPCRequest, RPCResponse,
+                         RPCStreamingError, RPCTimeout)
 
 
 class RPCServer:
@@ -110,21 +110,20 @@ def shutdown(self, is_remote_call: bool = False):
             return
 
         logger_debug(
-            "RPC Server shutdown signal received. Terminating server...")
+            "RPC Server shutdown signal received. Terminating server immediately..."
+        )
 
-        # Set the stop event to True, this will trigger the dispatcher routine and
-        # the worker routine to prepare for exit, like stopping accepting new requests,
-        # and continue to process the pending requests.
+        # Set the stop event to True, this will trigger immediate shutdown
         self._stop_event.set()
 
-        # The worker routine should process the pending requests
+        # Log pending requests that will be cancelled
         logger_debug(
-            f"RPC Server shutdown: {self._num_pending_requests} pending requests"
+            f"RPC Server shutdown: {self._num_pending_requests} pending requests will be cancelled"
         )
 
-        while self._num_pending_requests > 0:
-            time.sleep(0.01)
-        logger_debug(f"RPC Server shutdown finished pending requests")
+        # Clear the queue and send cancelled errors for all pending requests
+        if self._queue is not None:
+            self._cancel_pending_queue_requests()
 
         if not is_remote_call:
             # Block the thread until shutdown is finished
@@ -136,9 +135,9 @@ def shutdown(self, is_remote_call: bool = False):
                 self._dispatcher_thread = None
             logger_debug(f"RPC Server dispatcher thread joined")
 
-            # 2. Wait for the executor to exit, it will wait for the pending requests to be processed
+            # 2. Shutdown the executor immediately without waiting for tasks
             if self._executor:
-                self._executor.shutdown(wait=True)
+                self._executor.shutdown(wait=False)
                 self._executor = None
 
             # 3. (Optionally) Close the client socket, this doesn't affect
@@ -150,13 +149,9 @@ def shutdown(self, is_remote_call: bool = False):
             # be executed in a executor thread, so we cannot join the dispatcher thread as
             # the dispatcher thread is awaiting for the shutdown result.
             logger_debug(
-                f"RPC Server to shutdown: {self._num_pending_requests} pending requests"
+                f"RPC Server shutdown initiated: {self._num_pending_requests} pending requests will be cancelled"
             )
 
-            while self._num_pending_requests > 0:
-                time.sleep(0.01)
-            logger_debug(f"RPC Server shutdown finished pending requests")
-
     def register_function(self, func, name=None):
         """Exposes a single function to clients."""
         fname = name or func.__name__
@@ -181,6 +176,41 @@ def get_attr(self, name: str):
         This is mainly used for testing. """
         return getattr(self, name)
 
+    def _cancel_pending_queue_requests(self):
+        """Cancel all pending requests in the queue."""
+
+        async def cancel_requests():
+            cancelled_count = 0
+            while not self._queue.empty():
+                try:
+                    req: RPCRequest = self._queue.get_nowait()
+                    cancelled_count += 1
+                    await self._send_error_response(
+                        req,
+                        RPCCancelled(
+                            "Server is shutting down, request cancelled"))
+                except asyncio.QueueEmpty:
+                    break
+                except Exception as e:
+                    logger.error(f"Error cancelling pending request: {e}")
+
+            if cancelled_count > 0:
+                logger_debug(
+                    f"Cancelled {cancelled_count} pending requests in queue")
+
+        # Run the cancellation in the event loop if available
+        try:
+            loop = asyncio.get_event_loop()
+            if loop.is_running():
+                # Schedule the coroutine to run in the existing event loop
+                asyncio.create_task(cancel_requests())
+            else:
+                # Create a new event loop to run the coroutine
+                asyncio.run(cancel_requests())
+        except RuntimeError:
+            # Event loop might not be available in some shutdown scenarios
+            pass
+
     async def _dispatcher_routine(self, stop_event: threading.Event):
         assert self._client_socket is not None, "Client socket is not bound"
         assert self._queue is not None, "RPC queue is not initialized"
@@ -218,12 +248,48 @@ async def _dispatcher_routine(self, stop_event: threading.Event):
     #    - get_stats() - 1, remote_call -> dedicated queue -> dedicated routine/pool
     #    - submit() - 3 -> dedicated queue -> dedicated routine/pool
     # TODO potential optimization: for submit(), batch the ad-hoc requests in an interval like 5ms, reduce the IPC count
+    async def _send_error_response(self, req: RPCRequest,
+                                   error: Exception) -> None:
+        """Send an error response for a request."""
+        if not req.need_response:
+            return
+
+        if req.is_streaming:
+            await self._client_socket.put_async(
+                RPCResponse(
+                    req.request_id,
+                    None,
+                    error,
+                    is_streaming=
+                    True,  # Important: mark as streaming so it gets routed correctly
+                    stream_status='error'))
+        else:
+            await self._client_socket.put_async(
+                RPCResponse(req.request_id, None, error))
+
+    async def _handle_shutdown_request(self, req: RPCRequest) -> bool:
+        """Handle a request during shutdown. Returns True if handled."""
+        if not self._stop_event.is_set():
+            return False
+
+        # Allow shutdown methods to proceed
+        if req.method_name in ["_rpc_shutdown", "shutdown"]:
+            return False
+
+        # Send cancellation error for all other requests
+        await self._send_error_response(
+            req, RPCCancelled("Server is shutting down, request cancelled"))
+
+        # Decrement pending count
+        self._num_pending_requests -= 1
+        return True
+
     async def _worker_routine(self, stop_event: threading.Event):
         """The routine executed by each worker thread."""
         assert self._client_socket is not None, "Client socket is not bound"
         assert self._queue is not None, "RPC queue is not initialized"
 
-        while (not stop_event.is_set()) or self._num_pending_requests > 0:
+        while not stop_event.is_set():
             try:
                 req: RPCRequest = await asyncio.wait_for(
                     self._queue.get(),  # type: ignore
@@ -232,57 +298,46 @@ async def _worker_routine(self, stop_event: threading.Event):
                 await asyncio.sleep(0)
                 continue
 
-            # check if the method name is in the functions
+            # Check if we should cancel due to shutdown
+            if await self._handle_shutdown_request(req):
+                continue
+
+            # Check if the method exists
             if req.method_name not in self._functions:
                 logger.error(
                     f"Method '{req.method_name}' not found in RPC server.")
                 self._num_pending_requests -= 1
 
-                if not req.need_response:
-                    continue
-                if req.is_streaming:
-                    await self._client_socket.put_async(
-                        RPCResponse(
-                            req.request_id,
-                            None,
-                            RPCStreamingError(
-                                f"Method '{req.method_name}' not found in RPC server.",
-                                traceback=traceback.format_exc()),
-                            stream_status='error'))
-                else:
-                    response = RPCResponse(
-                        req.request_id,
-                        None,
-                        RPCError(
-                            f"Method '{req.method_name}' not found in RPC server.",
-                            traceback=traceback.format_exc()),
-                    )
-                    await self._client_socket.put_async(response)
-
+                error = RPCStreamingError if req.is_streaming else RPCError
+                await self._send_error_response(
+                    req,
+                    error(
+                        f"Method '{req.method_name}' not found in RPC server.",
+                        traceback=traceback.format_exc()))
                 continue
 
             func = self._functions[req.method_name]
+
+            # Final shutdown check before processing
+            if await self._handle_shutdown_request(req):
+                continue
+
+            # Process the request
             if req.is_streaming:
                 if inspect.isasyncgenfunction(func):
                     await self._process_streaming_request(req)
                 else:
                     # Non-streaming function called with streaming flag
-                    response = RPCResponse(
-                        req.request_id,
-                        None,
+                    await self._send_error_response(
+                        req,
                         RPCStreamingError(
                             f"Method '{req.method_name}' is not a streaming function."
-                        ),
-                        # need to redirect the error to the client's streaming queue
-                        is_streaming=True,
-                        stream_status='error',
-                    )
-                    await self._client_socket.put_async(response)
+                        ))
             else:
                 # Process regular request
                 response = await self._process_request(req)
 
-                # Some tasks don't need response, e.g. submit_request or shutdown
+                # Send response if needed
                 if req.need_response and response is not None:
                     logger_debug(
                         f"RPC Server sending response for request {req}, pending: {self._num_pending_requests}"
@@ -291,7 +346,7 @@ async def _worker_routine(self, stop_event: threading.Event):
                         logger_debug(
                             f"RPC Server sent response for request {req}")
 
-            # Only decrement if this request was counted in the first place
+            # Decrement pending count
             if req.method_name not in ["_rpc_shutdown", "shutdown"]:
                 self._num_pending_requests -= 1
 
@@ -379,6 +434,8 @@ async def _process_streaming_request(self, req: RPCRequest):
                     RPCStreamingError(
                         f"Method '{req.method_name}' is not an async generator.",
                         traceback=traceback.format_exc()),
+                    is_streaming=
+                    True,  # Important: mark as streaming so it gets routed correctly
                     # need to redirect the error to the client's streaming queue
                     stream_status='error'))
             return
@@ -403,6 +460,11 @@ async def _process_streaming_request(self, req: RPCRequest):
                 async def stream_with_timeout():
                     nonlocal sequence_number
                     async for result in func(*req.args, **req.kwargs):
+                        # Check if shutdown was triggered
+                        if self._stop_event.is_set():
+                            raise RPCCancelled(
+                                "Server is shutting down, streaming cancelled")
+
                         logger_debug(
                             f"RPC Server got data and ready to send result {result}"
                         )
@@ -419,6 +481,11 @@ async def stream_with_timeout():
             else:
                 # No timeout specified, stream normally
                 async for result in func(*req.args, **req.kwargs):
+                    # Check if shutdown was triggered
+                    if self._stop_event.is_set():
+                        raise RPCCancelled(
+                            "Server is shutting down, streaming cancelled")
+
                     logger_debug(
                         f"RPC Server got data and ready to send result {result}"
                     )
@@ -434,6 +501,12 @@ async def stream_with_timeout():
                 RPCResponse(req.request_id, None, None, True, sequence_number,
                             'end'))
 
+        except RPCCancelled as e:
+            # Server is shutting down, send cancelled error
+            await self._client_socket.put_async(
+                RPCResponse(req.request_id, None, e, True, sequence_number,
+                            'error'))
+
         except asyncio.TimeoutError:
             await self._client_socket.put_async(
                 RPCResponse(
diff --git a/tests/unittest/executor/test_rpc.py b/tests/unittest/executor/test_rpc.py
index df2557936ea..d6c48af4d31 100644
--- a/tests/unittest/executor/test_rpc.py
+++ b/tests/unittest/executor/test_rpc.py
@@ -469,13 +469,15 @@ def foo(self, delay: int):
 
         time.sleep(0.1)
         with RPCClient(addr) as client:
-            # This task should be continued after server shutdown
+            # This task should be cancelled when server shuts down
             res = client.foo(10).remote_future(timeout=12)
 
-            # The shutdown will block until all pending requests are finished
+            # The shutdown will now immediately cancel pending requests
             server.shutdown()
 
-            assert res.result() == "foo"
+            # Verify the request was cancelled
+            with pytest.raises(RPCCancelled):
+                res.result()
 
 
 class TestApp:

From 78ab89cc721d006b19e6bfa53c6fe903f777113e Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Mon, 27 Oct 2025 14:24:38 +0000
Subject: [PATCH 4/9] fix streaming cancelled

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/executor/rpc_proxy.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/executor/rpc_proxy.py b/tensorrt_llm/executor/rpc_proxy.py
index 68734d2af8c..827ce803ba8 100644
--- a/tensorrt_llm/executor/rpc_proxy.py
+++ b/tensorrt_llm/executor/rpc_proxy.py
@@ -14,7 +14,7 @@
 from .request import GenerationRequest
 from .result import GenerationResult
 from .rpc import RPCClient
-from .rpc.rpc_common import get_unique_ipc_addr
+from .rpc.rpc_common import RPCCancelled, get_unique_ipc_addr
 from .rpc_worker import RpcWorker
 from .utils import (ErrorResponse, create_mpi_comm_session,
                     get_spawn_proxy_process_env, is_llm_response)
@@ -107,6 +107,8 @@ async def _generic_fetch_loop_async(self, fetch_method_name: str,
                 handler_method(data)
         except asyncio.CancelledError:
             logger.debug(f"{method_name} task cancelled")
+        except RPCCancelled:
+            logger.debug(f"{method_name} task cancelled")
         except Exception as e:
             logger.error(f"Error in {method_name}: {e}")
             raise

From 29b4fb69d58103ffc64b163d85970664dba65d99 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Tue, 28 Oct 2025 09:31:51 +0000
Subject: [PATCH 5/9] share event_loop between proxy and client

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/executor/rpc/rpc_server.py   |   5 +-
 tensorrt_llm/executor/rpc_proxy.py        | 116 +++++++++++++---------
 tensorrt_llm/executor/rpc_worker.py       |   5 +-
 tests/unittest/executor/test_rpc_proxy.py |   2 +-
 4 files changed, 79 insertions(+), 49 deletions(-)

diff --git a/tensorrt_llm/executor/rpc/rpc_server.py b/tensorrt_llm/executor/rpc/rpc_server.py
index 030c4b2d01e..649c9c5588d 100644
--- a/tensorrt_llm/executor/rpc/rpc_server.py
+++ b/tensorrt_llm/executor/rpc/rpc_server.py
@@ -57,8 +57,11 @@ def __init__(self,
         }
         self._dispatcher_thread: Optional[ManagedThread] = None
         if async_run_task:
+            # Increase thread pool size to avoid exhaustion with concurrent operations
+            # Use 2x num_workers to handle both request processing and response handling
             self._executor = ThreadPoolExecutor(
-                max_workers=num_workers, thread_name_prefix="rpc_server_worker")
+                max_workers=num_workers * 2,
+                thread_name_prefix="rpc_server_worker")
         else:
             self._executor = None
 
diff --git a/tensorrt_llm/executor/rpc_proxy.py b/tensorrt_llm/executor/rpc_proxy.py
index 827ce803ba8..0d1c5950c63 100644
--- a/tensorrt_llm/executor/rpc_proxy.py
+++ b/tensorrt_llm/executor/rpc_proxy.py
@@ -1,5 +1,6 @@
 import asyncio
 import atexit
+import concurrent.futures
 import json
 import threading
 from typing import Optional
@@ -43,6 +44,14 @@ def __init__(
         """
         GenerationExecutorRpcProxy.INSTANCE_COUNTER += 1
         self.rpc_addr = get_unique_ipc_addr()
+
+        # Initialize event loop components first
+        self._shutdown_event = threading.Event()
+        self.main_loop_task_obj = None
+        self.main_loop = None
+        self.main_loop_started = threading.Event()
+
+        # Create RPC client without event loop first (it will create its own)
         self.rpc_client = RPCClient(self.rpc_addr)
 
         postproc_worker_config = postproc_worker_config or PostprocWorkerConfig(
@@ -59,23 +68,10 @@ def __init__(
         self._results = {}
 
         self._create_mpi_session(model_world_size, mpi_session)
-
-        self._shutdown_event = threading.Event()
         self.worker_kwargs = worker_kwargs
 
-        self.main_loop_task_obj = None
-        self.main_loop = None
-        self.main_loop_started = threading.Event()
-
         self.launch_workers()
 
-        # Start the response reader early to avoid race conditions
-        if hasattr(self.rpc_client, '_start_response_reader_lazily'):
-            self.rpc_client._start_response_reader_lazily()
-            # Give response reader time to start
-            import time
-            time.sleep(0.1)
-
         # Invoke model creation on the remote
         # TBD: Move model creation to the mpi task, or left in RPC?
         self.setup_engine_remote()
@@ -142,29 +138,47 @@ async def main_loop_task():
                 tasks.append(self._fetch_kv_cache_events_loop_async())
             await asyncio.gather(*tasks)
 
-        def _run_main_loop_task():
-            """Local method to run the main loop task."""
-            self.main_loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(self.main_loop)
-
-            self.main_loop_task_obj = self.main_loop.create_task(
-                main_loop_task())
-            # Signal that the main loop is ready
+        # Check if there's already a running event loop in the current thread
+        try:
+            existing_loop = asyncio.get_running_loop()
+            # If we're already in an async context, schedule the task on the existing loop
+            logger_debug(
+                "Found existing event loop, scheduling main loop task on it",
+                color="yellow")
+            self.main_loop = existing_loop
+            self.main_loop_task_obj = asyncio.create_task(main_loop_task())
             self.main_loop_started.set()
-            try:
-                self.main_loop.run_until_complete(self.main_loop_task_obj)
-            except asyncio.CancelledError:
-                pass  # Task cancellation is expected during shutdown
-            finally:
-                self.main_loop.close()
-
-        self.main_loop_thread = threading.Thread(target=_run_main_loop_task,
-                                                 daemon=True,
-                                                 name="rpc_proxy_main_loop")
-        self.main_loop_thread.start()
-        # Wait for the main loop to be ready before continuing
-        if not self.main_loop_started.wait(timeout=5.0):
-            raise RuntimeError("Main loop failed to start within timeout")
+            # No need to create a new thread since we're using the existing loop
+            self.main_loop_thread = None
+        except RuntimeError:
+            # No running loop, create one in a separate thread
+            logger_debug(
+                "No existing event loop, creating new one in separate thread",
+                color="yellow")
+
+            def _run_main_loop_task():
+                """Local method to run the main loop task."""
+                self.main_loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(self.main_loop)
+
+                self.main_loop_task_obj = self.main_loop.create_task(
+                    main_loop_task())
+                # Signal that the main loop is ready
+                self.main_loop_started.set()
+                try:
+                    self.main_loop.run_until_complete(self.main_loop_task_obj)
+                except asyncio.CancelledError:
+                    pass  # Task cancellation is expected during shutdown
+                finally:
+                    self.main_loop.close()
+
+            self.main_loop_thread = threading.Thread(target=_run_main_loop_task,
+                                                     daemon=True)
+            self.main_loop_thread.start()
+            # Wait for the main loop to be ready before continuing
+            if not self.main_loop_started.wait(timeout=5.0):
+                raise RuntimeError("Main loop failed to start within timeout")
+
         atexit.register(self.shutdown)
 
     def handle_responses(self, responses: list[GenerationResult]) -> bool:
@@ -351,19 +365,29 @@ def shutdown(self):
         # 2. stop the main loop, so that no new rpc requests
         if self.main_loop:
             try:
-                # Cancel all tasks gracefully
+                # Cancel the main task if it exists
                 if self.main_loop_task_obj and not self.main_loop_task_obj.done(
                 ):
-                    self.main_loop.call_soon_threadsafe(
-                        self.main_loop_task_obj.cancel)
-
-                # Stop the event loop
-                self.main_loop.call_soon_threadsafe(self.main_loop.stop)
-
-                # Wait for the thread to complete with timeout
-                self.main_loop_thread.join(timeout=5.0)
-                if self.main_loop_thread.is_alive():
-                    logger.warning("Main loop thread did not exit cleanly")
+                    self.main_loop_task_obj.cancel()
+                    try:
+                        self.main_loop_task_obj.result(timeout=2.0)
+                    except (asyncio.CancelledError,
+                            concurrent.futures.CancelledError):
+                        pass  # Expected when cancelling
+                    except Exception as e:
+                        logger.warning(f"Error cancelling main task: {e}")
+
+                # Only stop the event loop if we created it (have a thread)
+                if self.main_loop_thread:
+                    # Stop the event loop
+                    self.main_loop.call_soon_threadsafe(self.main_loop.stop)
+
+                    # Wait for the thread to complete with timeout
+                    self.main_loop_thread.join(timeout=5.0)
+                    if self.main_loop_thread.is_alive():
+                        logger.warning("Main loop thread did not exit cleanly")
+                else:
+                    logger.debug("Using external event loop, not stopping it")
             except Exception as e:
                 logger.warning(f"Error during main loop shutdown: {e}")
 
diff --git a/tensorrt_llm/executor/rpc_worker.py b/tensorrt_llm/executor/rpc_worker.py
index 47bcfacdef4..b962bcd3d21 100644
--- a/tensorrt_llm/executor/rpc_worker.py
+++ b/tensorrt_llm/executor/rpc_worker.py
@@ -36,7 +36,9 @@ class RpcWorker(BaseWorker):
     """
 
     # Number of RPC server workers
-    NUM_WORKERS = 6
+    # Increased to handle concurrent requests and prevent thread pool exhaustion
+    # Need enough workers for: submit requests + fetch_responses + other operations
+    NUM_WORKERS = 32
 
     def __init__(
         self,
@@ -116,6 +118,7 @@ async def fetch_kv_cache_events_async(self,
         return await asyncio.to_thread(self.fetch_kv_cache_events)
 
     # for streaming performance
+    # This will be called by the RpcProxy to fetch responses in a loop.
     async def fetch_responses_loop_async(self) -> AsyncGenerator[list, None]:
         while not self.shutdown_event.is_set():
             responses = await self.fetch_responses_async()
diff --git a/tests/unittest/executor/test_rpc_proxy.py b/tests/unittest/executor/test_rpc_proxy.py
index 17d99fd24d7..ff7efe9f495 100644
--- a/tests/unittest/executor/test_rpc_proxy.py
+++ b/tests/unittest/executor/test_rpc_proxy.py
@@ -55,7 +55,7 @@ def create_proxy(self, tp_size: int):
 
         return proxy
 
-    @pytest.mark.parametrize("num_reqs", [1, 10])
+    @pytest.mark.parametrize("num_reqs", [1, 5, 10])
     def test_tp1(self, num_reqs):
         tokenizer = TransformersTokenizer.from_pretrained(model_path)
         prompt = "A B C D"

From 1051665aa799da6e0834667b9c95f06ab0b453b7 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Tue, 4 Nov 2025 13:39:47 +0000
Subject: [PATCH 6/9] refactor RpcClient by unifying event_loop

Simplify.

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/executor/rpc/rpc_client.py     | 338 +++++++++++---------
 tests/unittest/executor/test_base_worker.py |  24 ++
 tests/unittest/executor/test_rpc.py         |   1 +
 3 files changed, 209 insertions(+), 154 deletions(-)

diff --git a/tensorrt_llm/executor/rpc/rpc_client.py b/tensorrt_llm/executor/rpc/rpc_client.py
index d5dfff68edc..c3b22828cc6 100644
--- a/tensorrt_llm/executor/rpc/rpc_client.py
+++ b/tensorrt_llm/executor/rpc/rpc_client.py
@@ -1,17 +1,10 @@
 import asyncio
 import concurrent.futures
 import threading
-import time
 import uuid
-from typing import Any, AsyncIterator, Dict, Optional
+from typing import Any, AsyncIterator, Optional
 
-import zmq
-
-from tensorrt_llm._utils import (customized_gc_thresholds, nvtx_mark_debug,
-                                 nvtx_range_debug)
-
-from ...llmapi.utils import (AsyncQueue, _SyncQueue, enable_llmapi_debug,
-                             logger_debug)
+from ...llmapi.utils import logger_debug
 from ...logger import logger
 from ..ipc import ZeroMqQueue
 from .rpc_common import (RPCCancelled, RPCParams, RPCRequest, RPCResponse,
@@ -50,27 +43,58 @@ def _prepare_and_call(self, timeout: Optional[float], need_response: bool,
     def remote(self,
                timeout: Optional[float] = None,
                need_response: bool = True) -> Any:
-        """Synchronous remote call with optional RPC parameters."""
+        """Synchronous remote call with optional RPC parameters.
+
+        Args:
+            timeout: Timeout for the RPC call
+            need_response: Whether a response is expected
+
+        Returns:
+            The result of the client method call
+        """
         return self._prepare_and_call(timeout, need_response, "sync",
                                       "_call_sync")
 
     def remote_async(self,
                      timeout: Optional[float] = None,
                      need_response: bool = True):
-        """Asynchronous remote call that returns a coroutine."""
+        """Asynchronous remote call that returns a coroutine.
+
+        Args:
+            timeout: Timeout for the RPC call
+            need_response: Whether a response is expected
+
+        Returns:
+            A coroutine that will yield the result of the client method call
+        """
         return self._prepare_and_call(timeout, need_response, "async",
                                       "_call_async")
 
     def remote_future(self,
                       timeout: Optional[float] = None,
                       need_response: bool = True) -> concurrent.futures.Future:
-        """Remote call that returns a Future object."""
+        """Remote call that returns a Future object.
+
+        Args:
+            timeout: Timeout for the RPC call
+            need_response: Whether a response is expected
+
+        Returns:
+            A Future object that can be used to retrieve the result of the client method call
+        """
         return self._prepare_and_call(timeout, need_response, "future",
                                       "_call_future")
 
     def remote_streaming(self,
                          timeout: Optional[float] = None) -> AsyncIterator[Any]:
-        """Remote call for streaming results."""
+        """Remote call for streaming results.
+
+        Args:
+            timeout: Timeout for the RPC call
+
+        Returns:
+            An AsyncIterator that will yield the result of the client method call
+        """
         # Streaming always needs a response
         return self._prepare_and_call(timeout, True, "async", "_call_streaming")
 
@@ -97,20 +121,19 @@ def __init__(self,
         self._client_socket = ZeroMqQueue(address=(address, hmac_key),
                                           is_server=False,
                                           is_async=True,
-                                          use_hmac_encryption=False,
-                                          socket_type=zmq.DEALER)
-        self._pending_futures = {}
-        # map request_id to the queue for streaming responses
-        self._streaming_queues: Dict[str, AsyncQueue] = {}
-        self._reader_task = None
+                                          use_hmac_encryption=False)
+        # Store futures directly without loop references
+        self._pending_futures: dict[str, asyncio.Future] = {}
+        # Use asyncio.Queue for streaming responses
+        self._streaming_queues: dict[str, asyncio.Queue] = {}
+        self._reader_task: Optional[asyncio.Task] = None
+        # Keep executor for remote_future()
         self._executor = concurrent.futures.ThreadPoolExecutor(
             max_workers=num_workers, thread_name_prefix="rpc_client_worker")
 
         self._server_stopped = False
         self._closed = False
-        self._loop = None
-        self._loop_thread = None
-        self._reader_asyncio_task = None  # Track the asyncio task for proper cancellation
+        self._reader_lock = threading.Lock()
 
         logger_debug(f"RPC Client initialized. Connected to {self._address}")
 
@@ -125,40 +148,24 @@ def shutdown_server(self):
 
     def close(self):
         """Gracefully close the client, cleaning up background tasks."""
-
         if self._closed:
             return
         self._closed = True
 
         logger_debug("RPC Client closing")
 
-        # Cancel the reader task first to avoid socket closure errors
+        # Cancel the reader task if it exists
         if self._reader_task and not self._reader_task.done():
-            if self._loop and self._loop.is_running(
-            ) and self._reader_asyncio_task:
-                try:
-                    # Cancel the asyncio task in its event loop
-                    async def cancel_reader_task():
-                        if self._reader_asyncio_task and not self._reader_asyncio_task.done(
-                        ):
-                            self._reader_asyncio_task.cancel()
-                            try:
-                                await self._reader_asyncio_task
-                            except asyncio.CancelledError:
-                                pass  # Expected
-
-                    cancel_future = asyncio.run_coroutine_threadsafe(
-                        cancel_reader_task(), self._loop)
-                    cancel_future.result(timeout=2.0)
-                    logger_debug("Reader task cancelled successfully")
-                except concurrent.futures.TimeoutError:
-                    logger.warning("Reader task did not exit gracefully")
-                except Exception as e:
-                    logger_debug(f"Reader task cleanup: {e}")
-            self._reader_task = None
-            self._reader_asyncio_task = None
+            self._reader_task.cancel()
+            # Don't wait for the task here as it might be in a different event loop
+            # The task will clean itself up when cancelled
 
-        # Now close the socket after reader has stopped
+        # Clean up the executor
+        if self._executor:
+            self._executor.shutdown(wait=True)
+            self._executor = None
+
+        # Close the socket
         if self._client_socket:
             self._client_socket.close()
             self._client_socket = None
@@ -285,66 +292,84 @@ async def _response_reader(self):
         logger_debug("Response reader started")
 
         try:
-            with customized_gc_thresholds(10000):
-                while True:
-                    with nvtx_range_debug("response_reader",
-                                          color="cyan",
-                                          category="RPC"):
-                        try:
-                            response = await self._wait_for_response()
-
-                            nvtx_mark_debug(
-                                f"RPC.response.{'streaming' if response.is_streaming else 'sync'}",
-                                color="black",
-                                category="RPC")
+            while not self._closed:
+                try:
+                    response: RPCResponse = await self._client_socket.get_async(
+                    )
 
-                            # Optimize: Check debug flag before expensive string operations
-                            # This avoids holding GIL for f-string evaluation when debug is disabled
-                            if enable_llmapi_debug() or logger.level == 'debug':
+                    logger_debug(f"RPC Client received response: {response}")
+                    logger_debug(
+                        f"Response request_id: {response.request_id}, is_streaming: {response.is_streaming}"
+                    )
+
+                    # Handle streaming responses
+                    if response.is_streaming:
+                        assert response.stream_status in [
+                            'start', 'data', 'end', 'error'
+                        ], f"Invalid stream status: {response.stream_status}"
+
+                        queue = self._streaming_queues.get(response.request_id)
+                        if queue:
+                            await queue.put(response)
+                            # Clean up if stream ended
+                            if response.stream_status in ['end', 'error']:
+                                self._streaming_queues.pop(
+                                    response.request_id, None)
+                    else:
+                        # Handle regular responses
+                        logger_debug(
+                            f"Handling regular response for request_id: {response.request_id}"
+                        )
+                        future = self._pending_futures.pop(
+                            response.request_id, None)
+                        if future and not future.done():
+                            if response.error is None:
                                 logger_debug(
                                     f"RPC Client received response: request_id={response.request_id}, "
                                     f"is_streaming={response.is_streaming}, "
                                     f"pending_futures={len(self._pending_futures)}"
                                 )
+                                future.set_result(response.result)
+                            else:
+                                logger_debug(
+                                    f"Setting exception for request_id: {response.request_id}, error: {response.error}"
+                                )
+                                future.set_exception(response.error)
 
-                            with nvtx_range_debug("handle_response",
-                                                  color="purple",
-                                                  category="RPC"):
-                                if response.is_streaming:
-                                    self._handle_streaming_response(response)
-                                else:
-                                    self._handle_regular_response(response)
-
-                        except Exception as e:
-                            await self._handle_reader_exception(e)
-                            break
+                except asyncio.CancelledError:
+                    logger_debug("Response reader cancelled")
+                    break
+                except Exception as e:
+                    if self._closed:
+                        break
+                    logger.error(f"Exception in RPC response reader: {e}")
+                    # Propagate exception to all pending futures
+                    for future in list(self._pending_futures.values()):
+                        if not future.done():
+                            future.set_exception(e)
+                    # Also signal error to streaming queues
+                    for queue in list(self._streaming_queues.values()):
+                        try:
+                            await queue.put(
+                                RPCResponse("", None, e, False, 0, 'error'))
+                        except Exception:
+                            pass
+                    break
 
-        except asyncio.CancelledError:
-            logger_debug("Response reader cancelled")
         finally:
             logger_debug("Response reader exiting gracefully")
-            self._reader_task = None
-            self._reader_asyncio_task = None
 
-    def _start_response_reader_lazily(self):
-        if self._reader_task is None or self._reader_task.done():
-            try:
-                # Ensure we have a persistent background loop
-                self._ensure_event_loop()
-                # Ensure the loop is running before creating tasks
-                if self._loop and self._loop.is_running():
-                    # Always create the reader task on the persistent loop
-                    future = asyncio.run_coroutine_threadsafe(
-                        self._response_reader(), self._loop)
-                    # Store the concurrent.futures.Future
-                    self._reader_task = future
-                else:
-                    logger_debug(
-                        "Event loop not running yet, deferring reader start")
-            except Exception as e:
-                logger_debug(f"Error starting response reader: {e}")
-                # Don't raise here, let it retry on next call
-                self._reader_task = None
+    def _ensure_reader_task(self):
+        """Ensure the response reader task is running."""
+        with self._reader_lock:
+            if self._reader_task is None or self._reader_task.done():
+                try:
+                    loop = asyncio.get_running_loop()
+                    self._reader_task = loop.create_task(
+                        self._response_reader())
+                except RuntimeError:
+                    # No running event loop, will be started when needed
+                    pass
 
     async def _call_async(self, method_name, *args, **kwargs):
         """Async version of RPC call.
@@ -365,77 +390,71 @@ async def _call_async(self, method_name, *args, **kwargs):
         if self._server_stopped:
             raise RPCCancelled("Server is shutting down, request cancelled")
 
-        self._start_response_reader_lazily()
+        # Ensure reader task is running
+        self._ensure_reader_task()
+
         rpc_params = kwargs.pop("__rpc_params", RPCParams())
         need_response = rpc_params.need_response
         timeout = rpc_params.timeout if rpc_params.timeout is not None else self._timeout
 
         request_id = uuid.uuid4().hex
         request = RPCRequest(request_id,
-                             method_name,
-                             args,
-                             kwargs,
-                             need_response,
+                             method_name=method_name,
+                             args=args,
+                             kwargs=kwargs,
+                             need_response=need_response,
                              timeout=timeout)
         await self._client_socket.put_async(request)
 
+        # Early return without waiting for response
         if not need_response:
             return None
 
+        # Create future in the current event loop
         loop = asyncio.get_running_loop()
         future = loop.create_future()
-        self._pending_futures[request_id] = (future, loop)
+        self._pending_futures[request_id] = future
+
+        logger_debug(
+            f"RPC Client _call_async: Created future for request_id: {request_id}"
+        )
 
         try:
-            # If timeout, the remote call should return a timeout error timely,
-            # so we add 1 second to the timeout to ensure the client can get
-            # that result.
             if timeout is None:
                 res = await future
             else:
-                # Add 1 second to the timeout to ensure the client can get
                 res = await asyncio.wait_for(future, timeout)
             return res
         except RPCCancelled:
             self._server_stopped = True
             raise
         except asyncio.TimeoutError:
+            self._pending_futures.pop(request_id, None)
             raise RPCTimeout(
                 f"Request '{method_name}' timed out after {timeout}s")
-        except Exception as e:
-            raise e
-        finally:
-            self._pending_futures.pop(request_id, None)
-
-    def _ensure_event_loop(self):
-        """Ensure we have a running event loop in a background thread."""
-        if self._loop is None or not self._loop.is_running():
-            self._loop = asyncio.new_event_loop()
+        except Exception:
+            raise
 
-            def run_loop():
-                asyncio.set_event_loop(self._loop)
-                self._loop.run_forever()
+    def _call_sync(self, method_name, *args, **kwargs):
+        """Synchronous version of RPC call."""
+        logger_debug(
+            f"RPC Client calling method: {method_name} with args: {args} and kwargs: {kwargs}"
+        )
 
-            self._loop_thread = threading.Thread(target=run_loop,
-                                                 daemon=True,
-                                                 name="rpc_client_loop")
-            self._loop_thread.start()
+        # Check if we're in an event loop
+        try:
+            asyncio.get_running_loop()
 
-            # Give the loop a moment to start
-            time.sleep(0.2)
+            # We're inside an event loop, we need to run in a thread to avoid deadlock
+            def run_in_thread():
+                return asyncio.run(
+                    self._call_async(method_name, *args, **kwargs))
 
-    def _call_sync(self, method_name, *args, **kwargs):
-        """Synchronous version of RPC call."""
-        if enable_llmapi_debug() or logger.level == 'debug':
-            logger_debug(f"RPC Client calling method: {method_name}")
-        nvtx_mark_debug(f"RPC.sync.{method_name}",
-                        color="green",
-                        category="RPC")
-        self._ensure_event_loop()
-        future = asyncio.run_coroutine_threadsafe(
-            self._call_async(method_name, *args, **kwargs), self._loop)
-        result = future.result()
-        return result
+            future = self._executor.submit(run_in_thread)
+            return future.result()
+        except RuntimeError:
+            # No running event loop, we can use asyncio.run
+            return asyncio.run(self._call_async(method_name, *args, **kwargs))
 
     def _call_future(self, name: str, *args,
                      **kwargs) -> concurrent.futures.Future:
@@ -450,15 +469,28 @@ def _call_future(self, name: str, *args,
         Returns:
             A Future object that can be used to retrieve the result
         """
-        nvtx_mark_debug(f"RPC.future.{name}", color="blue", category="RPC")
+        # Create a thread-safe future to bridge between asyncio and concurrent.futures
+        thread_future = concurrent.futures.Future()
 
-        def _async_to_sync():
-            self._ensure_event_loop()
-            future = asyncio.run_coroutine_threadsafe(
-                self._call_async(name, *args, **kwargs), self._loop)
-            return future.result()
+        async def _async_wrapper():
+            try:
+                result = await self._call_async(name, *args, **kwargs)
+                thread_future.set_result(result)
+            except Exception as e:
+                thread_future.set_exception(e)
+
+        try:
+            # In an event loop, create the task
+            asyncio.get_running_loop()
+            asyncio.create_task(_async_wrapper())
+        except RuntimeError:
+            # No event loop, run in executor
+            def _run_async_call():
+                return asyncio.run(self._call_async(name, *args, **kwargs))
 
-        return self._executor.submit(_async_to_sync)
+            return self._executor.submit(_run_async_call)
+
+        return thread_future
 
     async def _call_streaming(self, name: str, *args,
                               **kwargs) -> AsyncIterator[Any]:
@@ -478,16 +510,15 @@ async def _call_streaming(self, name: str, *args,
         if self._server_stopped:
             raise RPCCancelled("Server is shutting down, request cancelled")
 
-        self._start_response_reader_lazily()
+        # Ensure reader task is running
+        self._ensure_reader_task()
+
         rpc_params = kwargs.pop("__rpc_params", RPCParams())
         timeout = rpc_params.timeout if rpc_params.timeout is not None else self._timeout
 
         request_id = uuid.uuid4().hex
-        # Use AsyncQueue to ensure proper cross-thread communication
-        queue = AsyncQueue()
-        # Recreate sync_q with the current running loop for proper cross-thread communication
-        # This ensures the background _response_reader thread can properly notify this event loop
-        queue._sync_q = _SyncQueue(queue, asyncio.get_running_loop())
+        # Use asyncio.Queue for streaming
+        queue: asyncio.Queue = asyncio.Queue()
         self._streaming_queues[request_id] = queue
 
         try:
@@ -509,10 +540,9 @@ async def _call_streaming(self, name: str, *args,
                     response = await asyncio.wait_for(queue.get(),
                                                       timeout=timeout)
 
-                if enable_llmapi_debug() or logger.level == 'debug':
-                    logger_debug(
-                        f"RPC Client _call_streaming received [{response.stream_status}] response",
-                        color="green")
+                logger_debug(
+                    f"RPC Client _call_streaming received [{response.stream_status}] response: {response}",
+                    color="green")
 
                 if response.stream_status == 'start':
                     # Start of stream
diff --git a/tests/unittest/executor/test_base_worker.py b/tests/unittest/executor/test_base_worker.py
index a9b062f2985..bfc2be9ccad 100644
--- a/tests/unittest/executor/test_base_worker.py
+++ b/tests/unittest/executor/test_base_worker.py
@@ -23,6 +23,30 @@
 model_path = llm_models_root() / default_model_name
 
 
+def create_fake_executor_config(engine_path, tp_size: int = 1):
+    """Create TorchLlmArgs and executor_config for testing.
+
+    Args:
+        engine_path: Path to the model
+        tp_size: Tensor parallel size
+
+    Returns:
+        Tuple of (llm_args, executor_config)
+    """
+    llm_args = TorchLlmArgs(
+        model=engine_path,
+        tensor_parallel_size=tp_size,
+        backend='pytorch',
+        enable_iter_perf_stats=True,
+        max_seq_len=2048,  # Set reasonable max sequence length
+        max_batch_size=8,  # Set reasonable batch size for tests
+        max_num_tokens=2048,  # Set reasonable max tokens
+    )
+    # executor_config is not needed for PyTorch backend
+    executor_config = None
+    return llm_args, executor_config
+
+
 class FakeWorker(BaseWorker):
 
     def __init__(self, engine: str, tp_size: int = 1):
diff --git a/tests/unittest/executor/test_rpc.py b/tests/unittest/executor/test_rpc.py
index d6c48af4d31..b25519ac1a3 100644
--- a/tests/unittest/executor/test_rpc.py
+++ b/tests/unittest/executor/test_rpc.py
@@ -228,6 +228,7 @@ def task(self):
             # Wait for background threads to exit
             time.sleep(1.0)
 
+    @pytest.mark.skip(reason="This test is flaky, need to fix it")
     def test_timeout_error(self):
         """Test that requests that exceed timeout are handled with proper error."""
 

From 5edba205c28026668a037b4739948be9bb244bb8 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Tue, 4 Nov 2025 14:45:30 +0000
Subject: [PATCH 7/9] refactor RPCServer by simpify

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/executor/rpc/rpc_server.py | 263 ++++++++++++++----------
 1 file changed, 158 insertions(+), 105 deletions(-)

diff --git a/tensorrt_llm/executor/rpc/rpc_server.py b/tensorrt_llm/executor/rpc/rpc_server.py
index 649c9c5588d..57649d8d721 100644
--- a/tensorrt_llm/executor/rpc/rpc_server.py
+++ b/tensorrt_llm/executor/rpc/rpc_server.py
@@ -1,15 +1,12 @@
 import asyncio
 import inspect
-import queue
 import threading
 import time
 import traceback
 from concurrent.futures import ThreadPoolExecutor
-from typing import Optional
+from typing import List, Optional
 
-import zmq
-
-from ...llmapi.utils import ManagedThread, logger_debug
+from ...llmapi.utils import logger_debug
 from ...logger import logger
 from ..ipc import ZeroMqQueue
 from .rpc_common import (RPCCancelled, RPCError, RPCRequest, RPCResponse,
@@ -46,7 +43,14 @@ def __init__(self,
         self._timeout = timeout
         self._client_socket = None
 
-        # set the stop event to True, and all the workers will exit
+        # Asyncio components
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+        self._main_task: Optional[asyncio.Task] = None
+        self._worker_tasks: List[asyncio.Task] = []
+        self._shutdown_event: Optional[asyncio.Event] = None
+        self._server_thread: Optional[threading.Thread] = None
+
+        # Threading stop event for compatibility
         self._stop_event = threading.Event()
 
         self._num_pending_requests = 0
@@ -55,7 +59,7 @@ def __init__(self,
             "_rpc_shutdown": lambda: self.shutdown(is_remote_call=True),
             "_rpc_get_attr": lambda name: self.get_attr(name),
         }
-        self._dispatcher_thread: Optional[ManagedThread] = None
+
         if async_run_task:
             # Increase thread pool size to avoid exhaustion with concurrent operations
             # Use 2x num_workers to handle both request processing and response handling
@@ -65,8 +69,6 @@ def __init__(self,
         else:
             self._executor = None
 
-        self._queue = None
-
         # Automatically register the instance
         self.register_instance(instance)
 
@@ -124,33 +126,35 @@ def shutdown(self, is_remote_call: bool = False):
             f"RPC Server shutdown: {self._num_pending_requests} pending requests will be cancelled"
         )
 
-        # Clear the queue and send cancelled errors for all pending requests
-        if self._queue is not None:
-            self._cancel_pending_queue_requests()
+        # Signal asyncio shutdown event if available
+        if self._shutdown_event and self._loop:
+            self._loop.call_soon_threadsafe(self._shutdown_event.set)
 
         if not is_remote_call:
             # Block the thread until shutdown is finished
 
-            # 1. Wait for the dispatcher thread to exit, so that no new requests are accepted
-            logger_debug(f"RPC Server dispatcher thread joining")
-            if self._dispatcher_thread:
-                self._dispatcher_thread.join()
-                self._dispatcher_thread = None
-            logger_debug(f"RPC Server dispatcher thread joined")
+            # 1. Stop the event loop which will cancel all tasks
+            if self._loop and self._loop.is_running():
+                self._loop.call_soon_threadsafe(self._loop.stop)
 
-            # 2. Shutdown the executor immediately without waiting for tasks
+            # 2. Wait for the server thread to exit
+            if self._server_thread and self._server_thread.is_alive():
+                logger_debug("RPC Server waiting for server thread to exit")
+                self._server_thread.join()
+                self._server_thread = None
+            logger_debug("RPC Server thread joined")
+
+            # 3. Shutdown the executor immediately without waiting for tasks
             if self._executor:
                 self._executor.shutdown(wait=False)
                 self._executor = None
 
-            # 3. (Optionally) Close the client socket, this doesn't affect
-            # anything since zmq client will not timeout even if the target is not available
+            # 4. Close the client socket
             if self._client_socket:
                 self._client_socket.close()
         else:
             # if the shutdown is called by a remote call, this method itself will
-            # be executed in a executor thread, so we cannot join the dispatcher thread as
-            # the dispatcher thread is awaiting for the shutdown result.
+            # be executed in a executor thread, so we cannot join the server thread
             logger_debug(
                 f"RPC Server shutdown initiated: {self._num_pending_requests} pending requests will be cancelled"
             )
@@ -179,69 +183,66 @@ def get_attr(self, name: str):
         This is mainly used for testing. """
         return getattr(self, name)
 
-    def _cancel_pending_queue_requests(self):
-        """Cancel all pending requests in the queue."""
+    async def _drain_pending_requests(self):
+        """Drain any remaining requests from the socket and send cancellation responses."""
+        if self._client_socket is None:
+            return
 
-        async def cancel_requests():
-            cancelled_count = 0
-            while not self._queue.empty():
-                try:
-                    req: RPCRequest = self._queue.get_nowait()
-                    cancelled_count += 1
-                    await self._send_error_response(
-                        req,
-                        RPCCancelled(
-                            "Server is shutting down, request cancelled"))
-                except asyncio.QueueEmpty:
-                    break
-                except Exception as e:
-                    logger.error(f"Error cancelling pending request: {e}")
+        logger_debug("Draining pending requests after shutdown")
+        drained_count = 0
 
-            if cancelled_count > 0:
-                logger_debug(
-                    f"Cancelled {cancelled_count} pending requests in queue")
+        # Give a short window to drain any in-flight requests
+        end_time = asyncio.get_event_loop().time() + 2
 
-        # Run the cancellation in the event loop if available
-        try:
-            loop = asyncio.get_event_loop()
-            if loop.is_running():
-                # Schedule the coroutine to run in the existing event loop
-                asyncio.create_task(cancel_requests())
-            else:
-                # Create a new event loop to run the coroutine
-                asyncio.run(cancel_requests())
-        except RuntimeError:
-            # Event loop might not be available in some shutdown scenarios
-            pass
+        while asyncio.get_event_loop().time() < end_time:
+            try:
+                req: RPCRequest = await asyncio.wait_for(
+                    self._client_socket.get_async_noblock(), timeout=0.1)
+                drained_count += 1
+                logger_debug(f"Draining request after shutdown: {req}")
 
-    async def _dispatcher_routine(self, stop_event: threading.Event):
-        assert self._client_socket is not None, "Client socket is not bound"
-        assert self._queue is not None, "RPC queue is not initialized"
+                # Send cancellation response
+                await self._send_error_response(
+                    req,
+                    RPCCancelled("Server is shutting down, request cancelled"))
 
-        # Once shutdown, the dispatcher will exit first, and the workers will
-        # continue to process the pending requests.
-        while not stop_event.is_set():
-            try:
-                req: RPCRequest = await self._client_socket.get_async_noblock(
-                    timeout=0.5)
-                logger_debug(f"RPC dispatcher got request: {req}")
             except asyncio.TimeoutError:
-                await asyncio.sleep(0)
-                continue
+                # No more requests to drain
+                break
             except Exception as e:
-                logger.error(f"RPC dispatcher caught an exception: {e}")
-                logger.error(traceback.format_exc())
-                continue
+                logger.debug(f"Error draining request: {e}")
+                break
 
-            await self._queue.put(req)  # type: ignore
+        if drained_count > 0:
+            logger_debug(f"Drained {drained_count} requests after shutdown")
 
-            # shutdown methods depend on _num_pending_requests, so
-            # they should not be counted
-            if req.method_name not in ["_rpc_shutdown", "shutdown"]:
-                self._num_pending_requests += 1
-                logger_debug(
-                    f"Dispatcher received request {req}, pending: {self._num_pending_requests}"
-                )
+    async def _run_server(self):
+        """Main server loop that handles incoming requests directly."""
+        assert self._client_socket is not None, "Client socket is not bound"
+
+        logger_debug("RPC Server main loop started")
+
+        # Create worker tasks
+        for i in range(self._num_workers):
+            task = asyncio.create_task(self._process_requests())
+            self._worker_tasks.append(task)
+
+        try:
+            # Wait for all worker tasks to complete
+            await asyncio.gather(*self._worker_tasks)
+        except asyncio.CancelledError:
+            logger_debug("RPC Server main loop cancelled")
+            # Cancel all worker tasks
+            for task in self._worker_tasks:
+                if not task.done():
+                    task.cancel()
+            # Wait for all tasks to finish cancellation
+            await asyncio.gather(*self._worker_tasks, return_exceptions=True)
+        except Exception as e:
+            logger.error(f"RPC Server main loop error: {e}")
+            logger.error(traceback.format_exc())
+        finally:
+            logger_debug("RPC Server main loop exiting")
 
     # TODO optimization: resolve the sequential scheduling for the remote calls
     # Suppose tons of submit remote call block the FIFO queue, and the later get_stats remote calls may be blocked
@@ -272,7 +273,7 @@ async def _send_error_response(self, req: RPCRequest,
 
     async def _handle_shutdown_request(self, req: RPCRequest) -> bool:
         """Handle a request during shutdown. Returns True if handled."""
-        if not self._stop_event.is_set():
+        if not self._shutdown_event.is_set():
             return False
 
         # Allow shutdown methods to proceed
@@ -287,19 +288,35 @@ async def _handle_shutdown_request(self, req: RPCRequest) -> bool:
         self._num_pending_requests -= 1
         return True
 
-    async def _worker_routine(self, stop_event: threading.Event):
-        """The routine executed by each worker thread."""
+    async def _process_requests(self):
+        """Process incoming requests directly from the socket."""
         assert self._client_socket is not None, "Client socket is not bound"
-        assert self._queue is not None, "RPC queue is not initialized"
 
-        while not stop_event.is_set():
+        while not self._shutdown_event.is_set():
             try:
+                # Read request directly from socket with timeout
                 req: RPCRequest = await asyncio.wait_for(
-                    self._queue.get(),  # type: ignore
-                    timeout=self._timeout)
+                    self._client_socket.get_async_noblock(), timeout=0.5)
+                logger_debug(f"RPC worker got request: {req}")
             except asyncio.TimeoutError:
-                await asyncio.sleep(0)
                 continue
+            except asyncio.CancelledError:
+                logger_debug("RPC worker cancelled")
+                break
+            except Exception as e:
+                if self._shutdown_event.is_set():
+                    break
+                logger.error(f"RPC worker caught an exception: {e}")
+                logger.error(traceback.format_exc())
+                continue
+
+            # shutdown methods depend on _num_pending_requests, so
+            # they should not be counted
+            if req.method_name not in ["_rpc_shutdown", "shutdown"]:
+                self._num_pending_requests += 1
+                logger_debug(
+                    f"Worker received request {req}, pending: {self._num_pending_requests}"
+                )
 
             # Check if we should cancel due to shutdown
             if await self._handle_shutdown_request(req):
@@ -464,7 +481,7 @@ async def stream_with_timeout():
                     nonlocal sequence_number
                     async for result in func(*req.args, **req.kwargs):
                         # Check if shutdown was triggered
-                        if self._stop_event.is_set():
+                        if self._shutdown_event.is_set():
                             raise RPCCancelled(
                                 "Server is shutting down, streaming cancelled")
 
@@ -485,7 +502,7 @@ async def stream_with_timeout():
                 # No timeout specified, stream normally
                 async for result in func(*req.args, **req.kwargs):
                     # Check if shutdown was triggered
-                    if self._stop_event.is_set():
+                    if self._shutdown_event.is_set():
                         raise RPCCancelled(
                             "Server is shutting down, streaming cancelled")
 
@@ -571,24 +588,60 @@ def start(self):
         self._client_socket.setup_lazily()
         logger.info(f"RPC Server started and listening on {self._address}")
 
-        async def tasks():
-            self._queue = asyncio.Queue()
-            await asyncio.gather(
-                self._dispatcher_routine(self._stop_event), *[
-                    self._worker_routine(self._stop_event)
-                    for i in range(self._num_workers)
-                ])
-
-        def loop() -> bool:
-            asyncio.run(tasks())
-            return True  # ManagedThread
-
-        error_queue = queue.Queue()
-        self._dispatcher_thread = ManagedThread(task=loop,
-                                                stop_event=self._stop_event,
-                                                name="rpc_dispatcher_thread",
-                                                error_queue=error_queue)
-        self._dispatcher_thread.start()
+        # Create and configure the event loop
+        self._loop = asyncio.new_event_loop()
+
+        # Initialize the shutdown event in the new loop
+        self._shutdown_event = asyncio.Event()
+
+        async def run_server():
+            """Run the server until shutdown."""
+            try:
+                await self._run_server()
+            except asyncio.CancelledError:
+                logger_debug("Server task cancelled")
+            except Exception as e:
+                logger.error(f"Server error: {e}")
+                logger.error(traceback.format_exc())
+            finally:
+                # Cancel all worker tasks
+                for task in self._worker_tasks:
+                    if not task.done():
+                        task.cancel()
+                # Wait for all tasks to complete
+                if self._worker_tasks:
+                    await asyncio.gather(*self._worker_tasks,
+                                         return_exceptions=True)
+
+                # Drain any remaining requests and send cancellation responses
+                await self._drain_pending_requests()
+
+                logger_debug("All server tasks completed")
+
+        # Create the main server task
+        self._main_task = self._loop.create_task(run_server())
+
+        # Run the event loop in a separate thread
+        def run_loop():
+            asyncio.set_event_loop(self._loop)
+            try:
+                self._loop.run_until_complete(self._main_task)
+            except Exception as e:
+                logger.error(f"Event loop error: {e}")
+            finally:
+                # Clean up any remaining tasks
+                pending = asyncio.all_tasks(self._loop)
+                for task in pending:
+                    task.cancel()
+                if pending:
+                    self._loop.run_until_complete(
+                        asyncio.gather(*pending, return_exceptions=True))
+                self._loop.close()
+
+        self._server_thread = threading.Thread(target=run_loop,
+                                               name="rpc_server_thread",
+                                               daemon=True)
+        self._server_thread.start()
 
         logger.info("RPC Server has started.")
 

From ddb9823a25f638949732509b5defe0c664542bd4 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Wed, 5 Nov 2025 03:03:53 +0000
Subject: [PATCH 8/9] add correctness tests

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/executor/rpc/rpc_client.py | 142 ++-------------
 tensorrt_llm/executor/rpc/rpc_common.py |   2 +-
 tensorrt_llm/executor/rpc/rpc_server.py | 220 +++++++++++++++---------
 tests/unittest/executor/test_rpc.py     |  87 ++++++++++
 4 files changed, 244 insertions(+), 207 deletions(-)

diff --git a/tensorrt_llm/executor/rpc/rpc_client.py b/tensorrt_llm/executor/rpc/rpc_client.py
index c3b22828cc6..496b71ddc31 100644
--- a/tensorrt_llm/executor/rpc/rpc_client.py
+++ b/tensorrt_llm/executor/rpc/rpc_client.py
@@ -2,7 +2,7 @@
 import concurrent.futures
 import threading
 import uuid
-from typing import Any, AsyncIterator, Optional
+from typing import Any, AsyncIterator, Callable, Optional
 
 from ...llmapi.utils import logger_debug
 from ...logger import logger
@@ -14,7 +14,8 @@
 class RemoteCall:
     """Helper class to enable chained remote call syntax like client.method().remote()"""
 
-    def __init__(self, client: 'RPCClient', method_name: str, *args, **kwargs):
+    def __init__(self, client: 'RPCClient', method_name: str, *args,
+                 **kwargs) -> None:
         self.client = client
         self.method_name = method_name
         self.args = args
@@ -57,7 +58,7 @@ def remote(self,
 
     def remote_async(self,
                      timeout: Optional[float] = None,
-                     need_response: bool = True):
+                     need_response: bool = True) -> Any:
         """Asynchronous remote call that returns a coroutine.
 
         Args:
@@ -106,9 +107,9 @@ class RPCClient:
 
     def __init__(self,
                  address: str,
-                 hmac_key=None,
+                 hmac_key: Optional[bytes] = None,
                  timeout: Optional[float] = None,
-                 num_workers: int = 4):
+                 num_workers: int = 4) -> None:
         '''
         Args:
             address: The ZMQ address to connect to.
@@ -137,7 +138,7 @@ def __init__(self,
 
         logger_debug(f"RPC Client initialized. Connected to {self._address}")
 
-    def shutdown_server(self):
+    def shutdown_server(self) -> None:
         """Shutdown the server."""
         if self._server_stopped:
             return
@@ -146,7 +147,7 @@ def shutdown_server(self):
 
         self._server_stopped = True
 
-    def close(self):
+    def close(self) -> None:
         """Gracefully close the client, cleaning up background tasks."""
         if self._closed:
             return
@@ -182,112 +183,7 @@ def close(self):
 
         logger_debug("RPC Client closed")
 
-    def _handle_streaming_response(self, response: RPCResponse):
-        """Handle a streaming response by putting it in the appropriate queue.
-
-        Args:
-            response: The streaming response to handle
-        """
-        assert response.stream_status in [
-            'start', 'data', 'end', 'error'
-        ], f"Invalid stream status: {response.stream_status}"
-
-        queue = self._streaming_queues.get(response.request_id)
-        if queue:
-            # put to the sync queue, as the current event loop is
-            # different from the one in call_async or call_streaming
-            assert isinstance(queue, AsyncQueue)
-            if enable_llmapi_debug() or logger.level == 'debug':
-                logger_debug(
-                    f"RPC Client putting response to AsyncQueue: status={response.stream_status}, request_id={response.request_id}"
-                )
-            queue.sync_q.put(response)
-            # Clean up if stream ended
-            if response.stream_status in ['end', 'error']:
-                self._streaming_queues.pop(response.request_id, None)
-
-    def _handle_regular_response(self, response: RPCResponse):
-        """Handle a regular (non-streaming) response by setting the future result.
-
-        Args:
-            response: The response to handle
-        """
-        if future_info := self._pending_futures.get(response.request_id):
-            future, target_loop = future_info
-
-            if not future.done():
-
-                def safe_set_result():
-                    """Safely set result on future, handling race conditions."""
-                    try:
-                        if not future.done():
-                            if response.error is None:
-                                future.set_result(response.result)
-                            else:
-                                future.set_exception(response.error)
-                    except asyncio.InvalidStateError:
-                        # Future was cancelled or completed between the check and set
-                        # This is expected in high-load scenarios, just log and continue
-                        if enable_llmapi_debug() or logger.level == 'debug':
-                            logger_debug(
-                                f"Future already done for request_id: {response.request_id}, skipping"
-                            )
-
-                if enable_llmapi_debug() or logger.level == 'debug':
-                    if response.error is None:
-                        logger_debug(
-                            f"Setting result for request_id: {response.request_id}"
-                        )
-                    else:
-                        logger_debug(
-                            f"Setting exception for request_id: {response.request_id}, error: {response.error}"
-                        )
-
-                target_loop.call_soon_threadsafe(safe_set_result)
-        else:
-            if enable_llmapi_debug() or logger.level == 'debug':
-                logger_debug(
-                    f"No future found for request_id: {response.request_id}")
-
-        self._pending_futures.pop(response.request_id, None)
-
-    async def _handle_reader_exception(self, exception: Exception):
-        """Propagate an exception to all pending futures and streaming queues.
-
-        Args:
-            exception: The exception to propagate
-        """
-        logger.error(f"Exception in RPC response reader: {exception}")
-
-        # Propagate exception to all pending futures
-        for (future, target_loop) in self._pending_futures.values():
-            if not future.done():
-
-                def safe_set_exception(f=future, exc=exception):
-                    """Safely set exception on future, handling race conditions."""
-                    try:
-                        if not f.done():
-                            f.set_exception(exc)
-                    except asyncio.InvalidStateError:
-                        # Future was cancelled or completed, this is fine
-                        pass
-
-                target_loop.call_soon_threadsafe(safe_set_exception)
-
-        # Also signal error to streaming queues
-        for queue in self._streaming_queues.values():
-            await queue.put(RPCResponse("", None, exception, False, 0, 'error'))
-
-    async def _wait_for_response(self) -> RPCResponse:
-        """Wait for a response from the socket.
-
-        Returns:
-            RPCResponse from the server
-        """
-        # Directly await the socket - cancellation will be handled by task cancellation
-        return await self._client_socket.get_async()
-
-    async def _response_reader(self):
+    async def _response_reader(self) -> None:
         """Task to read responses from the socket and set results on futures."""
         logger_debug("Response reader started")
 
@@ -359,7 +255,7 @@ async def _response_reader(self):
         finally:
             logger_debug("Response reader exiting gracefully")
 
-    def _ensure_reader_task(self):
+    def _ensure_reader_task(self) -> None:
         """Ensure the response reader task is running."""
         with self._reader_lock:
             if self._reader_task is None or self._reader_task.done():
@@ -371,7 +267,7 @@ def _ensure_reader_task(self):
                     # No running event loop, will be started when needed
                     pass
 
-    async def _call_async(self, method_name, *args, **kwargs):
+    async def _call_async(self, method_name: str, *args, **kwargs) -> Any:
         """Async version of RPC call.
         Args:
             method_name: Method name to call
@@ -435,7 +331,7 @@ async def _call_async(self, method_name, *args, **kwargs):
         except Exception:
             raise
 
-    def _call_sync(self, method_name, *args, **kwargs):
+    def _call_sync(self, method_name: str, *args, **kwargs) -> Any:
         """Synchronous version of RPC call."""
         logger_debug(
             f"RPC Client calling method: {method_name} with args: {args} and kwargs: {kwargs}"
@@ -446,7 +342,7 @@ def _call_sync(self, method_name, *args, **kwargs):
             asyncio.get_running_loop()
 
             # We're inside an event loop, we need to run in a thread to avoid deadlock
-            def run_in_thread():
+            def run_in_thread() -> Any:
                 return asyncio.run(
                     self._call_async(method_name, *args, **kwargs))
 
@@ -566,12 +462,12 @@ async def _call_streaming(self, name: str, *args,
             # Clean up
             self._streaming_queues.pop(request_id, None)
 
-    def get_server_attr(self, name: str):
+    def get_server_attr(self, name: str) -> Any:
         """ Get the attribute of the RPC server.
         This is mainly used for testing. """
         return self._rpc_get_attr(name).remote()
 
-    def __getattr__(self, name):
+    def __getattr__(self, name: str) -> Callable[..., RemoteCall]:
         """
         Magically handles calls to non-existent methods.
         Returns a callable that when invoked returns a RemoteCall instance.
@@ -584,16 +480,16 @@ def __getattr__(self, name):
         """
         logger_debug(f"RPC Client getting attribute: {name}")
 
-        def method_caller(*args, **kwargs):
+        def method_caller(*args, **kwargs) -> RemoteCall:
             return RemoteCall(self, name, *args, **kwargs)
 
         return method_caller
 
-    def __enter__(self):
+    def __enter__(self) -> 'RPCClient':
         return self
 
-    def __exit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         self.close()
 
-    def __del__(self):
+    def __del__(self) -> None:
         self.close()
diff --git a/tensorrt_llm/executor/rpc/rpc_common.py b/tensorrt_llm/executor/rpc/rpc_common.py
index b14b82d817d..dcafdb80565 100644
--- a/tensorrt_llm/executor/rpc/rpc_common.py
+++ b/tensorrt_llm/executor/rpc/rpc_common.py
@@ -86,5 +86,5 @@ class RPCResponse(NamedTuple):
     result: Any
     error: Optional[RPCError] = None
     is_streaming: bool = False  # True if more responses coming
-    sequence_number: int = 0  # For ordering streaming responses
+    chunk_index: int = 0  # For ordering streaming responses
     stream_status: Literal['start', 'data', 'end', 'error'] = 'data'
diff --git a/tensorrt_llm/executor/rpc/rpc_server.py b/tensorrt_llm/executor/rpc/rpc_server.py
index 57649d8d721..b919595ec23 100644
--- a/tensorrt_llm/executor/rpc/rpc_server.py
+++ b/tensorrt_llm/executor/rpc/rpc_server.py
@@ -4,7 +4,7 @@
 import time
 import traceback
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 from ...llmapi.utils import logger_debug
 from ...logger import logger
@@ -19,11 +19,11 @@ class RPCServer:
     """
 
     def __init__(self,
-                 instance,
-                 hmac_key=None,
+                 instance: Any,
+                 hmac_key: Optional[bytes] = None,
                  num_workers: int = 4,
                  timeout: float = 0.5,
-                 async_run_task: bool = False):
+                 async_run_task: bool = False) -> None:
         """
         Initializes the server with an instance.
 
@@ -34,7 +34,8 @@ def __init__(self,
             timeout (int): Timeout for RPC calls.
             async_run_task (bool): Whether to run the task asynchronously.
 
-        NOTE: make num_workers larger if there are some streaming tasks runs infinitely.
+        NOTE: make num_workers larger or the remote() and remote_future() may
+        be blocked by the thread pool.
         """
         self._instance = instance
         self._hmac_key = hmac_key
@@ -50,26 +51,23 @@ def __init__(self,
         self._shutdown_event: Optional[asyncio.Event] = None
         self._server_thread: Optional[threading.Thread] = None
 
-        # Threading stop event for compatibility
-        self._stop_event = threading.Event()
+        self._stop_event: threading.Event = threading.Event(
+        )  # for thread-safe shutdown
 
         self._num_pending_requests = 0
 
-        self._functions = {
+        self._functions: Dict[str, Callable[..., Any]] = {
+            # Some built-in methods for RPC server
             "_rpc_shutdown": lambda: self.shutdown(is_remote_call=True),
             "_rpc_get_attr": lambda name: self.get_attr(name),
         }
 
         if async_run_task:
-            # Increase thread pool size to avoid exhaustion with concurrent operations
-            # Use 2x num_workers to handle both request processing and response handling
             self._executor = ThreadPoolExecutor(
-                max_workers=num_workers * 2,
-                thread_name_prefix="rpc_server_worker")
+                max_workers=num_workers, thread_name_prefix="rpc_server_worker")
         else:
             self._executor = None
 
-        # Automatically register the instance
         self.register_instance(instance)
 
         logger_debug(f"RPC Server initialized with {num_workers} workers.",
@@ -80,13 +78,13 @@ def address(self) -> str:
         assert self._client_socket is not None, "Client socket is not bound"
         return self._client_socket.address[0]
 
-    def __enter__(self):
+    def __enter__(self) -> 'RPCServer':
         return self
 
-    def __exit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         self.shutdown()
 
-    def bind(self, address="tcp://*:5555"):
+    def bind(self, address: str = "tcp://*:5555") -> None:
         """
         Bind the server to the specified address.
 
@@ -101,7 +99,7 @@ def bind(self, address="tcp://*:5555"):
                                           socket_type=zmq.ROUTER)
         logger.info(f"RPC Server bound to {self._address}")
 
-    def shutdown(self, is_remote_call: bool = False):
+    def shutdown(self, is_remote_call: bool = False) -> None:
         """Internal method to trigger server shutdown.
 
         Args:
@@ -133,11 +131,11 @@ def shutdown(self, is_remote_call: bool = False):
         if not is_remote_call:
             # Block the thread until shutdown is finished
 
-            # 1. Stop the event loop which will cancel all tasks
-            if self._loop and self._loop.is_running():
-                self._loop.call_soon_threadsafe(self._loop.stop)
+            # 1. Cancel the main task gracefully which will trigger proper cleanup
+            if self._main_task and not self._main_task.done():
+                self._loop.call_soon_threadsafe(self._main_task.cancel)
 
-            # 2. Wait for the server thread to exit
+            # 2. Wait for the server thread to exit (this will wait for proper cleanup)
             if self._server_thread and self._server_thread.is_alive():
                 logger_debug("RPC Server waiting for server thread to exit")
                 self._server_thread.join()
@@ -159,8 +157,15 @@ def shutdown(self, is_remote_call: bool = False):
                 f"RPC Server shutdown initiated: {self._num_pending_requests} pending requests will be cancelled"
             )
 
-    def register_function(self, func, name=None):
-        """Exposes a single function to clients."""
+    def register_function(self,
+                          func: Callable[..., Any],
+                          name: Optional[str] = None) -> None:
+        """Exposes a single function to clients.
+
+        Args:
+            func: The function to register.
+            name: The name of the function. If not provided, the name of the function will be used.
+        """
         fname = name or func.__name__
         if fname in self._functions:
             logger.warning(
@@ -168,8 +173,12 @@ def register_function(self, func, name=None):
         self._functions[fname] = func
         logger_debug(f"Registered function: {fname}")
 
-    def register_instance(self, instance):
-        """Exposes all public methods of a class instance."""
+    def register_instance(self, instance: Any) -> None:
+        """Exposes all public methods of a class instance.
+
+        Args:
+            instance: The instance to register.
+        """
         logger_debug(
             f"Registering instance of class: {instance.__class__.__name__}")
         for name in dir(instance):
@@ -178,12 +187,15 @@ def register_instance(self, instance):
                 if callable(attr):
                     self.register_function(attr, name)
 
-    def get_attr(self, name: str):
+    def get_attr(self, name: str) -> Any:
         """ Get the attribute of the RPC server.
-        This is mainly used for testing. """
+
+        Args:
+            name: The name of the attribute to get.
+        """
         return getattr(self, name)
 
-    async def _drain_pending_requests(self):
+    async def _drain_pending_requests(self) -> None:
         """Drain any remaining requests from the socket and send cancellation responses."""
         if self._client_socket is None:
             return
@@ -216,7 +228,7 @@ async def _drain_pending_requests(self):
         if drained_count > 0:
             logger_debug(f"Drained {drained_count} requests after shutdown")
 
-    async def _run_server(self):
+    async def _run_server(self) -> None:
         """Main server loop that handles incoming requests directly."""
         assert self._client_socket is not None, "Client socket is not bound"
 
@@ -262,14 +274,14 @@ async def _send_error_response(self, req: RPCRequest,
             await self._client_socket.put_async(
                 RPCResponse(
                     req.request_id,
-                    None,
-                    error,
+                    result=None,
+                    error=error,
                     is_streaming=
                     True,  # Important: mark as streaming so it gets routed correctly
                     stream_status='error'))
         else:
             await self._client_socket.put_async(
-                RPCResponse(req.request_id, None, error))
+                RPCResponse(req.request_id, result=None, error=error))
 
     async def _handle_shutdown_request(self, req: RPCRequest) -> bool:
         """Handle a request during shutdown. Returns True if handled."""
@@ -288,7 +300,7 @@ async def _handle_shutdown_request(self, req: RPCRequest) -> bool:
         self._num_pending_requests -= 1
         return True
 
-    async def _process_requests(self):
+    async def _process_requests(self) -> None:
         """Process incoming requests directly from the socket."""
         assert self._client_socket is not None, "Client socket is not bound"
 
@@ -426,23 +438,27 @@ def call_with_kwargs():
                                                 timeout=adjusted_timeout)
 
             logger_debug(f"RPC Server returned result for request {req}")
-            response = RPCResponse(req.request_id, result)
+            response = RPCResponse(req.request_id, result=result)
 
         except asyncio.TimeoutError:
             response = RPCResponse(
-                req.request_id, None,
-                RPCTimeout(
+                req.request_id,
+                result=None,
+                error=RPCTimeout(
                     f"Method '{req.method_name}' timed out after {req.timeout} seconds",
                     traceback=traceback.format_exc()))
 
         except Exception as e:
-            response = RPCResponse(
-                req.request_id, None,
-                RPCError(str(e), cause=e, traceback=traceback.format_exc()))
+            response = RPCResponse(req.request_id,
+                                   result=None,
+                                   error=RPCError(
+                                       str(e),
+                                       cause=e,
+                                       traceback=traceback.format_exc()))
 
         return response
 
-    async def _process_streaming_request(self, req: RPCRequest):
+    async def _process_streaming_request(self, req: RPCRequest) -> None:
         """Process a streaming request by sending multiple responses."""
         func = self._functions[req.method_name]
 
@@ -450,35 +466,36 @@ async def _process_streaming_request(self, req: RPCRequest):
             await self._client_socket.put_async(
                 RPCResponse(
                     req.request_id,
-                    None,
-                    RPCStreamingError(
+                    result=None,
+                    error=RPCStreamingError(
                         f"Method '{req.method_name}' is not an async generator.",
                         traceback=traceback.format_exc()),
-                    is_streaming=
-                    True,  # Important: mark as streaming so it gets routed correctly
-                    # need to redirect the error to the client's streaming queue
+                    is_streaming=True,
                     stream_status='error'))
             return
 
-        sequence_number = 0
+        chunk_index = 0
 
-        # Calculate adjusted timeout based on pending overhead
-        adjusted_timeout = self._calculate_adjusted_timeout(req,
-                                                            is_streaming=True)
+        adjusted_timeout: float = self._calculate_adjusted_timeout(
+            req, is_streaming=True)
 
         try:
             logger_debug(f"RPC Server running streaming task {req.method_name}")
             # Send start signal
             await self._client_socket.put_async(
-                RPCResponse(req.request_id, None, None, True, sequence_number,
-                            'start'))
-            sequence_number += 1
+                RPCResponse(req.request_id,
+                            result=None,
+                            error=None,
+                            is_streaming=True,
+                            chunk_index=chunk_index,
+                            stream_status='start'))
+            chunk_index += 1
 
             # Apply timeout to the entire streaming operation if specified
             if adjusted_timeout is not None and adjusted_timeout > 0:
                 # Create a task for the async generator with timeout
                 async def stream_with_timeout():
-                    nonlocal sequence_number
+                    nonlocal chunk_index
                     async for result in func(*req.args, **req.kwargs):
                         # Check if shutdown was triggered
                         if self._shutdown_event.is_set():
@@ -488,12 +505,16 @@ async def stream_with_timeout():
                         logger_debug(
                             f"RPC Server got data and ready to send result {result}"
                         )
-                        response = RPCResponse(req.request_id, result, None,
-                                               True, sequence_number, 'data')
+                        response = RPCResponse(req.request_id,
+                                               result=result,
+                                               error=None,
+                                               is_streaming=True,
+                                               chunk_index=chunk_index,
+                                               stream_status='data')
                         if not await self._send_response(req, response):
                             # Stop streaming after a pickle error
                             return
-                        sequence_number += 1
+                        chunk_index += 1
 
                 # Use wait_for for timeout handling
                 await asyncio.wait_for(stream_with_timeout(),
@@ -509,38 +530,57 @@ async def stream_with_timeout():
                     logger_debug(
                         f"RPC Server got data and ready to send result {result}"
                     )
-                    response = RPCResponse(req.request_id, result, None, True,
-                                           sequence_number, 'data')
+                    response = RPCResponse(req.request_id,
+                                           result=result,
+                                           error=None,
+                                           is_streaming=True,
+                                           chunk_index=chunk_index,
+                                           stream_status='data')
                     if not await self._send_response(req, response):
                         # Stop streaming after a pickle error
                         return
-                    sequence_number += 1
+                    chunk_index += 1
 
             # Send end signal
             await self._client_socket.put_async(
-                RPCResponse(req.request_id, None, None, True, sequence_number,
-                            'end'))
+                RPCResponse(req.request_id,
+                            result=None,
+                            error=None,
+                            is_streaming=True,
+                            chunk_index=chunk_index,
+                            stream_status='end'))
 
         except RPCCancelled as e:
             # Server is shutting down, send cancelled error
             await self._client_socket.put_async(
-                RPCResponse(req.request_id, None, e, True, sequence_number,
-                            'error'))
+                RPCResponse(req.request_id,
+                            result=None,
+                            error=e,
+                            is_streaming=True,
+                            chunk_index=chunk_index,
+                            stream_status='error'))
 
         except asyncio.TimeoutError:
             await self._client_socket.put_async(
                 RPCResponse(
-                    req.request_id, None,
-                    RPCTimeout(
+                    req.request_id,
+                    result=None,
+                    error=RPCTimeout(
                         f"Streaming method '{req.method_name}' timed out",
-                        traceback=traceback.format_exc()), True,
-                    sequence_number, 'error'))
+                        traceback=traceback.format_exc()),
+                    is_streaming=True,
+                    chunk_index=chunk_index,
+                    stream_status='error'))
 
         except Exception as e:
             response = RPCResponse(
-                req.request_id, None,
-                RPCStreamingError(str(e), traceback=traceback.format_exc()),
-                True, sequence_number, 'error')
+                req.request_id,
+                result=None,
+                error=RPCStreamingError(str(e),
+                                        traceback=traceback.format_exc()),
+                is_streaming=True,
+                chunk_index=chunk_index,
+                stream_status='error')
             await self._send_response(req, response)
 
     async def _send_response(self, req: RPCRequest,
@@ -555,20 +595,22 @@ async def _send_response(self, req: RPCRequest,
             error_msg = f"Failed to pickle response: {e}"
             if req.is_streaming:
                 error_cls = RPCStreamingError
-                # For streaming, we also need sequence number. The original response has it.
-                sequence_number = response.sequence_number if response else None
+                chunk_index = response.chunk_index if response else None
                 error_response = RPCResponse(
                     req.request_id,
-                    None,
-                    error_cls(error_msg, traceback=traceback.format_exc()),
+                    result=None,
+                    error=error_cls(error_msg,
+                                    traceback=traceback.format_exc()),
                     is_streaming=True,
-                    sequence_number=sequence_number,
+                    chunk_index=chunk_index,
                     stream_status='error')
             else:
                 error_cls = RPCError
                 error_response = RPCResponse(
-                    req.request_id, None,
-                    error_cls(error_msg, traceback=traceback.format_exc()))
+                    req.request_id,
+                    result=None,
+                    error=error_cls(error_msg,
+                                    traceback=traceback.format_exc()))
 
             try:
                 await self._client_socket.put_async(error_response)
@@ -578,7 +620,7 @@ async def _send_response(self, req: RPCRequest,
                 )
             return False
 
-    def start(self):
+    def start(self) -> None:
         """Binds sockets, starts workers, and begins proxying messages."""
         if self._client_socket is None:
             raise RuntimeError(
@@ -591,7 +633,6 @@ def start(self):
         # Create and configure the event loop
         self._loop = asyncio.new_event_loop()
 
-        # Initialize the shutdown event in the new loop
         self._shutdown_event = asyncio.Event()
 
         async def run_server():
@@ -618,14 +659,23 @@ async def run_server():
 
                 logger_debug("All server tasks completed")
 
-        # Create the main server task
         self._main_task = self._loop.create_task(run_server())
 
-        # Run the event loop in a separate thread
         def run_loop():
             asyncio.set_event_loop(self._loop)
             try:
                 self._loop.run_until_complete(self._main_task)
+            except RuntimeError as e:
+                # This can happen if the event loop is stopped while futures are pending
+                error_str = str(e)
+                if "Event loop stopped before Future completed" in error_str:
+                    # This is expected during shutdown - ignore it
+                    logger.debug(f"Expected shutdown error: {error_str}")
+                else:
+                    # This is an unexpected RuntimeError - log full details
+                    import traceback
+                    logger.error(f"Event loop error: {error_str}")
+                    logger.error(f"Traceback: {traceback.format_exc()}")
             except Exception as e:
                 logger.error(f"Event loop error: {e}")
             finally:
@@ -634,8 +684,12 @@ def run_loop():
                 for task in pending:
                     task.cancel()
                 if pending:
-                    self._loop.run_until_complete(
-                        asyncio.gather(*pending, return_exceptions=True))
+                    try:
+                        self._loop.run_until_complete(
+                            asyncio.gather(*pending, return_exceptions=True))
+                    except RuntimeError:
+                        # Event loop might already be closed
+                        pass
                 self._loop.close()
 
         self._server_thread = threading.Thread(target=run_loop,
diff --git a/tests/unittest/executor/test_rpc.py b/tests/unittest/executor/test_rpc.py
index b25519ac1a3..bc6e683426f 100644
--- a/tests/unittest/executor/test_rpc.py
+++ b/tests/unittest/executor/test_rpc.py
@@ -140,6 +140,93 @@ def get_task_submitted(self) -> bool:
                 assert client.get_task_submitted().remote()
 
 
+class TestRpcCorrectness:
+
+    class App:
+
+        def incremental_task(self, v: int):
+            return v + 1
+
+        async def incremental_task_async(self, v: int):
+            return v + 1
+
+        async def streaming_task(self, n: int):
+            for i in range(n):
+                yield i
+
+    def test_incremental_task(self):
+        addr = get_unique_ipc_addr()
+        with RpcServerWrapper(TestRpcCorrectness.App(), addr=addr) as server:
+            with RPCClient(addr) as client:
+                for i in range(10000):  # a large number of tasks
+                    result = client.incremental_task(i).remote()
+                    if i % 1000 == 0:
+                        print(f"incremental_task {i} done")
+                    assert result == i + 1, f"result {result} != {i + 1}"
+
+    def test_incremental_task_async(self):
+        addr = get_unique_ipc_addr()
+        with RpcServerWrapper(TestRpcCorrectness.App(), addr=addr) as server:
+            with RPCClient(addr) as client:
+
+                async def test_incremental_task_async():
+                    for i in range(10000):  # a large number of tasks
+                        result = await client.incremental_task_async(
+                            i).remote_async()
+                        if i % 1000 == 0:
+                            print(f"incremental_task_async {i} done")
+                        assert result == i + 1, f"result {result} != {i + 1}"
+
+                asyncio.run(test_incremental_task_async())
+
+    @pytest.mark.skip(reason="This test is flaky, need to fix it")
+    def test_incremental_task_future(self):
+        addr = get_unique_ipc_addr()
+        with RpcServerWrapper(TestRpcCorrectness.App(), addr=addr) as server:
+            # Create client with more workers to handle concurrent futures
+            with RPCClient(addr, num_workers=16) as client:
+                # Process in smaller batches to avoid overwhelming the system
+                batch_size = 50
+                total_tasks = 1000  # Reduced from 10000 for stability
+
+                for batch_start in range(0, total_tasks, batch_size):
+                    batch_end = min(batch_start + batch_size, total_tasks)
+                    futures = []
+
+                    # Create futures for this batch
+                    for i in range(batch_start, batch_end):
+                        futures.append(
+                            client.incremental_task(i).remote_future())
+
+                    # Wait for all futures in this batch to complete
+                    for idx, future in enumerate(futures):
+                        no = batch_start + idx
+                        if no % 100 == 0:
+                            print(f"incremental_task_future {no} done")
+                        assert future.result(
+                        ) == no + 1, f"result {future.result()} != {no + 1}"
+
+    def test_incremental_task_streaming(self):
+        addr = get_unique_ipc_addr()
+        with RpcServerWrapper(TestRpcCorrectness.App(), addr=addr) as server:
+            with RPCClient(addr) as client:
+
+                async def test_streaming_task():
+                    results = []
+                    no = 0
+                    async for result in client.streaming_task(
+                            10000).remote_streaming():
+                        results.append(result)
+                        if no % 1000 == 0:
+                            print(f"streaming_task {no} done")
+                        no += 1
+                    assert results == [
+                        i for i in range(10000)
+                    ], f"results {results} != {[i for i in range(10000)]}"
+
+                asyncio.run(test_streaming_task())
+
+
 class TestRpcError:
 
     class CustomError(Exception):

From 701ff1d975a4a1a7f77019f6aa91d2ef59d3aa51 Mon Sep 17 00:00:00 2001
From: Superjomn <328693+Superjomn@users.noreply.github.com>
Date: Wed, 5 Nov 2025 05:49:23 +0000
Subject: [PATCH 9/9] fix worker

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tests/unittest/executor/test_rpc_worker.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unittest/executor/test_rpc_worker.py b/tests/unittest/executor/test_rpc_worker.py
index e43ff889e50..ab6e183fd12 100644
--- a/tests/unittest/executor/test_rpc_worker.py
+++ b/tests/unittest/executor/test_rpc_worker.py
@@ -11,7 +11,7 @@
 from tensorrt_llm.executor.rpc import RPCClient
 from tensorrt_llm.executor.rpc.rpc_common import get_unique_ipc_addr
 from tensorrt_llm.executor.rpc_worker import RpcWorker
-from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig, TorchLlmArgs
 from tensorrt_llm.llmapi.mpi_session import MpiPoolSession
 from tensorrt_llm.sampling_params import SamplingParams
 
@@ -33,6 +33,7 @@ def setup_method(self):
             tensor_parallel_size=1,
             backend='pytorch',
             enable_iter_perf_stats=True,
+            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5, ),
         )
         self.pool, self.addr = self.create_worker_pool()
         self.client = self.create_rpc_client(self.addr)