Commit 0713ec6

Add comments for new code
Signed-off-by: Hui Gao <[email protected]>
1 parent 510eb15

4 files changed: 46 additions, 11 deletions

tensorrt_llm/_torch/memory_buffer_utils.py

Lines changed: 6 additions & 4 deletions
@@ -6,7 +6,7 @@
 
 from tensorrt_llm.logger import logger
 
-from .utils import get_graph_pool
+from .utils import get_shared_pool
 
 
 @dataclass
@@ -86,14 +86,16 @@ def get_buffer(self, tensor_shape: list[int], dtype: torch.dtype,
         # The new buffer is created with uint8 to represent raw bytes.
         new_buffer_tensor = None
         try:
-            with torch.cuda.memory.use_mem_pool(get_graph_pool()):
+            with torch.cuda.memory.use_mem_pool(get_shared_pool()):
                 new_buffer_tensor = torch.zeros((required_memory_size, ),
                                                 device='cuda',
                                                 dtype=torch.uint8)
-        except Exception:
+        except Exception as ex:
             # Need to check if this is an OOM exception
             logger.debug(
-                f"Exception happened to create tensor from given memory pool")
+                f"Exception happened while creating tensor from the given memory pool: {str(ex)}"
+            )
+            # If allocating from the shared pool fails, fall back to the default CUDA memory pool.
             new_buffer_tensor = torch.zeros((required_memory_size, ),
                                             device='cuda',
                                             dtype=torch.uint8)
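
Note: the hunk above implements a try-the-pool-then-fall-back allocation pattern. Below is a self-contained sketch of that pattern under stated assumptions: allocate_with_fallback is a hypothetical helper (not part of this commit), and torch.cuda.MemPool plus torch.cuda.memory.use_mem_pool require a recent PyTorch (>= 2.5) with a CUDA device available.

import torch

def allocate_with_fallback(num_bytes: int, pool) -> torch.Tensor:
    # Prefer the shared pool when one is set; otherwise (or on failure)
    # allocate from the default CUDA caching allocator, mirroring the
    # except branch in the diff above.
    if pool is not None:
        try:
            with torch.cuda.memory.use_mem_pool(pool):
                return torch.zeros((num_bytes, ), device='cuda', dtype=torch.uint8)
        except Exception as ex:
            print(f"Pool allocation failed, falling back to default allocator: {ex}")
    return torch.zeros((num_bytes, ), device='cuda', dtype=torch.uint8)

shared_pool = torch.cuda.MemPool()  # assumed API, PyTorch >= 2.5
buf = allocate_with_fallback(1 << 20, shared_pool)  # 1 MiB of raw bytes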

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 6 additions & 0 deletions
@@ -195,6 +195,12 @@ def needs_capture(self, key: Tuple[int, int, int]):
         return key not in self.graph_outputs
 
     def get_graph_pool(self):
+        """Returns the CUDA memory pool used by this graph runner.
+
+        Returns:
+            The CUDA memory pool associated with captured graphs, or None if
+            no graphs have been captured yet.
+        """
         return self.memory_pool
 
     def capture(self,
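
As a rough illustration of where such a pool handle comes from (not this runner's actual capture path), PyTorch's public API exposes the memory pool of a captured graph via CUDAGraph.pool(); a minimal sketch:

import torch

g = torch.cuda.CUDAGraph()
static_x = torch.randn(8, device='cuda')

# Warm-up pass on a side stream before capture (standard CUDA graph practice).
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_y = static_x * 2
torch.cuda.current_stream().wait_stream(s)

with torch.cuda.graph(g):
    static_y = static_x * 2

# Opaque handle to the pool backing the captured graph; a runner would cache
# this and return it from get_graph_pool(), or None before any capture.
memory_pool = g.pool()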

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 2 deletions
@@ -48,7 +48,7 @@
 from ..speculative.mtp import SampleStateTensorsMTP
 from ..utils import (get_model_extra_attrs,
                      set_per_request_piecewise_cuda_graph_flag,
-                     set_prefer_mem_pool, set_torch_compiling,
+                     set_shared_mem_pool, set_torch_compiling,
                      with_model_extra_attrs)
 from .config import PyTorchConfig
 from .config_utils import is_mla
@@ -2187,7 +2187,7 @@ def forward(
                 new_tensors_device, cache_indirection_buffer)
 
         self.iter_counter += 1
-        with set_prefer_mem_pool(self.cuda_graph_runner.get_graph_pool()):
+        with set_shared_mem_pool(self.cuda_graph_runner.get_graph_pool()):
             if not maybe_graph:
                 # Fallback to eager execution if graph was not used
                 with MoeLoadBalancerIterContext(moe_load_balancer):
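
The call-site change is mechanical: forward() now scopes the runner's graph pool via the renamed context manager so that downstream buffer requests (like get_buffer above) prefer it. A minimal sketch of the pattern, with runner and run_model as hypothetical stand-ins:

from tensorrt_llm._torch.utils import set_shared_mem_pool

def forward_step(runner, run_model):
    # get_graph_pool() may return None before the first graph capture;
    # get_buffer() then simply falls back to the default allocator.
    with set_shared_mem_pool(runner.get_graph_pool()):
        return run_model()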

tensorrt_llm/_torch/utils.py

Lines changed: 32 additions & 5 deletions
@@ -317,20 +317,47 @@ def _get_allow_chain_drafter() -> bool:
 _buffer_pool = None
 
 
-def set_mem_pool(buffer_pool):
+def set_shared_pool(buffer_pool):
+    """Sets the global memory pool for buffer allocation.
+
+    Args:
+        buffer_pool: A CUDA memory pool object to use for allocations.
+    """
     global _buffer_pool
     _buffer_pool = buffer_pool
 
 
-def get_graph_pool():
+def get_shared_pool():
+    """Retrieves the current global memory pool.
+
+    Returns:
+        The current memory pool, or None if not set.
+    """
     global _buffer_pool
     return _buffer_pool
 
 
 @contextlib.contextmanager
-def set_prefer_mem_pool(mem_pool):
-    old_buffer_pool = get_graph_pool()
-    set_mem_pool(mem_pool)
+def set_shared_mem_pool(mem_pool) -> contextlib.AbstractContextManager:
+    """Temporarily sets a preferred memory pool and restores the previous one on exit.
+
+    This context manager allows temporarily switching to a different memory pool
+    for CUDA graph operations, ensuring the original pool is restored even if
+    an exception occurs.
+
+    Args:
+        mem_pool: The memory pool to use within the context.
+
+    Yields:
+        None
+
+    Example:
+        >>> with set_shared_mem_pool(graph_pool):
+        ...     # Allocations within this block use graph_pool
+        ...     tensor = allocate_buffer(...)
+    """
+    old_buffer_pool = get_shared_pool()
+    set_shared_pool(mem_pool)
     try:
         yield
     finally:
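
The hunk's trailing context ends inside the try/finally, but the restore-on-exception behavior the docstring promises can be demonstrated in isolation. A runnable sketch that re-implements the trio, with plain strings standing in for pool objects:

import contextlib

_pool = None

def set_shared_pool(p):
    global _pool
    _pool = p

def get_shared_pool():
    return _pool

@contextlib.contextmanager
def set_shared_mem_pool(p):
    old = get_shared_pool()
    set_shared_pool(p)
    try:
        yield
    finally:
        set_shared_pool(old)  # restored even if the body raised

set_shared_pool("default")
try:
    with set_shared_mem_pool("graph-pool"):
        assert get_shared_pool() == "graph-pool"
        raise RuntimeError("simulated failure")
except RuntimeError:
    pass
assert get_shared_pool() == "default"  # previous pool restored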
