Skip to content

Commit 18b76a9

Browse files
committed
Address comments
Signed-off-by: Hui Gao <[email protected]>
1 parent bc94a21 commit 18b76a9

File tree

3 files changed

+51
-53
lines changed

3 files changed

+51
-53
lines changed

tensorrt_llm/_torch/memory_buffer_utils.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
from tensorrt_llm.logger import logger
88

9-
from .utils import get_shared_pool
10-
119

1210
@dataclass
1311
class BufferBlock:
@@ -115,3 +113,52 @@ def get_buffer(self, tensor_shape: list[int], dtype: torch.dtype,
115113
def get_memory_buffers():
    """Return the module-level buffer registry.

    Returns:
        The shared ``_buffer`` object this module uses for buffer
        bookkeeping (see the definitions earlier in the file).
    """
    # Reading a module global needs no `global` declaration.
    return _buffer
116+
117+
118+
# Process-wide shared memory pool used by the allocation helpers below.
# Stays None until set_shared_pool() installs a pool — presumably a CUDA
# memory pool object (see set_shared_pool's docstring); confirm with callers.
_shared_pool = None
119+
120+
121+
def set_shared_pool(shared_pool):
    """Install *shared_pool* as the module-wide allocation pool.

    Subsequent calls to :func:`get_shared_pool` return this object until
    it is replaced by another call here.

    Args:
        shared_pool: A CUDA memory pool object to use for allocations.
    """
    global _shared_pool
    _shared_pool = shared_pool
129+
130+
131+
def get_shared_pool():
    """Return the pool installed via :func:`set_shared_pool`.

    Returns:
        The currently installed memory pool, or ``None`` when no pool
        has been set yet.
    """
    return _shared_pool
138+
139+
140+
@contextlib.contextmanager
def with_shared_pool(shared_pool) -> contextlib.AbstractContextManager:
    """Scope *shared_pool* as the active memory pool for the duration of the block.

    On entry the given pool replaces the currently installed one; on exit the
    previous pool is reinstalled — even when the body raises — which makes this
    safe for temporarily switching pools around CUDA graph operations.

    Args:
        shared_pool: The memory pool to use within the context.

    Yields:
        None

    Example:
        >>> with with_shared_pool(shared_pool):
        ...     # Allocations within this block use shared_pool
        ...     tensor = allocate_buffer(...)
    """
    previous_pool = get_shared_pool()
    set_shared_pool(shared_pool)
    try:
        yield
    finally:
        # Restore unconditionally so an exception in the body cannot leak
        # the temporary pool into later allocations.
        set_shared_pool(previous_pool)

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from ..distributed import MPIDist
3535
from ..distributed.communicator import init_pp_comm
3636
from ..expert_statistic import ExpertStatistic
37+
from ..memory_buffer_utils import with_shared_pool
3738
from ..metadata import KVCacheParams
3839
from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader
3940
from ..models.modeling_multimodal_utils import filter_mm_token_from_input_ids
@@ -48,8 +49,7 @@
4849
from ..speculative.mtp import SampleStateTensorsMTP
4950
from ..utils import (get_model_extra_attrs,
5051
set_per_request_piecewise_cuda_graph_flag,
51-
set_torch_compiling, with_model_extra_attrs,
52-
with_shared_pool)
52+
set_torch_compiling, with_model_extra_attrs)
5353
from .config import PyTorchConfig
5454
from .config_utils import is_mla
5555
from .cuda_graph_runner import CUDAGraphRunner

tensorrt_llm/_torch/utils.py

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -314,52 +314,3 @@ def get_device_uuid(device_idx: int) -> str:
314314
property = torch.cuda.get_device_properties(device_idx)
315315
uuid = "GPU-" + str(property.uuid)
316316
return uuid
317-
318-
319-
# Module-wide memory pool consulted by the helpers below; None until
# set_shared_pool() installs one. NOTE(review): presumably a CUDA memory
# pool object — confirm against callers.
_buffer_pool = None
320-
321-
322-
def set_shared_pool(buffer_pool):
    """Install *buffer_pool* as the module-wide allocation pool.

    Later calls to :func:`get_shared_pool` return this object until it is
    replaced by another call here.

    Args:
        buffer_pool: A CUDA memory pool object to use for allocations.
    """
    global _buffer_pool
    _buffer_pool = buffer_pool
330-
331-
332-
def get_shared_pool():
    """Return the pool installed via :func:`set_shared_pool`.

    Returns:
        The currently installed memory pool, or ``None`` when no pool
        has been set yet.
    """
    return _buffer_pool
339-
340-
341-
@contextlib.contextmanager
def with_shared_pool(buffer_pool) -> contextlib.AbstractContextManager:
    """Temporarily sets a preferred memory pool and restores the previous one on exit.

    This context manager allows temporarily switching to a different memory pool
    for CUDA graph operations, ensuring the original pool is restored even if
    an exception occurs.

    Args:
        buffer_pool: The memory pool to use within the context.

    Yields:
        None

    Example:
        >>> with with_shared_pool(buffer_pool):
        ...     # Allocations within this block use buffer_pool
        ...     tensor = allocate_buffer(...)
    """
    old_buffer_pool = get_shared_pool()
    # Fixed: the original called set_shared_pool(mem_pool), but no `mem_pool`
    # name exists in this scope — the parameter is `buffer_pool` — so entering
    # the context raised NameError. The docstring also documented `mem_pool`.
    set_shared_pool(buffer_pool)
    try:
        yield
    finally:
        # Restore unconditionally so an exception in the body cannot leak
        # the temporary pool into later allocations.
        set_shared_pool(old_buffer_pool)

0 commit comments

Comments (0)