add flag for EPLB to force using GDRCopy

dongxuy04 · dongxuy04 · commit 198e041bd1ea · 2025-10-24T09:50:44.000+08:00
Signed-off-by: Dongxu Yang <78518666+dongxuy04@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp
@@ -450,4 +450,9 @@ bool getEnvDisableChunkedAttentionInGenPhase()
     return getBoolEnv("TRTLLM_DISABLE_CHUNKED_ATTENTION_IN_GEN_PHASE");
 }
 
+bool getEnvEplbForceGdrcopy() // Whether EPLB is forced to use GDRCopy even if GPU memory is a NUMA node.
+{
+    return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY"); // flag parsing is delegated to getBoolEnv
+}
+
 } // namespace tensorrt_llm::common
diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h
@@ -136,4 +136,6 @@ bool getEnvDisaggBenchmarkGenOnly();
 // Whether to disable the chunked-attention in the generation phase.
 bool getEnvDisableChunkedAttentionInGenPhase();
 
+bool getEnvEplbForceGdrcopy(); // Whether EPLB is forced to use GDRCopy (TRTLLM_EPLB_FORCE_GDRCOPY).
+
 } // namespace tensorrt_llm::common
diff --git a/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp b/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
@@ -27,6 +27,7 @@
 #include "topologyDetector.h"
 
 #include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/common/logger.h"
 
 namespace tensorrt_llm::runtime
@@ -169,7 +170,8 @@ bool HostAccessibleDeviceAllocator::mAllowManagedFallback = false;
 
 bool HostAccessibleDeviceAllocator::isSupported()
 {
-    if (TopologyDetector::getInstance().getCurrentGpuMemoryNumaId() >= 0)
+    if (!tensorrt_llm::common::getEnvEplbForceGdrcopy()
+        && TopologyDetector::getInstance().getCurrentGpuMemoryNumaId() >= 0)
     {
         // we are on systems that GPU memory is also a NUMA node.
         return true;
@@ -195,7 +197,16 @@ void HostAccessibleDeviceAllocator::init()
     }
 
     TLLM_CUDA_CHECK(cudaGetDevice(&mDevId));
-    mGpuMemNumaId = TopologyDetector::getInstance().getCurrentGpuMemoryNumaId();
+    if (tensorrt_llm::common::getEnvEplbForceGdrcopy())
+    {
+        mGpuMemNumaId = -1;
+        TLLM_LOG_INFO("Force using GDRCopy for EPLB, ignore NUMA node for GPU memory.");
+    }
+    else
+    {
+        mGpuMemNumaId = TopologyDetector::getInstance().getCurrentGpuMemoryNumaId();
+    }
+
     if (mGpuMemNumaId < 0)
     {
         // We only use GDRCopy when there is no NUMA node for GPU memory.

Original file line number	Diff line number	Diff line change
`@@ -450,4 +450,9 @@ bool getEnvDisableChunkedAttentionInGenPhase()`
`450`	`450`	`return getBoolEnv("TRTLLM_DISABLE_CHUNKED_ATTENTION_IN_GEN_PHASE");`
`451`	`451`	`}`
`452`	`452`
	`453`	`+bool getEnvEplbForceGdrcopy()`
	`454`	`+{`
	`455`	`+ return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");`
	`456`	`+}`
	`457`	`+`
`453`	`458`	`} // namespace tensorrt_llm::common`