StanfordLegion · eddy16112 · Oct 9, 2025 · Oct 9, 2025 · Oct 9, 2025 · Oct 15, 2025
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -70,7 +70,7 @@ set(REALM_SOURCES
 )
 
 if(REALM_USE_CUDA)
-  list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc)
+  list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc cuda/cuda_hook.cc)
   if(REALM_USE_NVTX)
     list(APPEND REALM_SOURCES nvtx.cc)
   endif()

diff --git a/src/realm/cuda/cuda_hook.cc b/src/realm/cuda/cuda_hook.cc
diff --git a/src/realm/cuda/cuda_hook.h b/src/realm/cuda/cuda_hook.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2025 Stanford University, NVIDIA Corporation
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #ifndef REALM_CUDA_HOOK_H
+ #define REALM_CUDA_HOOK_H
+
+ #include "realm/cuda/cuda_internal.h"
+
+ namespace Realm {
+
+   namespace Cuda {
+
+     class CudaHook {
+     public:
+       CudaHook();
+       ~CudaHook();
+
+       void start_task(GPUStream *current_stream);
+       void end_task(GPUStream *current_task_stream, Processor::TaskFuncID task_id);
+
+     private:
+       CUpti_SubscriberHandle cupti_subscriber;
+     };
+
+   }; // namespace Cuda
+
+ }; // namespace Realm
+
+ #endif
diff --git a/src/realm/cuda/cuda_internal.h b/src/realm/cuda/cuda_internal.h
@@ -1481,7 +1481,10 @@ namespace Realm {
   __op__(cuptiActivityGetNextRecord);                                                    \
   __op__(cuptiActivityRegisterTimestampCallback);                                        \
   __op__(cuptiActivityPushExternalCorrelationId);                                        \
-  __op__(cuptiActivityPopExternalCorrelationId);
+  __op__(cuptiActivityPopExternalCorrelationId);                                         \
+  __op__(cuptiSubscribe);                                                                \
+  __op__(cuptiEnableCallback);                                                           \
+  __op__(cuptiUnsubscribe);
 
 #define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
     CUPTI_APIS(DECL_FNPTR_EXTERN)

diff --git a/src/realm/cuda/cuda_module.cc b/src/realm/cuda/cuda_module.cc
@@ -19,6 +19,7 @@
 #include "realm/cuda/cuda_access.h"
 #include "realm/cuda/cuda_internal.h"
 #include "realm/cuda/cuda_memcpy.h"
+#include "realm/cuda/cuda_hook.h"
 
 #include "realm/tasks.h"
 #include "realm/logging.h"
@@ -93,6 +94,7 @@ namespace Realm {
       } data;
     };
 
+    extern Logger log_cuhook;
     Logger log_gpu("gpu");
     Logger log_gpudma("gpudma");
     Logger log_cudart("cudart");
@@ -108,6 +110,8 @@ namespace Realm {
 
     bool cuda_api_fnptrs_loaded = false;
 
+    static std::unique_ptr<CudaHook> cuda_hook{nullptr};
+
 // Make sure to only use decltype here, to ensure it matches the cuda.h definition
 #define DEFINE_FNPTR(name, ver) decltype(&name) name##_fnptr = 0;
 
@@ -142,19 +146,9 @@ namespace Realm {
     CUPTI_APIS(DEFINE_FNPTR);
 #undef DEFINE_FNPTR
 
-    // function pointers for cuda hook
-    typedef void (*PFN_cuhook_register_callback)(void);
-    typedef void (*PFN_cuhook_start_task)(CUstream current_task_stream);
-    typedef void (*PFN_cuhook_end_task)(CUstream current_task_stream);
-
-    static PFN_cuhook_register_callback cuhook_register_callback_fnptr = nullptr;
-    static PFN_cuhook_start_task cuhook_start_task_fnptr = nullptr;
-    static PFN_cuhook_end_task cuhook_end_task_fnptr = nullptr;
-    static bool cuhook_enabled = false;
-
     namespace ThreadLocal {
-      thread_local GPUStream *current_gpu_stream = 0;
-      thread_local std::set<GPUStream *> *created_gpu_streams = 0;
+      thread_local GPUStream *current_gpu_stream = nullptr;
+      thread_local std::set<GPUStream *> *created_gpu_streams = nullptr;
       static thread_local int context_sync_required = 0;
       thread_local bool block_on_synchronize = false;
     }; // namespace ThreadLocal
@@ -787,8 +781,8 @@ namespace Realm {
       ThreadLocal::current_gpu_stream = s;
       assert(!ThreadLocal::created_gpu_streams);
 
-      if(cuhook_enabled) {
-        cuhook_start_task_fnptr(s->get_stream());
+      if(cuda_hook) {
+        cuda_hook->start_task(s);
       }
 
       // a task can force context sync on task completion either on or off during
@@ -909,8 +903,8 @@ namespace Realm {
       // cuda stream sanity check and clear cuda hook calls
       // we only check against the current_gpu_stream because it is impossible to launch
       // tasks onto other realm gpu streams
-      if(cuhook_enabled) {
-        cuhook_end_task_fnptr(s->get_stream());
+      if(cuda_hook) {
+        cuda_hook->end_task(s, task->func_id);
       }
 
       ThreadLocal::current_gpu_stream = nullptr;
@@ -2584,7 +2578,8 @@ namespace Realm {
           .add_option_int_units("-cuda:hostreg", cfg_hostreg_limit, 'm')
           .add_option_int("-cuda:pageable_access", cfg_pageable_access)
           .add_option_int("-cuda:cupti", cfg_enable_cupti)
-          .add_option_int("-cuda:ipc", cfg_use_cuda_ipc);
+          .add_option_int("-cuda:ipc", cfg_use_cuda_ipc)
+          .add_option_bool("-cuda:cuhook", cfg_enable_cuhook);
 #ifdef REALM_USE_CUDART_HIJACK
       cp.add_option_int("-cuda:nongpusync", Cuda::cudart_hijack_nongpu_sync);
 #endif
@@ -2630,10 +2625,6 @@ namespace Realm {
       delete_container_contents(gpu_info);
       assert(cuda_module_singleton == this);
       cuda_module_singleton = 0;
-      cuhook_register_callback_fnptr = nullptr;
-      cuhook_start_task_fnptr = nullptr;
-      cuhook_end_task_fnptr = nullptr;
-      cuhook_enabled = false;
       delete rh_listener;
     }
 
@@ -3133,6 +3124,10 @@ namespace Realm {
         log_cupti.info() << "Unable to load cupti, gpu timelines may be inaccurate";
       }
 
+      if(m->config->cfg_enable_cuhook && !resolve_cupti_api_fnptrs()) {
+        log_cuhook.info() << "Unable to load cuhook because of missing CUPTI";
+      }
+
       // create GPUInfo
       std::vector<GPUInfo *> infos;
       {
@@ -3602,17 +3597,6 @@ namespace Realm {
       // make sure we hear about any changes to the size of the replicated
       //  heap
       runtime->repl_heap.add_listener(rh_listener);
-#ifdef REALM_USE_LIBDL
-      cuhook_register_callback_fnptr =
-          (PFN_cuhook_register_callback)dlsym(NULL, "cuhook_register_callback");
-      cuhook_start_task_fnptr = (PFN_cuhook_start_task)dlsym(NULL, "cuhook_start_task");
-      cuhook_end_task_fnptr = (PFN_cuhook_end_task)dlsym(NULL, "cuhook_end_task");
-      if(cuhook_register_callback_fnptr && cuhook_start_task_fnptr &&
-         cuhook_end_task_fnptr) {
-        cuhook_register_callback_fnptr();
-        cuhook_enabled = true;
-      }
-#endif
     }
 
     // create any memories provided by this module (default == do nothing)
@@ -3951,18 +3935,22 @@ namespace Realm {
 
       Module::create_dma_channels(runtime);
 
-      if(cupti_api_fnptrs_loaded &&
-         CUPTI_HAS_FNPTR(cuptiActivityPushExternalCorrelationId)) {
-        // Wait until the clock is fully calibrated before we register the timestamp
-        // callback, otherwise cupti will normalize to the wrong timestamp and the GPU
-        // timings will be incorrectly translated
-        CHECK_CUPTI(
-            CUPTI_FNPTR(cuptiActivityRegisterTimestampCallback)(cupti_timestamp_cb));
-        CHECK_CUPTI(CUPTI_FNPTR(cuptiActivityRegisterCallbacks)(
-            cupti_request_buffer_cb, cupti_buffer_complete_cb));
-        CHECK_CUPTI(
-            CUPTI_FNPTR(cuptiActivityEnable)(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION));
-        cupti_api_initialized = true;
+      if(cupti_api_fnptrs_loaded) {
+         if (config->cfg_enable_cupti && CUPTI_HAS_FNPTR(cuptiActivityPushExternalCorrelationId)) {
+          // Wait until the clock is fully calibrated before we register the timestamp
+          // callback, otherwise cupti will normalize to the wrong timestamp and the GPU
+          // timings will be incorrectly translated
+          CHECK_CUPTI(
+              CUPTI_FNPTR(cuptiActivityRegisterTimestampCallback)(cupti_timestamp_cb));
+          CHECK_CUPTI(CUPTI_FNPTR(cuptiActivityRegisterCallbacks)(
+              cupti_request_buffer_cb, cupti_buffer_complete_cb));
+          CHECK_CUPTI(
+              CUPTI_FNPTR(cuptiActivityEnable)(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION));
+          cupti_api_initialized = true;
+        }
+        if (config->cfg_enable_cuhook && CUPTI_HAS_FNPTR(cuptiSubscribe)) {
+          cuda_hook = std::make_unique<CudaHook>();
+        }
       }
     }
 

diff --git a/src/realm/cuda/cuda_module.h b/src/realm/cuda/cuda_module.h
@@ -105,6 +105,7 @@ namespace Realm {
     struct GPUInfo;
     class GPUZCMemory;
     class GPUReplHeapListener;
+    class CudaHook;
 
     class CudaModuleConfig : public ModuleConfig {
       friend class CudaModule;
@@ -144,6 +145,7 @@ namespace Realm {
       bool cfg_use_cuda_ipc = true;
       int cfg_pageable_access = 0;
       bool cfg_enable_cupti = false;
+      bool cfg_enable_cuhook = false;
 
       // resources
       bool resource_discovered = false;

diff --git a/tests/test_cuhook.cc b/tests/test_cuhook.cc
@@ -50,12 +50,9 @@ void gpu_task(const void *args, size_t arglen, const void *userdata, size_t user
   cudaFree(ptr);
   cudaStream_t stream;
 
-#ifdef REALM_USE_CUDART_HIJACK
-  cudaStreamCreate(&stream);
-#else
   stream = Cuda::get_task_cuda_stream();
   Cuda::set_task_ctxsync_required(false);
-#endif
+  
   gpu_kernel_wrapper(stream);
   // cudaStreamSynchronize(stream);
 }