Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ set(REALM_SOURCES
)

if(REALM_USE_CUDA)
list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc)
list(APPEND REALM_SOURCES cuda/cuda_module.cc cuda/cuda_internal.cc cuda/cuda_access.cc cuda/cuda_hook.cc)
if(REALM_USE_NVTX)
list(APPEND REALM_SOURCES nvtx.cc)
endif()
Expand Down
1,244 changes: 164 additions & 1,080 deletions src/realm/cuda/cuda_hook.cc

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions src/realm/cuda/cuda_hook.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright 2025 Stanford University, NVIDIA Corporation
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef REALM_CUDA_HOOK_H
#define REALM_CUDA_HOOK_H

#include "realm/cuda/cuda_internal.h"

namespace Realm {

namespace Cuda {

class CudaHook {
public:
CudaHook();
~CudaHook();

void start_task(GPUStream *current_stream);
void end_task(GPUStream *current_task_stream, Processor::TaskFuncID task_id);

private:
CUpti_SubscriberHandle cupti_subscriber;
};

}; // namespace Cuda

}; // namespace Realm

#endif
5 changes: 4 additions & 1 deletion src/realm/cuda/cuda_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1481,7 +1481,10 @@ namespace Realm {
__op__(cuptiActivityGetNextRecord); \
__op__(cuptiActivityRegisterTimestampCallback); \
__op__(cuptiActivityPushExternalCorrelationId); \
__op__(cuptiActivityPopExternalCorrelationId);
__op__(cuptiActivityPopExternalCorrelationId); \
__op__(cuptiSubscribe); \
__op__(cuptiEnableCallback); \
__op__(cuptiUnsubscribe);

#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
CUPTI_APIS(DECL_FNPTR_EXTERN)
Expand Down
76 changes: 32 additions & 44 deletions src/realm/cuda/cuda_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "realm/cuda/cuda_access.h"
#include "realm/cuda/cuda_internal.h"
#include "realm/cuda/cuda_memcpy.h"
#include "realm/cuda/cuda_hook.h"

#include "realm/tasks.h"
#include "realm/logging.h"
Expand Down Expand Up @@ -93,6 +94,7 @@ namespace Realm {
} data;
};

extern Logger log_cuhook;
Logger log_gpu("gpu");
Logger log_gpudma("gpudma");
Logger log_cudart("cudart");
Expand All @@ -108,6 +110,8 @@ namespace Realm {

bool cuda_api_fnptrs_loaded = false;

static std::unique_ptr<CudaHook> cuda_hook{nullptr};

// Make sure to only use decltype here, to ensure it matches the cuda.h definition
#define DEFINE_FNPTR(name, ver) decltype(&name) name##_fnptr = 0;

Expand Down Expand Up @@ -142,19 +146,9 @@ namespace Realm {
CUPTI_APIS(DEFINE_FNPTR);
#undef DEFINE_FNPTR

// function pointers for cuda hook
typedef void (*PFN_cuhook_register_callback)(void);
typedef void (*PFN_cuhook_start_task)(CUstream current_task_stream);
typedef void (*PFN_cuhook_end_task)(CUstream current_task_stream);

static PFN_cuhook_register_callback cuhook_register_callback_fnptr = nullptr;
static PFN_cuhook_start_task cuhook_start_task_fnptr = nullptr;
static PFN_cuhook_end_task cuhook_end_task_fnptr = nullptr;
static bool cuhook_enabled = false;

namespace ThreadLocal {
thread_local GPUStream *current_gpu_stream = 0;
thread_local std::set<GPUStream *> *created_gpu_streams = 0;
thread_local GPUStream *current_gpu_stream = nullptr;
thread_local std::set<GPUStream *> *created_gpu_streams = nullptr;
static thread_local int context_sync_required = 0;
thread_local bool block_on_synchronize = false;
}; // namespace ThreadLocal
Expand Down Expand Up @@ -787,8 +781,8 @@ namespace Realm {
ThreadLocal::current_gpu_stream = s;
assert(!ThreadLocal::created_gpu_streams);

if(cuhook_enabled) {
cuhook_start_task_fnptr(s->get_stream());
if(cuda_hook) {
cuda_hook->start_task(s);
}

// a task can force context sync on task completion either on or off during
Expand Down Expand Up @@ -909,8 +903,8 @@ namespace Realm {
// cuda stream sanity check and clear cuda hook calls
// we only check against the current_gpu_stream because it is impossible to launch
// tasks onto other realm gpu streams
if(cuhook_enabled) {
cuhook_end_task_fnptr(s->get_stream());
if(cuda_hook) {
cuda_hook->end_task(s, task->func_id);
}

ThreadLocal::current_gpu_stream = nullptr;
Expand Down Expand Up @@ -2584,7 +2578,8 @@ namespace Realm {
.add_option_int_units("-cuda:hostreg", cfg_hostreg_limit, 'm')
.add_option_int("-cuda:pageable_access", cfg_pageable_access)
.add_option_int("-cuda:cupti", cfg_enable_cupti)
.add_option_int("-cuda:ipc", cfg_use_cuda_ipc);
.add_option_int("-cuda:ipc", cfg_use_cuda_ipc)
.add_option_bool("-cuda:cuhook", cfg_enable_cuhook);
#ifdef REALM_USE_CUDART_HIJACK
cp.add_option_int("-cuda:nongpusync", Cuda::cudart_hijack_nongpu_sync);
#endif
Expand Down Expand Up @@ -2630,10 +2625,6 @@ namespace Realm {
delete_container_contents(gpu_info);
assert(cuda_module_singleton == this);
cuda_module_singleton = 0;
cuhook_register_callback_fnptr = nullptr;
cuhook_start_task_fnptr = nullptr;
cuhook_end_task_fnptr = nullptr;
cuhook_enabled = false;
delete rh_listener;
}

Expand Down Expand Up @@ -3133,6 +3124,10 @@ namespace Realm {
log_cupti.info() << "Unable to load cupti, gpu timelines may be inaccurate";
}

if(m->config->cfg_enable_cuhook && !resolve_cupti_api_fnptrs()) {
log_cuhook.info() << "Unable to load cuhook because of missing CUPTI";
}

// create GPUInfo
std::vector<GPUInfo *> infos;
{
Expand Down Expand Up @@ -3602,17 +3597,6 @@ namespace Realm {
// make sure we hear about any changes to the size of the replicated
// heap
runtime->repl_heap.add_listener(rh_listener);
#ifdef REALM_USE_LIBDL
cuhook_register_callback_fnptr =
(PFN_cuhook_register_callback)dlsym(NULL, "cuhook_register_callback");
cuhook_start_task_fnptr = (PFN_cuhook_start_task)dlsym(NULL, "cuhook_start_task");
cuhook_end_task_fnptr = (PFN_cuhook_end_task)dlsym(NULL, "cuhook_end_task");
if(cuhook_register_callback_fnptr && cuhook_start_task_fnptr &&
cuhook_end_task_fnptr) {
cuhook_register_callback_fnptr();
cuhook_enabled = true;
}
#endif
}

// create any memories provided by this module (default == do nothing)
Expand Down Expand Up @@ -3951,18 +3935,22 @@ namespace Realm {

Module::create_dma_channels(runtime);

if(cupti_api_fnptrs_loaded &&
CUPTI_HAS_FNPTR(cuptiActivityPushExternalCorrelationId)) {
// Wait until the clock is fully calibrated before we register the timestamp
// callback, otherwise cupti will normalize to the wrong timestamp and the GPU
// timings will be incorrectly translated
CHECK_CUPTI(
CUPTI_FNPTR(cuptiActivityRegisterTimestampCallback)(cupti_timestamp_cb));
CHECK_CUPTI(CUPTI_FNPTR(cuptiActivityRegisterCallbacks)(
cupti_request_buffer_cb, cupti_buffer_complete_cb));
CHECK_CUPTI(
CUPTI_FNPTR(cuptiActivityEnable)(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION));
cupti_api_initialized = true;
if(cupti_api_fnptrs_loaded) {
if (config->cfg_enable_cupti && CUPTI_HAS_FNPTR(cuptiActivityPushExternalCorrelationId)) {
// Wait until the clock is fully calibrated before we register the timestamp
// callback, otherwise cupti will normalize to the wrong timestamp and the GPU
// timings will be incorrectly translated
CHECK_CUPTI(
CUPTI_FNPTR(cuptiActivityRegisterTimestampCallback)(cupti_timestamp_cb));
CHECK_CUPTI(CUPTI_FNPTR(cuptiActivityRegisterCallbacks)(
cupti_request_buffer_cb, cupti_buffer_complete_cb));
CHECK_CUPTI(
CUPTI_FNPTR(cuptiActivityEnable)(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION));
cupti_api_initialized = true;
}
if (config->cfg_enable_cuhook && CUPTI_HAS_FNPTR(cuptiSubscribe)) {
cuda_hook = std::make_unique<CudaHook>();
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/realm/cuda/cuda_module.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ namespace Realm {
struct GPUInfo;
class GPUZCMemory;
class GPUReplHeapListener;
class CudaHook;

class CudaModuleConfig : public ModuleConfig {
friend class CudaModule;
Expand Down Expand Up @@ -144,6 +145,7 @@ namespace Realm {
bool cfg_use_cuda_ipc = true;
int cfg_pageable_access = 0;
bool cfg_enable_cupti = false;
bool cfg_enable_cuhook = false;

// resources
bool resource_discovered = false;
Expand Down
5 changes: 1 addition & 4 deletions tests/test_cuhook.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,9 @@ void gpu_task(const void *args, size_t arglen, const void *userdata, size_t user
cudaFree(ptr);
cudaStream_t stream;

#ifdef REALM_USE_CUDART_HIJACK
cudaStreamCreate(&stream);
#else
stream = Cuda::get_task_cuda_stream();
Cuda::set_task_ctxsync_required(false);
#endif
gpu_kernel_wrapper(stream);
// cudaStreamSynchronize(stream);
}
Expand Down
Loading