diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7f4c0fbf..ff1f90239 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -203,16 +203,11 @@ if (UMPIRE_ENABLE_TESTS)
   add_subdirectory(tests)
 endif ()
 
-if (UMPIRE_ENABLE_DEVELOPER_BENCHMARKS)
+if (UMPIRE_ENABLE_BENCHMARKS)
   add_subdirectory(benchmarks)
   if ((NOT CMAKE_BUILD_TYPE) OR (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Release"))
     message("-- Warning: CMAKE_BUILD_TYPE not set to Release, benchmark information will not be reliable for this build!")
   endif()
-else()
-  if (UMPIRE_ENABLE_BENCHMARKS)
-    message("-- Warning: Benchmarks will not be built. If you want to build with benchmarks,\n"
-            "   set UMPIRE_ENABLE_DEVELOPER_BENCHMARKS to On.")
-  endif()
 endif ()
 
 if (UMPIRE_ENABLE_EXAMPLES)
diff --git a/Dockerfile b/Dockerfile
index 9a2f159ba..a41bb2169 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,8 +38,8 @@ ENV GTEST_COLOR=1
 COPY . /home/umpire/workspace
 WORKDIR /home/umpire/workspace/build
 RUN cmake -DUMPIRE_ENABLE_DEVELOPER_DEFAULTS=On -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang \
-    -DUMPIRE_ENABLE_C=On -DCMAKE_CXX_FLAGS="-fsanitize=address" -DENABLE_TESTS=On -DUMPIRE_ENABLE_TOOLS=On \
-    -DUMPIRE_ENABLE_ASAN=On -DUMPIRE_ENABLE_SANITIZER_TESTS=On .. && \
+    -DUMPIRE_ENABLE_C=On -DCMAKE_CXX_FLAGS="-fsanitize=address" -DENABLE_TESTS=On -DUMPIRE_ENABLE_TOOLS=On \
+    -DUMPIRE_ENABLE_ASAN=On -DUMPIRE_ENABLE_SANITIZER_TESTS=On .. && \
     make -j 2 && \
     ctest -T test -E operation_tests --output-on-failure
 
@@ -48,7 +48,7 @@ ENV GTEST_COLOR=1
 COPY . /home/umpire/workspace
 WORKDIR /home/umpire/workspace/build
 RUN cmake -DUMPIRE_ENABLE_DEVELOPER_DEFAULTS=On -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=70 .. && \
-    make -j 16
+    make -j 8
 
 # TODO: switch to ROCM 6
 FROM ghcr.io/llnl/radiuss:hip-5.6.1-ubuntu-20.04 AS hip
diff --git a/Makefile b/Makefile
index 405126486..fabf48bae 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ else
 DebugArgs=
 endif
 
-targets = asan clang10 clang11 clang12 clang13 gcc11 gcc7 gcc8 gcc9 hip hip.debug nvcc10 sycl umap_build
+targets = gcc clang umap_build asan cuda hip sycl intel
 
 $(targets):
 	DOCKER_BUILDKIT=1 docker build --target $@ --no-cache $(DebugArgs) .
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index cdbe9c3c4..f16242a94 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -88,64 +88,71 @@ blt_add_target_compile_flags(
   TO pool_stress_test
   FLAGS ${UMPIRE_DISABLE_DEPRECATED_WARNINGS_FLAG})
 
-if (UMPIRE_ENABLE_BENCHMARKS)
-  set (benchmark_depends gbenchmark umpire)
+set (benchmark_depends gbenchmark umpire)
 
-  if (UMPIRE_ENABLE_OPENMP_TARGET)
-    set (benchmark_depends
-      ${benchmark_depends}
-      openmp)
-  endif()
+if (UMPIRE_ENABLE_OPENMP_TARGET)
+  set (benchmark_depends
+    ${benchmark_depends}
+    openmp)
+endif()
 
-  blt_add_executable(
-    NAME allocator_benchmarks
-    SOURCES allocator_benchmarks.cpp
-    DEPENDS_ON ${benchmark_depends})
+blt_add_executable(
+  NAME allocator_benchmarks
+  SOURCES allocator_benchmarks.cpp
+  DEPENDS_ON ${benchmark_depends})
 
-  blt_add_benchmark(
-    NAME allocator_benchmarks
-    COMMAND allocator_benchmarks)
+blt_add_benchmark(
+  NAME allocator_benchmarks
+  COMMAND allocator_benchmarks)
 
-  blt_add_executable(
-    NAME vendor_allocator_benchmarks
-    SOURCES vendor_allocator_benchmarks.cpp
-    DEPENDS_ON ${benchmark_depends})
+blt_add_executable(
+  NAME vendor_allocator_benchmarks
+  SOURCES vendor_allocator_benchmarks.cpp
+  DEPENDS_ON ${benchmark_depends})
+
+blt_add_benchmark(
+  NAME vendor_allocator_benchmarks
+  COMMAND vendor_allocator_benchmarks)
+
+blt_add_executable(
+  NAME debuglog_benchmarks
+  SOURCES debuglog_benchmarks.cpp
+  DEPENDS_ON ${benchmark_depends})
 
-  blt_add_benchmark(
-    NAME vendor_allocator_benchmarks
-    COMMAND vendor_allocator_benchmarks)
+if (UMPIRE_ENABLE_OPENMP)
+  set (benchmark_depends
+    ${benchmark_depends}
+    openmp)
 
   blt_add_executable(
-    NAME debuglog_benchmarks
-    SOURCES debuglog_benchmarks.cpp
+    NAME file_resource_benchmarks
+    SOURCES file_resource_benchmarks.cpp
     DEPENDS_ON ${benchmark_depends})
+endif()
 
-  if (UMPIRE_ENABLE_OPENMP)
-    set (benchmark_depends
-      ${benchmark_depends}
-      openmp)
+blt_add_executable(
+  NAME copy_benchmarks
+  SOURCES copy_benchmarks.cpp
+  DEPENDS_ON ${benchmark_depends})
 
-    blt_add_executable(
-      NAME file_resource_benchmarks
-      SOURCES file_resource_benchmarks.cpp
-      DEPENDS_ON ${benchmark_depends})
-  endif()
+blt_add_benchmark(
+  NAME copy_benchmarks
+  COMMAND copy_benchmarks)
 
-  blt_add_executable(
-    NAME copy_benchmarks
-    SOURCES copy_benchmarks.cpp
-    DEPENDS_ON ${benchmark_depends})
+blt_add_executable(
+  NAME inspector_benchmarks
+  SOURCES inspector_benchmarks.cpp
+  DEPENDS_ON ${benchmark_depends})
 
-  blt_add_benchmark(
-    NAME copy_benchmarks
-    COMMAND copy_benchmarks)
+blt_add_benchmark(
+  NAME inspector_benchmarks
+  COMMAND inspector_benchmarks)
 
-  blt_add_executable(
-    NAME inspector_benchmarks
-    SOURCES inspector_benchmarks.cpp
-    DEPENDS_ON ${benchmark_depends})
+blt_add_executable(
+  NAME copy_performance_benchmark
+  SOURCES op/copy_performance_benchmark.cpp
+  DEPENDS_ON ${benchmark_depends})
 
-  blt_add_benchmark(
-    NAME inspector_benchmarks
-    COMMAND inspector_benchmarks)
-endif()
+blt_add_benchmark(
+  NAME copy_performance_benchmark
+  COMMAND copy_performance_benchmark)
diff --git a/benchmarks/op/copy_performance_benchmark.cpp b/benchmarks/op/copy_performance_benchmark.cpp
new file mode 100644
index 000000000..690c737f1
--- /dev/null
+++ b/benchmarks/op/copy_performance_benchmark.cpp
@@ -0,0 +1,365 @@
+//////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC and Umpire
+// project contributors. See the COPYRIGHT file for details.
+//
+// SPDX-License-Identifier: (MIT)
+//////////////////////////////////////////////////////////////////////////////
+
+#include <cstring>
+
+#include "benchmark/benchmark.h"
+
+#include "umpire/ResourceManager.hpp"
+#include "umpire/Allocator.hpp"
+#include "umpire/op.hpp"
+
+constexpr int MIN_SIZE = 64;       // 64 bytes
+constexpr int MAX_SIZE = 1048576;  // 1 MB
+constexpr int MULTIPLIER = 2;
+
+//==============================================================================
+// Benchmark 1: Original ResourceManager copy (legacy approach)
+//==============================================================================
+
+static void BM_ResourceManager_Copy(benchmark::State& state, const std::string& src_name, const std::string& dst_name) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator(src_name);
+  auto dst_allocator = rm.getAllocator(dst_name);
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data (rm.memset works for host and device allocations)
+  rm.memset(src_ptr, 0xAA);
+
+  for (auto _ : state) {
+    // Original ResourceManager approach (destination pointer comes first)
+    rm.copy(dst_ptr, src_ptr, size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+
+//==============================================================================
+// Benchmark 2: Runtime-dispatch copy (new operation system v2)
+//==============================================================================
+
+static void BM_RuntimeDispatch_Copy(benchmark::State& state, const std::string& src_name, const std::string& dst_name) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator(src_name);
+  auto dst_allocator = rm.getAllocator(dst_name);
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data (runtime-dispatch memset handles host and device)
+  umpire::memset(src_ptr, 0xBB, size);
+
+  for (auto _ : state) {
+    // Runtime dispatch - auto-detects platform from pointers
+    umpire::copy(src_ptr, dst_ptr, size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+
+//==============================================================================
+// Benchmark 3: Compile-time dispatch copy (zero-overhead direct calls)
+//==============================================================================
+
+// Host to Host
+static void BM_CompileTimeDispatch_Copy_Host_Host(benchmark::State& state) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator("HOST");
+  auto dst_allocator = rm.getAllocator("HOST");
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data
+  std::memset(src_ptr, 0xCC, size);
+
+  for (auto _ : state) {
+    // Compile-time dispatch - explicit platform specification
+    umpire::copy<umpire::resource::host_platform, umpire::resource::host_platform>(
+        static_cast<char*>(src_ptr),
+        static_cast<char*>(dst_ptr),
+        size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+
+#if defined(UMPIRE_ENABLE_CUDA)
+// Host to CUDA Device
+static void BM_CompileTimeDispatch_Copy_Host_Cuda(benchmark::State& state) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator("HOST");
+  auto dst_allocator = rm.getAllocator("DEVICE");
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data
+  std::memset(src_ptr, 0xDD, size);
+
+  for (auto _ : state) {
+    // Compile-time dispatch - explicit platform specification
+    umpire::copy<umpire::resource::host_platform, umpire::resource::cuda_platform>(
+        static_cast<char*>(src_ptr),
+        static_cast<char*>(dst_ptr),
+        size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+
+// CUDA Device to Host
+static void BM_CompileTimeDispatch_Copy_Cuda_Host(benchmark::State& state) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator("DEVICE");
+  auto dst_allocator = rm.getAllocator("HOST");
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data on device
+  unsigned char pattern = 0xEE;
+  umpire::memset(src_ptr, pattern, size);
+
+  for (auto _ : state) {
+    // Compile-time dispatch - explicit platform specification
+    umpire::copy<umpire::resource::cuda_platform, umpire::resource::host_platform>(
+        static_cast<char*>(src_ptr),
+        static_cast<char*>(dst_ptr),
+        size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+
+// CUDA Device to CUDA Device
+static void BM_CompileTimeDispatch_Copy_Cuda_Cuda(benchmark::State& state) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator("DEVICE");
+  auto dst_allocator = rm.getAllocator("DEVICE");
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data on device
+  unsigned char pattern = 0xFF;
+  umpire::memset(src_ptr, pattern, size);
+
+  for (auto _ : state) {
+    // Compile-time dispatch - explicit platform specification
+    umpire::copy<umpire::resource::cuda_platform, umpire::resource::cuda_platform>(
+        static_cast<char*>(src_ptr),
+        static_cast<char*>(dst_ptr),
+        size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+#endif // UMPIRE_ENABLE_CUDA
+
+#if defined(UMPIRE_ENABLE_HIP)
+// Host to HIP Device
+static void BM_CompileTimeDispatch_Copy_Host_Hip(benchmark::State& state) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator("HOST");
+  auto dst_allocator = rm.getAllocator("DEVICE");
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data
+  std::memset(src_ptr, 0x11, size);
+
+  for (auto _ : state) {
+    // Compile-time dispatch - explicit platform specification
+    umpire::copy<umpire::resource::host_platform, umpire::resource::hip_platform>(
+        static_cast<char*>(src_ptr),
+        static_cast<char*>(dst_ptr),
+        size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+
+// HIP Device to Host
+static void BM_CompileTimeDispatch_Copy_Hip_Host(benchmark::State& state) {
+  auto& rm = umpire::ResourceManager::getInstance();
+
+  auto src_allocator = rm.getAllocator("DEVICE");
+  auto dst_allocator = rm.getAllocator("HOST");
+
+  const std::size_t size = state.range(0);
+
+  void* src_ptr = src_allocator.allocate(size);
+  void* dst_ptr = dst_allocator.allocate(size);
+
+  // Initialize source data on device
+  unsigned char pattern = 0x22;
+  umpire::memset(src_ptr, pattern, size);
+
+  for (auto _ : state) {
+    // Compile-time dispatch - explicit platform specification
+    umpire::copy<umpire::resource::hip_platform, umpire::resource::host_platform>(
+        static_cast<char*>(src_ptr),
+        static_cast<char*>(dst_ptr),
+        size);
+    benchmark::DoNotOptimize(dst_ptr);
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * static_cast<int64_t>(size));
+
+  src_allocator.deallocate(src_ptr);
+  dst_allocator.deallocate(dst_ptr);
+}
+#endif // UMPIRE_ENABLE_HIP
+
+//==============================================================================
+// Benchmark Registration
+//==============================================================================
+
+// Host to Host benchmarks - all three approaches
+BENCHMARK_CAPTURE(BM_ResourceManager_Copy, ResourceManager_Host_Host, std::string("HOST"), std::string("HOST"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_CAPTURE(BM_RuntimeDispatch_Copy, RuntimeDispatch_Host_Host, std::string("HOST"), std::string("HOST"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CompileTimeDispatch_Copy_Host_Host)
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+#if defined(UMPIRE_ENABLE_CUDA)
+// Host to CUDA Device benchmarks
+BENCHMARK_CAPTURE(BM_ResourceManager_Copy, ResourceManager_Host_Cuda, std::string("HOST"), std::string("DEVICE"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_CAPTURE(BM_RuntimeDispatch_Copy, RuntimeDispatch_Host_Cuda, std::string("HOST"), std::string("DEVICE"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CompileTimeDispatch_Copy_Host_Cuda)
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+// CUDA Device to Host benchmarks
+BENCHMARK_CAPTURE(BM_ResourceManager_Copy, ResourceManager_Cuda_Host, std::string("DEVICE"), std::string("HOST"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_CAPTURE(BM_RuntimeDispatch_Copy, RuntimeDispatch_Cuda_Host, std::string("DEVICE"), std::string("HOST"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CompileTimeDispatch_Copy_Cuda_Host)
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+// CUDA Device to CUDA Device benchmarks
+BENCHMARK_CAPTURE(BM_ResourceManager_Copy, ResourceManager_Cuda_Cuda, std::string("DEVICE"), std::string("DEVICE"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_CAPTURE(BM_RuntimeDispatch_Copy, RuntimeDispatch_Cuda_Cuda, std::string("DEVICE"), std::string("DEVICE"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CompileTimeDispatch_Copy_Cuda_Cuda)
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+#endif // UMPIRE_ENABLE_CUDA
+
+#if defined(UMPIRE_ENABLE_HIP)
+// Host to HIP Device benchmarks
+BENCHMARK_CAPTURE(BM_ResourceManager_Copy, ResourceManager_Host_Hip, std::string("HOST"), std::string("DEVICE"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_CAPTURE(BM_RuntimeDispatch_Copy, RuntimeDispatch_Host_Hip, std::string("HOST"), std::string("DEVICE"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CompileTimeDispatch_Copy_Host_Hip)
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+// HIP Device to Host benchmarks
+BENCHMARK_CAPTURE(BM_ResourceManager_Copy, ResourceManager_Hip_Host, std::string("DEVICE"), std::string("HOST"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_CAPTURE(BM_RuntimeDispatch_Copy, RuntimeDispatch_Hip_Host, std::string("DEVICE"), std::string("HOST"))
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CompileTimeDispatch_Copy_Hip_Host)
+    ->RangeMultiplier(MULTIPLIER)->Range(MIN_SIZE, MAX_SIZE)
+    ->Unit(benchmark::kMicrosecond);
+#endif // UMPIRE_ENABLE_HIP
+
+BENCHMARK_MAIN();
\ No newline at end of file
diff --git a/cmake/SetupUmpireOptions.cmake b/cmake/SetupUmpireOptions.cmake
index 725afbae1..1c7bc461e 100644
--- a/cmake/SetupUmpireOptions.cmake
+++ b/cmake/SetupUmpireOptions.cmake
@@ -25,6 +25,7 @@ option(UMPIRE_ENABLE_NUMA "Build Umpire with NUMA support" Off)
 option(UMPIRE_ENABLE_OPENMP_TARGET "Build Umpire with OPENMP target" Off)
 option(UMPIRE_ENABLE_LOGGING "Build Umpire with Logging enabled" On)
+option(UMPIRE_ENABLE_BOUNDS_CHECKS "Enable bounds checking in memory operations" On)
 option(UMPIRE_ENABLE_SLIC "Build Umpire with SLIC logging" Off)
 option(UMPIRE_ENABLE_BACKTRACE "Build Umpire with allocation backtrace enabled" Off)
 option(UMPIRE_ENABLE_BACKTRACE_SYMBOLS "Build Umpire with symbol support" Off)
@@ -39,6 +40,7 @@ option(UMPIRE_ENABLE_SANITIZER_TESTS "Enable address sanitizer tests" Off)
 option(UMPIRE_ENABLE_DEVICE_ALLOCATOR "Enable Device Allocator" Off)
 option(UMPIRE_ENABLE_SQLITE_EXPERIMENTAL "Build with sqlite event integration (experimental)" Off)
 option(UMPIRE_DISABLE_ALLOCATIONMAP_DEBUG "Disable verbose output from AllocationMap during debug builds" Off)
+option(UMPIRE_RM_USE_NEW_OPS "Enable new template-based memory operations" On)
 set(UMPIRE_FMT_TARGET fmt::fmt-header-only CACHE STRING "Name of fmt target to use")
 
 if (UMPIRE_ENABLE_INACCESSIBILITY_TESTS)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 2a03ddf2f..3c75fcff8 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -55,7 +55,9 @@ if (UMPIRE_ENABLE_HIP)
     NAME multi_device
     SOURCES multi_device.cpp
     DEPENDS_ON umpire blt::hip)
-
+  blt_add_target_compile_flags(
+    TO multi_device
+    FLAGS ${UMPIRE_DISABLE_DEPRECATED_WARNINGS_FLAG})
   list(APPEND umpire_examples multi_device)
 endif()
diff --git a/examples/cookbook/CMakeLists.txt b/examples/cookbook/CMakeLists.txt
index c4589bea5..9e90cb088 100644
--- a/examples/cookbook/CMakeLists.txt
+++ b/examples/cookbook/CMakeLists.txt
@@ -23,6 +23,9 @@ if (UMPIRE_ENABLE_NUMA)
     NAME recipe_move_between_numa
     SOURCES recipe_move_between_numa.cpp
     DEPENDS_ON ${cookbook_depends})
+  blt_add_target_compile_flags(
+    TO recipe_move_between_numa
+    FLAGS ${UMPIRE_DISABLE_DEPRECATED_WARNINGS_FLAG})
   list(APPEND umpire_cookbooks recipe_move_between_numa)
 endif ()
@@ -79,6 +82,9 @@ if (UMPIRE_ENABLE_CUDA)
     NAME recipe_move_to_managed
     SOURCES recipe_move_to_managed.cpp
     DEPENDS_ON ${cookbook_depends})
+  blt_add_target_compile_flags(
+    TO recipe_move_to_managed
+    FLAGS ${UMPIRE_DISABLE_DEPRECATED_WARNINGS_FLAG})
   list(APPEND umpire_cookbooks recipe_move_to_managed)
 
   blt_add_executable(
@@ -246,5 +252,4 @@ blt_add_executable(
 list(APPEND umpire_cookbooks recipe_named_allocation)
 
 install(TARGETS ${umpire_cookbooks} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
-
 umpire_add_code_checks(PREFIX cookbook)
diff --git a/examples/cookbook/recipe_device_ipc.cpp b/examples/cookbook/recipe_device_ipc.cpp
index e37ff8efc..d7ba997f3 100644
--- a/examples/cookbook/recipe_device_ipc.cpp
+++ b/examples/cookbook/recipe_device_ipc.cpp
@@ -12,6 +12,7 @@
 #include "umpire/Umpire.hpp"
 #include "umpire/strategy/DeviceIpcAllocator.hpp"
 #include "umpire/util/MemoryResourceTraits.hpp"
+#include "umpire/op.hpp"
 #if defined(UMPIRE_ENABLE_MPI)
 #include <mpi.h>
 #endif
@@ -41,7 +42,8 @@ int main(int argc, char** argv)
 
   // Allocate device memory - only rank 0 will physically allocate
   // All other ranks will import via IPC
-  const size_t size = 1024 * sizeof(float);
+  constexpr std::size_t num_elements = 1024;
+  const size_t size = num_elements * sizeof(float);
   float* data = static_cast<float*>(ipc_allocator.allocate(size));
 
   std::cout << "Rank " << rank << ": Got device memory at " << data << std::endl;
@@ -59,7 +61,7 @@
     }
 
     // Copy to device
-    rm.copy(data, host_data, size);
+    umpire::copy(host_data, data, num_elements);
 
     host_allocator.deallocate(host_data);
   }
@@ -76,7 +78,7 @@
   // All ranks can now access the data
   // Verify by copying a portion back to host
   float* value = static_cast<float*>(host_allocator.allocate(sizeof(float)));
-  rm.copy(value, data + 1, sizeof(float));
+  umpire::copy(data + 1, value, 1);
 
   std::cout << "Rank " << rank << ": second value is " << *value << std::endl;
diff --git a/examples/cookbook/recipe_move_between_numa.cpp b/examples/cookbook/recipe_move_between_numa.cpp
index 7282386fa..18729c8c9 100644
--- a/examples/cookbook/recipe_move_between_numa.cpp
+++ b/examples/cookbook/recipe_move_between_numa.cpp
@@ -12,6 +12,7 @@
 #include "umpire/util/Macros.hpp"
 #include "umpire/util/error.hpp"
 #include "umpire/util/numa.hpp"
+#include "umpire/op.hpp"
 
 #if defined(UMPIRE_ENABLE_CUDA)
 #include <cuda_runtime_api.h>
@@ -52,7 +53,7 @@ int main(int, char**)
   }
 
   // Touch it
-  rm.memset(dst_ptr, 0);
+  umpire::memset(dst_ptr, 0, alloc_size);
 
   // Verify NUMA node
   if (umpire::numa::get_location(dst_ptr) != host_nodes[1]) {
@@ -82,7 +83,7 @@
 
   // Touch it -- this currently uses the host memset operation (thus, copying
   // the memory back)
-  rm.memset(dst_ptr, 0);
+  umpire::memset(dst_ptr, 0, alloc_size);
 
   // Verify NUMA node
   if (umpire::numa::get_location(dst_ptr) != device_nodes[0]) {
diff --git a/examples/multi_device.cpp b/examples/multi_device.cpp
index 9be296bdb..f39868c7e 100644
--- a/examples/multi_device.cpp
+++ b/examples/multi_device.cpp
@@ -8,6 +8,7 @@
 
 #include "umpire/ResourceManager.hpp"
 #include "umpire/strategy/QuickPool.hpp"
+#include "umpire/op.hpp"
 
 constexpr int BLOCK_SIZE = 256;
 constexpr int NUM_THREADS = 4096;
@@ -67,7 +68,7 @@ int main(int, char**)
   }
 #endif
 
-  rm.copy(b, a);
+  umpire::copy(a, b, NUM_THREADS);
 
   b = static_cast<int*>(rm.move(b, rm.getAllocator("HOST")));
 
   UMPIRE_ASSERT(b[BLOCK_SIZE] == (BLOCK_SIZE * MULTIPLE) && "Error: incorrect value!");
diff --git a/examples/tutorial/CMakeLists.txt b/examples/tutorial/CMakeLists.txt
index 2a19df0f0..198c48336 100644
--- a/examples/tutorial/CMakeLists.txt
+++ b/examples/tutorial/CMakeLists.txt
@@ -88,4 +88,8 @@ endif ()
 
 install(TARGETS ${umpire_tutorials} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+blt_add_target_compile_flags(
+  TO tut_move
+  FLAGS ${UMPIRE_DISABLE_DEPRECATED_WARNINGS_FLAG})
+
 umpire_add_code_checks(PREFIX tutorial)
diff --git a/examples/tutorial/tut_copy.cpp b/examples/tutorial/tut_copy.cpp
index ac9e79745..e96d60b74 100644
--- a/examples/tutorial/tut_copy.cpp
+++ b/examples/tutorial/tut_copy.cpp
@@ -6,6 +6,7 @@
 //////////////////////////////////////////////////////////////////////////////
 #include "umpire/Allocator.hpp"
 #include "umpire/ResourceManager.hpp"
+#include "umpire/op.hpp"
 
 void copy_data(double* source_data, std::size_t size, const std::string& destination)
 {
@@ -15,7 +16,7 @@
   double* dest_data = static_cast<double*>(dest_allocator.allocate(size * sizeof(double)));
 
   // _sphinx_tag_tut_copy_start
-  rm.copy(dest_data, source_data);
+  umpire::copy(source_data, dest_data, size);
   // _sphinx_tag_tut_copy_end
 
   std::cout << "Copied source data (" << source_data << ") to destination " << destination << " (" << dest_data << ")"
diff --git a/examples/tutorial/tut_memset.cpp b/examples/tutorial/tut_memset.cpp
index be62c7d77..4aae072a6 100644
--- a/examples/tutorial/tut_memset.cpp
+++ b/examples/tutorial/tut_memset.cpp
@@ -6,6 +6,7 @@
 //////////////////////////////////////////////////////////////////////////////
 #include "umpire/Allocator.hpp"
 #include "umpire/ResourceManager.hpp"
+#include "umpire/op.hpp"
 
 int main(int, char**)
 {
@@ -37,7 +38,7 @@
             << std::endl;
 
   // _sphinx_tag_tut_memset_start
-  rm.memset(data, 0);
+  umpire::memset(data, 0, SIZE * sizeof(double));
   // _sphinx_tag_tut_memset_end
 
   std::cout << "Set data from " << destination << " (" << data << ") to 0." << std::endl;
diff --git a/examples/tutorial/tut_reallocate.cpp b/examples/tutorial/tut_reallocate.cpp
index ab5fe3aac..93d1e244a 100644
--- a/examples/tutorial/tut_reallocate.cpp
+++ b/examples/tutorial/tut_reallocate.cpp
@@ -6,6 +6,7 @@
 //////////////////////////////////////////////////////////////////////////////
 #include "umpire/Allocator.hpp"
 #include "umpire/ResourceManager.hpp"
+#include "umpire/op.hpp"
 
 int main(int, char**)
 {
@@ -40,7 +41,7 @@
   std::cout << "Reallocating data (" << data << ") to size " << REALLOCATED_SIZE << "...";
 
   // _sphinx_tag_tut_realloc_start
-  data = static_cast<double*>(rm.reallocate(data, REALLOCATED_SIZE));
+  data = umpire::reallocate(&data, REALLOCATED_SIZE);
   // _sphinx_tag_tut_realloc_end
 
   std::cout << "done. Reallocated data (" << data << ")" << std::endl;
diff --git a/include/umpire/op.hpp b/include/umpire/op.hpp
new file mode 100644
index 000000000..4924ebf1c
--- /dev/null
+++ b/include/umpire/op.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "umpire/config.hpp"
+#include "umpire/op/host.hpp"
+#include "umpire/op/operations.hpp"
+#if defined(UMPIRE_ENABLE_CUDA)
+#include "umpire/op/cuda.hpp"
+#endif
+#if defined(UMPIRE_ENABLE_HIP)
+#include "umpire/op/hip.hpp"
+#endif
+#if defined(UMPIRE_ENABLE_SYCL)
+#include "umpire/op/sycl.hpp"
+#endif
+#if defined(UMPIRE_ENABLE_OPENMP_TARGET)
+#include "umpire/op/openmp_target.hpp"
+#endif
+
+#include "umpire/op/dispatch.hpp"
diff --git a/include/umpire/op/cuda.hpp b/include/umpire/op/cuda.hpp
new file mode 100644
index 000000000..5dfc8cc87
--- /dev/null
+++ b/include/umpire/op/cuda.hpp
@@ -0,0 +1,478 @@
+#pragma once
+
+#include <cuda_runtime_api.h>
+
+#include "umpire/op/detail/utils.hpp"
+#include "umpire/op/operations.hpp"
+#include "umpire/resource/platform.hpp"
+#include "umpire/util/Platform.hpp"
+#include "umpire/util/error.hpp"
+
+namespace umpire {
+namespace op {
+
+// CUDA implementation helpers
+namespace detail {
+
+/**
+ * @brief Get the CUDA memory copy direction kind
+ *
+ * @tparam SRC Source platform
+ * @tparam DST Destination platform
+ */
+template <typename SRC, typename DST>
+struct copy_kind;
+
+// Device to host specialization
+template <>
+struct copy_kind<umpire::resource::cuda_platform, umpire::resource::host_platform> {
+  static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToHost;
+};
+
+// Host to device specialization
+template <>
+struct copy_kind<umpire::resource::host_platform, umpire::resource::cuda_platform> {
+  static constexpr cudaMemcpyKind value = cudaMemcpyHostToDevice;
+};
+
+// Device to device specialization
+template <>
+struct copy_kind<umpire::resource::cuda_platform, umpire::resource::cuda_platform> {
+  static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToDevice;
+};
+
+/**
+ * @brief Check if a CUDA device supports managed memory features
+ *
+ * @param device Device ID to check
+ * @return true if the device supports managed memory
+ * @return false if the device does not support managed memory
+ */
+inline bool supports_managed_memory(int device)
+{
+  cudaDeviceProp properties;
+  cudaError_t error = ::cudaGetDeviceProperties(&properties, device);
+
+  if (error != cudaSuccess) {
+    UMPIRE_ERROR(runtime_error, fmt::format("cudaGetDeviceProperties for device {} failed with error: {}", device,
+                                            cudaGetErrorString(error)));
+  }
+
+  return (properties.managedMemory == 1 && properties.concurrentManagedAccess == 1);
+}
+
+/**
+ * @brief Get CUDA stream from a resource
+ *
+ * @param resource The resource to get the stream from
+ * @return cudaStream_t The CUDA stream
+ */
+inline cudaStream_t get_stream(camp::resources::Resource& resource)
+{
+  auto cuda_resource = resource.try_get<camp::resources::Cuda>();
+  if (!cuda_resource) {
+    UMPIRE_ERROR(resource_error, fmt::format("Expected resources::Cuda, got resources::{}",
+                                             platform_to_string(resource.get_platform())));
+  }
+  return cuda_resource->get_stream();
+}
+
+/**
+ * @brief Apply memory advice to a CUDA managed memory allocation
+ *
+ * @tparam T Type of memory
+ * @param ptr Pointer to memory
+ * @param count Number of elements
+ * @param device Device ID for advice
+ * @param advice Memory advice to apply
+ */
+template <typename T>
+inline void advise(T* ptr, std::size_t count, int device, cudaMemoryAdvise advice)
+{
+  // Skip if device doesn't support managed memory
+  if (!supports_managed_memory(device))
+    return;
+
+  std::size_t size = detail::get_size<T>(count);
+  cudaError_t error = ::cudaMemAdvise(ptr, size, advice, device);
+
+  if (error != cudaSuccess) {
+    UMPIRE_ERROR(runtime_error,
fmt::format("cudaMemAdvise(ptr={}, size={}, advice={}, device={}) failed with error: {}", ptr, size, + static_cast(advice), device, cudaGetErrorString(error))); + } +} + +/** + * @brief Synchronous memory copy implementation + * + * @tparam T Type of memory + * @param src Source pointer + * @param dst Destination pointer + * @param count Number of elements + * @param kind Copy direction kind + */ +template +inline void copy(T* src, T* dst, std::size_t count, cudaMemcpyKind kind) +{ + std::size_t size = detail::get_size(count); + + cudaError_t error = ::cudaMemcpy(dst, src, size, kind); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, fmt::format("cudaMemcpy(dst={}, src={}, size={}, kind={}) failed with error: {}", reinterpret_cast(dst), + reinterpret_cast(src), size, static_cast(kind), cudaGetErrorString(error))); + } +} + +/** + * @brief Asynchronous memory copy implementation + * + * @tparam T Type of memory + * @param src Source pointer + * @param dst Destination pointer + * @param count Number of elements + * @param resource Resource for asynchronous operation + * @param kind Copy direction kind + * @return Event representing the asynchronous operation + */ +template +inline camp::resources::EventProxy copy_async(T* src, T* dst, std::size_t count, + camp::resources::Resource& resource, + cudaMemcpyKind kind) +{ + auto stream = get_stream(resource); + std::size_t size = detail::get_size(count); + + cudaError_t error = ::cudaMemcpyAsync(dst, src, size, kind, stream); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + fmt::format("cudaMemcpyAsync(dst={}, src={}, size={}, kind={}, stream={}) failed with error: {}", dst, + src, size, static_cast(kind), static_cast(stream), cudaGetErrorString(error))); + } + + return camp::resources::EventProxy{resource}; +} + +/** + * @brief Synchronous memory set implementation + * + * @tparam T Type of memory + * @param ptr Pointer to memory + * @param value Value to set + * @param count Number of elements + */ +template +inline void memset(T* ptr, int value, std::size_t count) +{ + std::size_t size = detail::get_size(count); + + cudaError_t error = ::cudaMemset(ptr, value, size); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, fmt::format("cudaMemset(ptr={}, value={}, size={}) failed with error: {}", reinterpret_cast(ptr), value, + size, cudaGetErrorString(error))); + } +} + +/** + * @brief Asynchronous memory set implementation + * + * @tparam T Type of memory + * @param ptr Pointer to memory + * @param value Value to set + * @param count Number of elements + * @param resource Resource for asynchronous operation + * @return Event representing the asynchronous operation + */ +template +inline camp::resources::EventProxy memset_async(T* ptr, int value, std::size_t count, + camp::resources::Resource& resource) +{ + auto stream = get_stream(resource); + std::size_t size = detail::get_size(count); + + cudaError_t error = ::cudaMemsetAsync(ptr, value, size, stream); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + fmt::format("cudaMemsetAsync(ptr={}, value={}, size={}, stream={}) failed with error: {}", ptr, value, + size, static_cast(stream), cudaGetErrorString(error))); + } + + return camp::resources::EventProxy{resource}; +} + +/** + * @brief Synchronous memory prefetch implementation + * + * @tparam T Type of memory + * @param ptr Pointer to memory + * @param device Device to prefetch to + * @param count Number of elements + */ +template +inline void prefetch(T* ptr, int device, std::size_t count) +{ + // Use 
+  int current_device;
+  cudaGetDevice(&current_device);
+  int gpu = (device != cudaCpuDeviceId) ? device : current_device;
+
+  if (supports_managed_memory(gpu)) {
+    std::size_t size = detail::get_size<T>(count);
+    cudaError_t error = ::cudaMemPrefetchAsync(ptr, size, device, nullptr);
+
+    if (error != cudaSuccess) {
+      UMPIRE_ERROR(runtime_error, fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}",
+                                              reinterpret_cast<void*>(ptr), size, device, cudaGetErrorString(error)));
+    }
+  }
+}
+
+/**
+ * @brief Asynchronous memory prefetch implementation
+ *
+ * @tparam T Type of memory
+ * @param ptr Pointer to memory
+ * @param device Device to prefetch to
+ * @param count Number of elements
+ * @param resource Resource for asynchronous operation
+ * @return Event representing the asynchronous operation
+ */
+template <typename T>
+inline camp::resources::EventProxy<camp::resources::Resource> prefetch_async(T* ptr, int device, std::size_t count,
+                                                                             camp::resources::Resource& resource)
+{
+  auto stream = get_stream(resource);
+
+  // Use current device for properties if device is CPU
+  int current_device;
+  cudaGetDevice(&current_device);
+  int gpu = (device != cudaCpuDeviceId) ? device : current_device;
+
+  if (supports_managed_memory(gpu)) {
+    std::size_t size = detail::get_size<T>(count);
+    cudaError_t error = ::cudaMemPrefetchAsync(ptr, size, device, stream);
+
+    if (error != cudaSuccess) {
+      UMPIRE_ERROR(runtime_error,
+                   fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", ptr,
+                               size, device, static_cast<void*>(stream), cudaGetErrorString(error)));
+    }
+  }
+
+  return camp::resources::EventProxy<camp::resources::Resource>{resource};
+}
+
+} // namespace detail
+
+//------------------------------------------------------------------------------
+// CUDA Operation Template Specializations
+//------------------------------------------------------------------------------
+
+// CUDA-to-CUDA copy operation
+template <>
+struct copy<umpire::resource::cuda_platform, umpire::resource::cuda_platform> {
+  /**
+   * @brief CUDA to CUDA synchronous copy
+   *
+   * @tparam T Type of data being copied
+   * @param src Source pointer
+   * @param dst Destination pointer
+   * @param len Number of elements to copy
+   */
+  template <typename T>
+  static void exec(T* src, T* dst, std::size_t len) noexcept
+  {
+    detail::copy(src, dst, len,
+                 detail::copy_kind<umpire::resource::cuda_platform, umpire::resource::cuda_platform>::value);
+  }
+
+  /**
+   * @brief CUDA to CUDA asynchronous copy
+   *
+   * @tparam T Type of data being copied
+   * @param src Source pointer
+   * @param dst Destination pointer
+   * @param len Number of elements to copy
+   * @param resource Resource for asynchronous operation
+   * @return Event representing the asynchronous operation
+   */
+  template <typename T>
+  static camp::resources::EventProxy<camp::resources::Resource> exec(T* src, T* dst, std::size_t len,
+                                                                     camp::resources::Resource& resource)
+  {
+    return detail::copy_async(src, dst, len, resource,
+                              detail::copy_kind<umpire::resource::cuda_platform, umpire::resource::cuda_platform>::value);
+  }
+};
+
+// CUDA-to-Host copy operation
+template <>
+struct copy<umpire::resource::cuda_platform, umpire::resource::host_platform> {
+  /**
+   * @brief CUDA to Host synchronous copy
+   *
+   * @tparam T Type of data being copied
+   * @param src Source pointer
+   * @param dst Destination pointer
+   * @param len Number of elements to copy
+   */
+  template <typename T>
+  static void exec(T* src, T* dst, std::size_t len) noexcept
+  {
+    detail::copy(src, dst, len,
+                 detail::copy_kind<umpire::resource::cuda_platform, umpire::resource::host_platform>::value);
+  }
+
+  /**
+   * @brief CUDA to Host asynchronous copy
+   *
+   * @tparam T Type of data being copied
+   * @param src Source pointer
+   * @param dst Destination pointer
+   * @param len Number of elements to copy
+   * @param resource Resource for asynchronous operation
+   * @return Event representing the asynchronous operation
+   */
+  template <typename T>
+  static camp::resources::EventProxy<camp::resources::Resource> exec(T* src, T* dst, std::size_t len,
+                                                                     camp::resources::Resource& resource)
+  {
+    return detail::copy_async(src, dst, len, resource,
+                              detail::copy_kind<umpire::resource::cuda_platform, umpire::resource::host_platform>::value);
+  }
+};
+
+// Host-to-CUDA copy operation
+template <>
+struct copy<umpire::resource::host_platform, umpire::resource::cuda_platform> {
+  /**
+   * @brief Host to CUDA synchronous copy
+   *
+   * @tparam T Type of data being copied
+   * @param src Source pointer
+   * @param dst Destination pointer
+   * @param len Number of elements to copy
+   */
+  template <typename T>
+  static void exec(T* src, T* dst, std::size_t len) noexcept
+  {
+    detail::copy(src, dst, len,
+                 detail::copy_kind<umpire::resource::host_platform, umpire::resource::cuda_platform>::value);
+  }
+
+  /**
+   * @brief Host to CUDA asynchronous copy
+   *
+   * @tparam T Type of data being copied
+   * @param src Source pointer
+   * @param dst Destination pointer
+   * @param len Number of elements to copy
+   * @param resource Resource for asynchronous operation
+   * @return Event representing the asynchronous operation
+   */
+  template <typename T>
+  static camp::resources::EventProxy<camp::resources::Resource> exec(T* src, T* dst, std::size_t len,
+                                                                     camp::resources::Resource& resource)
+  {
+    return detail::copy_async(src, dst, len, resource,
+                              detail::copy_kind<umpire::resource::host_platform, umpire::resource::cuda_platform>::value);
+  }
+};
+
+// CUDA memset operation
+template <>
+struct memset<umpire::resource::cuda_platform> {
+  /**
+   * @brief CUDA synchronous memset
+   *
+   * @tparam T Type of memory being set
+   * @param ptr Pointer to memory
+   * @param val Value to set
+   * @param len Number of elements to set
+   */
+  template <typename T>
+  static void exec(T* ptr, int val, std::size_t len) noexcept
+  {
+    detail::memset(ptr, val, len);
+  }
+
+  /**
+   * @brief CUDA asynchronous memset
+   *
+   * @tparam T Type of memory being set
+   * @param ptr Pointer to memory
+   * @param val Value to set
+   * @param len Number of elements to set
+   * @param resource Resource for asynchronous operation
+   * @return Event representing the asynchronous operation
+   */
+  template <typename T>
+  static camp::resources::EventProxy<camp::resources::Resource> exec(T* ptr, int val, std::size_t len,
+                                                                     camp::resources::Resource& resource)
+  {
+    return detail::memset_async(ptr, val, len, resource);
+  }
+};
+
+// CUDA prefetch operation
+template <>
+struct prefetch<umpire::resource::cuda_platform> {
+  /**
+   * @brief CUDA synchronous prefetch
+   *
+   * @tparam T Type of memory being prefetched
+   * @param ptr Pointer to memory
+   * @param device Device to prefetch to
+   * @param len Number of elements to prefetch
+   */
+  template <typename T>
+  static void exec(T* ptr, int device, std::size_t len) noexcept
+  {
+    detail::prefetch(ptr, device, len);
+  }
+
+  /**
+   * @brief CUDA asynchronous prefetch
+   *
+   * @tparam T Type of memory being prefetched
+   * @param ptr Pointer to memory
+   * @param device Device to prefetch to
+   * @param len Number of elements to prefetch
+   * @param resource Resource for asynchronous operation
+   * @return Event representing the asynchronous operation
+   */
+  template <typename T>
+  static camp::resources::EventProxy<camp::resources::Resource> exec(T* ptr, int device, std::size_t len,
+                                                                     camp::resources::Resource& resource)
+  {
+    return detail::prefetch_async(ptr, device, len, resource);
+  }
+};
+
+// Memory advice operations define macro to reduce duplication
+#define DEFINE_CUDA_ADVICE_OP(op_name, advice_flag)                      \
+  template <>                                                            \
+  struct op_name<umpire::resource::cuda_platform> {                     \
+    /**                                                                  \
+     * @brief Apply memory advice operation                              \
+     *                                                                   \
+     * @tparam T Type of memory                                          \
+     * @param ptr Pointer to memory                                      \
+     * @param device Device to apply advice for                          \
+     * @param len Number of elements                                     \
+     */                                                                  \
+    template <typename T>                                                \
+    static inline void exec(T* ptr, int device, std::size_t len) noexcept \
+    {                                                                    \
+      detail::advise(ptr, len, device, advice_flag);                     \
+    }                                                                    \
+  };
+
+DEFINE_CUDA_ADVICE_OP(set_accessed_by, cudaMemAdviseSetAccessedBy)
+DEFINE_CUDA_ADVICE_OP(set_preferred_location, cudaMemAdviseSetPreferredLocation)
+DEFINE_CUDA_ADVICE_OP(set_read_mostly, cudaMemAdviseSetReadMostly)
+DEFINE_CUDA_ADVICE_OP(unset_accessed_by, cudaMemAdviseUnsetAccessedBy)
+DEFINE_CUDA_ADVICE_OP(unset_preferred_location, cudaMemAdviseUnsetPreferredLocation)
+DEFINE_CUDA_ADVICE_OP(unset_read_mostly, cudaMemAdviseUnsetReadMostly)
+
+#undef DEFINE_CUDA_ADVICE_OP
+
+} // namespace op
+} // namespace umpire
diff --git a/include/umpire/op/detail/traits.hpp b/include/umpire/op/detail/traits.hpp
new file mode 100644
index 000000000..6d8c24328
--- /dev/null
+++ b/include/umpire/op/detail/traits.hpp
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <type_traits>
+
+#include "umpire/config.hpp"
+
+namespace umpire {
+namespace op {
+namespace detail {
+
+template <typename Platform>
+struct supports_memory_advice : std::false_type {};
+
+#if defined(UMPIRE_ENABLE_CUDA)
+template <>
+struct supports_memory_advice<umpire::resource::cuda_platform> : std::true_type {};
+#endif
+
+#if defined(UMPIRE_ENABLE_HIP)
+template <>
+struct supports_memory_advice<umpire::resource::hip_platform> : std::true_type {};
+#endif
+
+#if defined(UMPIRE_ENABLE_SYCL)
+template <>
+struct supports_memory_advice<umpire::resource::sycl_platform> : std::true_type {};
+#endif
+
+#if defined(UMPIRE_ENABLE_OPENMP_TARGET)
+template <>
+struct supports_memory_advice<umpire::resource::omp_target_platform> : std::true_type {};
+#endif
+
+} // namespace detail
+} // namespace op
+} // namespace umpire
diff --git a/include/umpire/op/detail/utils.hpp b/include/umpire/op/detail/utils.hpp
new file mode 100644
index 000000000..81f34ed6a
--- /dev/null
+++ b/include/umpire/op/detail/utils.hpp
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+#include "camp/resource.hpp"
+
+namespace umpire {
+namespace op {
+namespace detail {
+
+/**
+ * @brief Calculate size in bytes based on element count and type
+ *
+ * @tparam T The pointer type (void* or typed pointer)
+ * @param count Number of elements or bytes (if T is void)
+ * @return std::size_t Size in bytes
+ */
+template <typename T>
+inline std::size_t get_size(std::size_t count) noexcept
+{
+  if constexpr (std::is_same_v<T, void>)
+    return count;
+  else
+    return count * sizeof(T);
+}
+
+/**
+ * @brief Create a default event for platforms without native async support
+ *
+ * @param resource The resource to create the event for
+ * @return camp::resources::EventProxy A completed event
+ */
+inline camp::resources::EventProxy<camp::resources::Resource>
+make_completed_event(camp::resources::Resource& resource) noexcept
+{
+  return camp::resources::EventProxy<camp::resources::Resource>{resource};
+}
+
+/**
+ * @brief Get minimum of two values (used for copy size calculations)
+ *
+ * @tparam T The type of values being compared
+ * @param a First value
+ * @param b Second value
+ * @return constexpr T The smaller of the two values
+ */
+template <typename T>
+constexpr T min(const T& a, const T& b) noexcept
+{
+  return (a < b) ? a : b;
+}
+
+} // namespace detail
+} // namespace op
+} // namespace umpire
diff --git a/include/umpire/op/dispatch.hpp b/include/umpire/op/dispatch.hpp
new file mode 100644
index 000000000..771a0a2a4
--- /dev/null
+++ b/include/umpire/op/dispatch.hpp
@@ -0,0 +1,783 @@
+#pragma once
+
+#include <utility>
+
+#include "umpire/ResourceManager.hpp"
+#include "umpire/config.hpp"
+#include "umpire/op/detail/traits.hpp"
+#include "umpire/resource/platform.hpp"
+
+namespace umpire {
+namespace op {
+namespace detail {
+
+/**
+ * @brief Dispatch an operation to the appropriate platform implementation
+ *
+ * @tparam Op The operation template to dispatch
+ * @tparam Args Argument types for the operation
+ * @param platform The platform to dispatch to
+ * @param args Arguments for the operation
+ * @return Result of the operation
+ */
+template