From c67a61a06f77dacd53d81ee27b1e604c4069dcbc Mon Sep 17 00:00:00 2001 From: Savanna Spayd Date: Sun, 20 Apr 2025 22:39:16 -0700 Subject: [PATCH 1/2] Added exclusive prefix sum to examples --- examples/CMakeLists.txt | 4 ++ examples/prefix_sum.cpp | 156 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 examples/prefix_sum.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 617c6eec3d..b8a2fe1683 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -67,6 +67,10 @@ raja_add_executable( NAME pi-reduce_vs_atomic SOURCES pi-reduce_vs_atomic.cpp) +raja_add_executable( + NAME prefix_sum + SOURCES prefix_sum.cpp) + raja_add_executable( NAME raja-launch SOURCES raja-launch.cpp) diff --git a/examples/prefix_sum.cpp b/examples/prefix_sum.cpp new file mode 100644 index 0000000000..939fd040b4 --- /dev/null +++ b/examples/prefix_sum.cpp @@ -0,0 +1,156 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Prefix Sum Example + * + * Computes an exclusive prefix sum (scan) on an array of integers using + * multiple execution policies. + * + * RAJA features shown: + * - `exclusive_scan` operation with different execution backends + * - Use of `make_span` to create RAJA-compatible views + * - Sequential, OpenMP, and CUDA execution variants + * - CUDA device memory allocation and transfer + * + * If CUDA is enabled, device memory is allocated manually with `cudaMalloc` + * and the results are copied back to host memory for validation. + */ + + +/* + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +const int CUDA_BLOCK_SIZE = 256; +#endif + +/* + N - the length of the series to perform the prefix sum +*/ +const int N = 100; + +bool check_equal(const std::vector& a, const std::vector& b) +{ + if (a.size() != b.size()) return false; + for (size_t i = 0; i < a.size(); ++i) { + if (a[i] != b[i]) return false; + } + return true; +} + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) +{ + std::cout << "\n\nRAJA prefix sum (exclusive_scan) example using a series of " << N << " length...\n"; + + std::vector input(N, 1); + std::vector reference_output(N, 0); + std::vector test_output(N, 0); + + //---------------------------------------------------------------------------- + std::cout << "\n Running C-style prefix sum...\n"; + + auto start = std::chrono::high_resolution_clock::now(); + + reference_output[0] = 0; + for (int i = 1; i < N; ++i) { + reference_output[i] = reference_output[i - 1] + input[i - 1]; + } + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + std::cout << " Reference complete. Time: " << duration.count() << " us\n"; + + //---------------------------------------------------------------------------- + std::cout << "\n Running RAJA::exclusive_scan with seq_exec...\n"; + std::fill(test_output.begin(), test_output.end(), 0); + + start = std::chrono::high_resolution_clock::now(); + + RAJA::exclusive_scan( + RAJA::make_span(input), + RAJA::make_span(test_output), + RAJA::operators::plus()); + + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end - start); + + std::cout << " Result: " + << (check_equal(reference_output, test_output) ? "PASS" : "FAIL") + << " | Time: " << duration.count() << " us\n"; + + + //---------------------------------------------------------------------------- +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA::exclusive_scan with omp_parallel_for_exec...\n"; + std::fill(test_output.begin(), test_output.end(), 0); + + start = std::chrono::high_resolution_clock::now(); + + RAJA::exclusive_scan( + RAJA::make_span(input), + RAJA::make_span(test_output), + RAJA::operators::plus()); + + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end - start); + + std::cout << " Result: " + << (check_equal(reference_output, test_output) ? "PASS" : "FAIL") + << " | Time: " << duration.count() << " us\n"; +#endif + + + //---------------------------------------------------------------------------- +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA::exclusive_scan with cuda_exec...\n"; + + int* d_input; + int* d_output; + + cudaMalloc((void**)&d_input, N * sizeof(int)); + cudaMalloc((void**)&d_output, N * sizeof(int)); + + cudaMemcpy(d_input, input.data(), N * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(d_output, 0, N * sizeof(int)); + + cudaDeviceSynchronize(); + start = std::chrono::high_resolution_clock::now(); + + RAJA::exclusive_scan>( + RAJA::make_span(d_input, N), + RAJA::make_span(d_output, N), + RAJA::operators::plus{}); + + cudaDeviceSynchronize(); // Make sure the scan finishes before timing ends + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end - start); + + cudaMemcpy(test_output.data(), d_output, N * sizeof(int), cudaMemcpyDeviceToHost); + + std::cout << " Result: " + << (check_equal(reference_output, test_output) ? "PASS" : "FAIL") + << " | Time: " << duration.count() << " us\n"; + + cudaFree(d_input); + cudaFree(d_output); +#endif + + + std::cout << "\n DONE!...\n"; + return 0; +} From 6a8876ec4c284164fcdadeef5d66420b0a0bb7e7 Mon Sep 17 00:00:00 2001 From: Savanna Spayd Date: Wed, 30 Apr 2025 00:32:15 -0700 Subject: [PATCH 2/2] Used RAJA timer instead of chrono --- examples/prefix_sum.cpp | 54 ++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/examples/prefix_sum.cpp b/examples/prefix_sum.cpp index 939fd040b4..337d521f6d 100644 --- a/examples/prefix_sum.cpp +++ b/examples/prefix_sum.cpp @@ -6,13 +6,10 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// #include -#include -#include -#include -#include -#include #include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + #include "memoryManager.hpp" /* @@ -42,7 +39,7 @@ const int CUDA_BLOCK_SIZE = 256; /* N - the length of the series to perform the prefix sum */ -const int N = 100; +const int N = 100000000; bool check_equal(const std::vector& a, const std::vector& b) { @@ -59,39 +56,45 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) std::vector input(N, 1); std::vector reference_output(N, 0); - std::vector test_output(N, 0); + std::vector test_output(N, 0); + + auto timer = RAJA::Timer(); + double elapsed_us; //---------------------------------------------------------------------------- std::cout << "\n Running C-style prefix sum...\n"; - auto start = std::chrono::high_resolution_clock::now(); + timer.start(); reference_output[0] = 0; for (int i = 1; i < N; ++i) { reference_output[i] = reference_output[i - 1] + input[i - 1]; } - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - std::cout << " Reference complete. Time: " << duration.count() << " us\n"; + timer.stop(); + elapsed_us = timer.elapsed(); + + std::cout << " Reference complete. Time: " << elapsed_us * 1e6 << " us\n"; + timer.reset(); //---------------------------------------------------------------------------- std::cout << "\n Running RAJA::exclusive_scan with seq_exec...\n"; std::fill(test_output.begin(), test_output.end(), 0); - start = std::chrono::high_resolution_clock::now(); + timer.start(); RAJA::exclusive_scan( RAJA::make_span(input), RAJA::make_span(test_output), RAJA::operators::plus()); - end = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end - start); + timer.stop(); + elapsed_us = timer.elapsed(); std::cout << " Result: " << (check_equal(reference_output, test_output) ? "PASS" : "FAIL") - << " | Time: " << duration.count() << " us\n"; + << " | Time: " << elapsed_us * 1e6 << " us\n"; + timer.reset(); //---------------------------------------------------------------------------- @@ -99,19 +102,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) std::cout << "\n Running RAJA::exclusive_scan with omp_parallel_for_exec...\n"; std::fill(test_output.begin(), test_output.end(), 0); - start = std::chrono::high_resolution_clock::now(); + timer.start(); RAJA::exclusive_scan( RAJA::make_span(input), RAJA::make_span(test_output), RAJA::operators::plus()); - end = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end - start); + timer.stop(); + elapsed_us = timer.elapsed(); std::cout << " Result: " << (check_equal(reference_output, test_output) ? "PASS" : "FAIL") - << " | Time: " << duration.count() << " us\n"; + << " | Time: " << elapsed_us * 1e6 << " us\n"; + timer.reset(); #endif @@ -129,7 +133,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) cudaMemset(d_output, 0, N * sizeof(int)); cudaDeviceSynchronize(); - start = std::chrono::high_resolution_clock::now(); + timer.start(); RAJA::exclusive_scan>( RAJA::make_span(d_input, N), @@ -137,15 +141,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) RAJA::operators::plus{}); cudaDeviceSynchronize(); // Make sure the scan finishes before timing ends - end = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end - start); + timer.stop(); + elapsed_us = timer.elapsed(); cudaMemcpy(test_output.data(), d_output, N * sizeof(int), cudaMemcpyDeviceToHost); std::cout << " Result: " << (check_equal(reference_output, test_output) ? "PASS" : "FAIL") - << " | Time: " << duration.count() << " us\n"; - + << " | Time: " << elapsed_us * 1e6 << " us\n"; + timer.reset(); cudaFree(d_input); cudaFree(d_output); #endif @@ -153,4 +157,4 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) std::cout << "\n DONE!...\n"; return 0; -} +} \ No newline at end of file