From c67a61a06f77dacd53d81ee27b1e604c4069dcbc Mon Sep 17 00:00:00 2001
From: Savanna Spayd <savannaspayd@gmail.com>
Date: Sun, 20 Apr 2025 22:39:16 -0700
Subject: [PATCH 1/2] Added exclusive prefix sum to examples

---
 examples/CMakeLists.txt |   4 ++
 examples/prefix_sum.cpp | 156 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 examples/prefix_sum.cpp
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 617c6eec3d..b8a2fe1683 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -67,6 +67,10 @@ raja_add_executable(
   NAME pi-reduce_vs_atomic
   SOURCES pi-reduce_vs_atomic.cpp)
 
+raja_add_executable(
+  NAME prefix_sum
+  SOURCES prefix_sum.cpp)
+  
 raja_add_executable(
   NAME raja-launch
   SOURCES raja-launch.cpp)
diff --git a/examples/prefix_sum.cpp b/examples/prefix_sum.cpp
new file mode 100644
index 0000000000..939fd040b4
--- /dev/null
+++ b/examples/prefix_sum.cpp
@@ -0,0 +1,156 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <iostream>
+#include <iomanip>  
+#include <vector>
+#include <cassert>
+#include <chrono>
+
+#include "RAJA/RAJA.hpp"
+#include "memoryManager.hpp"
+
+/*
+ *  Prefix Sum Example
+ *
+ *  Computes an exclusive prefix sum (scan) on an array of integers using
+ *  multiple execution policies.
+ *
+ *  RAJA features shown:
+ *    - `exclusive_scan` operation with different execution backends
+ *    - Use of `make_span` to create RAJA-compatible views
+ *    - Sequential, OpenMP, and CUDA execution variants
+ *    - CUDA device memory allocation and transfer
+ *
+ *  If CUDA is enabled, device memory is allocated manually with `cudaMalloc`
+ *  and the results are copied back to host memory for validation.
+ */
+
+
+/*
+  CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block
+*/
+#if defined(RAJA_ENABLE_CUDA)
+const int CUDA_BLOCK_SIZE = 256;
+#endif
+
+/*
+  N - the length of the series to perform the prefix sum
+*/
+const int N = 100;
+
+bool check_equal(const std::vector<int>& a, const std::vector<int>& b)
+{
+  if (a.size() != b.size()) return false;
+  for (size_t i = 0; i < a.size(); ++i) {
+    if (a[i] != b[i]) return false;
+  }
+  return true;
+}
+
+int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv))
+{
+  std::cout << "\n\nRAJA prefix sum (exclusive_scan) example using a series of " << N << " length...\n";
+
+  std::vector<int> input(N, 1);
+  std::vector<int> reference_output(N, 0);
+  std::vector<int> test_output(N, 0);
+
+  //----------------------------------------------------------------------------
+  std::cout << "\n Running C-style prefix sum...\n";
+
+  auto start = std::chrono::high_resolution_clock::now();
+
+  reference_output[0] = 0;
+  for (int i = 1; i < N; ++i) {
+    reference_output[i] = reference_output[i - 1] + input[i - 1];
+  }
+
+  auto end = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  std::cout << "   Reference complete. Time: " << duration.count() << " us\n";
+
+  //----------------------------------------------------------------------------
+  std::cout << "\n Running RAJA::exclusive_scan with seq_exec...\n";
+  std::fill(test_output.begin(), test_output.end(), 0);
+
+  start = std::chrono::high_resolution_clock::now();
+
+  RAJA::exclusive_scan<RAJA::seq_exec>(
+    RAJA::make_span(input),
+    RAJA::make_span(test_output),
+    RAJA::operators::plus<int>());
+
+  end = std::chrono::high_resolution_clock::now();
+  duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+  std::cout << "   Result: " 
+            << (check_equal(reference_output, test_output) ? "PASS" : "FAIL")
+            << " | Time: " << duration.count() << " us\n";
+
+
+  //----------------------------------------------------------------------------
+#if defined(RAJA_ENABLE_OPENMP)
+  std::cout << "\n Running RAJA::exclusive_scan with omp_parallel_for_exec...\n";
+  std::fill(test_output.begin(), test_output.end(), 0);
+  
+  start = std::chrono::high_resolution_clock::now();
+  
+  RAJA::exclusive_scan<RAJA::omp_parallel_for_exec>(
+    RAJA::make_span(input),
+    RAJA::make_span(test_output),
+    RAJA::operators::plus<int>());
+  
+  end = std::chrono::high_resolution_clock::now();
+  duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  
+  std::cout << "   Result: " 
+            << (check_equal(reference_output, test_output) ? "PASS" : "FAIL")
+            << " | Time: " << duration.count() << " us\n";
+#endif
+  
+
+  //----------------------------------------------------------------------------
+#if defined(RAJA_ENABLE_CUDA)
+  std::cout << "\n Running RAJA::exclusive_scan with cuda_exec...\n";
+  
+  int* d_input;
+  int* d_output;
+  
+  cudaMalloc((void**)&d_input, N * sizeof(int));
+  cudaMalloc((void**)&d_output, N * sizeof(int));
+  
+  cudaMemcpy(d_input, input.data(), N * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemset(d_output, 0, N * sizeof(int));
+  
+  cudaDeviceSynchronize();
+  start = std::chrono::high_resolution_clock::now();
+  
+  RAJA::exclusive_scan<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
+    RAJA::make_span(d_input, N),
+    RAJA::make_span(d_output, N),
+    RAJA::operators::plus<int>{});
+  
+  cudaDeviceSynchronize();  // Make sure the scan finishes before timing ends
+  end = std::chrono::high_resolution_clock::now();
+  duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  
+  cudaMemcpy(test_output.data(), d_output, N * sizeof(int), cudaMemcpyDeviceToHost);
+  
+  std::cout << "   Result: " 
+            << (check_equal(reference_output, test_output) ? "PASS" : "FAIL")
+            << " | Time: " << duration.count() << " us\n";
+  
+  cudaFree(d_input);
+  cudaFree(d_output);
+#endif
+  
+
+  std::cout << "\n DONE!...\n";
+  return 0;
+}

From 6a8876ec4c284164fcdadeef5d66420b0a0bb7e7 Mon Sep 17 00:00:00 2001
From: Savanna Spayd <savannaspayd@gmail.com>
Date: Wed, 30 Apr 2025 00:32:15 -0700
Subject: [PATCH 2/2] Used RAJA timer instead of chrono

---
 examples/prefix_sum.cpp | 54 ++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/examples/prefix_sum.cpp b/examples/prefix_sum.cpp
index 939fd040b4..337d521f6d 100644
--- a/examples/prefix_sum.cpp
+++ b/examples/prefix_sum.cpp
@@ -6,13 +6,10 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 #include <cstdlib>
-#include <iostream>
-#include <iomanip>  
-#include <vector>
-#include <cassert>
-#include <chrono>
 
 #include "RAJA/RAJA.hpp"
+#include "RAJA/util/Timer.hpp"
+
 #include "memoryManager.hpp"
 
 /*
@@ -42,7 +39,7 @@ const int CUDA_BLOCK_SIZE = 256;
 /*
   N - the length of the series to perform the prefix sum
 */
-const int N = 100;
+const int N = 100000000;
 
 bool check_equal(const std::vector<int>& a, const std::vector<int>& b)
 {
@@ -59,39 +56,45 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv))
 
   std::vector<int> input(N, 1);
   std::vector<int> reference_output(N, 0);
-  std::vector<int> test_output(N, 0);
+  std::vector<int> test_output(N, 0);  
+
+  auto timer = RAJA::Timer();
+  double elapsed_us;
 
   //----------------------------------------------------------------------------
   std::cout << "\n Running C-style prefix sum...\n";
 
-  auto start = std::chrono::high_resolution_clock::now();
+  timer.start();
 
   reference_output[0] = 0;
   for (int i = 1; i < N; ++i) {
     reference_output[i] = reference_output[i - 1] + input[i - 1];
   }
 
-  auto end = std::chrono::high_resolution_clock::now();
-  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-  std::cout << "   Reference complete. Time: " << duration.count() << " us\n";
+  timer.stop();
+  elapsed_us = timer.elapsed();
+  
+  std::cout << "   Reference complete. Time: " << elapsed_us * 1e6 << " us\n";
+  timer.reset();
 
   //----------------------------------------------------------------------------
   std::cout << "\n Running RAJA::exclusive_scan with seq_exec...\n";
   std::fill(test_output.begin(), test_output.end(), 0);
 
-  start = std::chrono::high_resolution_clock::now();
+  timer.start();
 
   RAJA::exclusive_scan<RAJA::seq_exec>(
     RAJA::make_span(input),
     RAJA::make_span(test_output),
     RAJA::operators::plus<int>());
 
-  end = std::chrono::high_resolution_clock::now();
-  duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  timer.stop();
+  elapsed_us = timer.elapsed();
 
   std::cout << "   Result: " 
             << (check_equal(reference_output, test_output) ? "PASS" : "FAIL")
-            << " | Time: " << duration.count() << " us\n";
+            << " | Time: " << elapsed_us * 1e6 << " us\n";
+  timer.reset();
 
 
   //----------------------------------------------------------------------------
@@ -99,19 +102,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv))
   std::cout << "\n Running RAJA::exclusive_scan with omp_parallel_for_exec...\n";
   std::fill(test_output.begin(), test_output.end(), 0);
   
-  start = std::chrono::high_resolution_clock::now();
+  timer.start();
   
   RAJA::exclusive_scan<RAJA::omp_parallel_for_exec>(
     RAJA::make_span(input),
     RAJA::make_span(test_output),
     RAJA::operators::plus<int>());
   
-  end = std::chrono::high_resolution_clock::now();
-  duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  timer.stop();
+  elapsed_us = timer.elapsed();
   
   std::cout << "   Result: " 
             << (check_equal(reference_output, test_output) ? "PASS" : "FAIL")
-            << " | Time: " << duration.count() << " us\n";
+            << " | Time: " << elapsed_us * 1e6 << " us\n";
+  timer.reset();
 #endif
   
 
@@ -129,7 +133,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv))
   cudaMemset(d_output, 0, N * sizeof(int));
   
   cudaDeviceSynchronize();
-  start = std::chrono::high_resolution_clock::now();
+  timer.start();
   
   RAJA::exclusive_scan<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
     RAJA::make_span(d_input, N),
@@ -137,15 +141,15 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv))
     RAJA::operators::plus<int>{});
   
   cudaDeviceSynchronize();  // Make sure the scan finishes before timing ends
-  end = std::chrono::high_resolution_clock::now();
-  duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  timer.stop();
+  elapsed_us = timer.elapsed();
   
   cudaMemcpy(test_output.data(), d_output, N * sizeof(int), cudaMemcpyDeviceToHost);
   
   std::cout << "   Result: " 
             << (check_equal(reference_output, test_output) ? "PASS" : "FAIL")
-            << " | Time: " << duration.count() << " us\n";
-  
+            << " | Time: " << elapsed_us * 1e6 << " us\n";
+  timer.reset();
   cudaFree(d_input);
   cudaFree(d_output);
 #endif
@@ -153,4 +157,4 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv))
 
   std::cout << "\n DONE!...\n";
   return 0;
-}
+}
\ No newline at end of file