25 changes: 14 additions & 11 deletions Intro_Tutorial/lessons/07_raja_algs/07_raja_atomic.cpp
@@ -40,43 +40,46 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement pi approximation to run on CUDA device
{
constexpr std::size_t CUDA_BLOCK_SIZE{256};

// TODO: Define CUDA execution policy and atomic policy
using EXEC_POL = ???<CUDA_BLOCK_SIZE>;
// TODO: Define CUDA execution policy, using the CUDA block size defined
// above, and define the atomic policy
using EXEC_POL = ???;
using ATOMIC_POL = ???;

pi_h[0] = 0.0;

auto device_allocator = rm.getAllocator("DEVICE");

// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above and use the Umpire memset operation to initialize the data
double* pi_d{nullptr};

rm.memset(???, ???);
// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above
pi_d = ???;

// TODO: Use the Umpire memset operation to initialize the data
rm.memset( ??? , ??? );

// TODO: Write a RAJA CUDA kernel to approximate pi
RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=] __device__ (int i) {
double x = (double(i) + 0.5) * dx;
???
});

// TODO: Copy result back to 'pi_h' to print result
rm.copy(???, ???, ???);
// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'pi_h' so that the result can be printed below
rm.copy( ??? , ??? , ??? );
pi_h[0] *= 4.0;

std::cout << "CUDA pi approximation " << " = "
<< std::setprecision(20) << pi_h[0] << std::endl;

device_allocator.deallocate(pi_d);
}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

host_allocator.deallocate(pi_h);

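For reference, the kernel being filled in above approximates pi with the midpoint rule applied to pi = 4 * integral from 0 to 1 of 1/(1 + x^2) dx: each index i contributes dx / (1 + x_i^2), with x_i at the center of its bin. A minimal host-only C++ sketch of the same sum (the bin count N here is an assumption; the lesson defines its own N and dx):

    #include <iomanip>
    #include <iostream>

    int main()
    {
      constexpr int N{1000000};   // assumed bin count; the lesson sets its own N
      const double dx{1.0 / N};   // width of one midpoint-rule bin

      double pi{0.0};
      for (int i = 0; i < N; ++i) {
        double x = (double(i) + 0.5) * dx;  // bin midpoint, as in the lesson kernel
        pi += dx / (1.0 + x * x);           // the term the CUDA kernel adds atomically
      }
      pi *= 4.0;

      std::cout << std::setprecision(20) << pi << std::endl;
      return 0;
    }
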
20 changes: 9 additions & 11 deletions Intro_Tutorial/lessons/07_raja_algs/07_raja_scan.cpp
@@ -55,8 +55,7 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement RAJA scan to run on CUDA device
{
constexpr int M{20};
@@ -73,21 +72,21 @@
}

// TODO: Create a device memory allocator, allocate array 'array_d'
// on the device, and initialize the array by copying the values from
// 'array_h' above.
// on the device, and initialize the device array by using the
// Umpire copy operation to copy the values from 'array_h'.
auto device_allocator = ???;
array_d = ???;
rm.copy(???, ???, ???);
rm.copy( ??? , ??? , ??? );

// TODO: Write a RAJA operation to do an exclusive in-place scan on a
// GPU using CUDA using the array 'array_d' and a maximum operation
constexpr std::size_t CUDA_BLOCK_SIZE{128};
RAJA::exclusive_scan_inplace<???>(
RAJA::exclusive_scan_inplace< ??? >(
???, RAJA::operators::maximum<int>{});

// TODO: Copy the results of your scan operation back to the array
// 'array_h' so they can be printing in the loop below.
rm.copy(???, ???, ???);
// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'array_h' so that the result can be printed below
rm.copy( ??? , ??? , ??? );

std::cout << "Output (exclusive (CUDA) in-place): ";
for (int i = 0; i < M; ++i) {
@@ -96,8 +95,7 @@ int main()
std::cout << std::endl;

}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

return 0;
}
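For intuition about the operation being requested above: an exclusive in-place scan with a maximum operator overwrites slot i with the maximum of all elements before position i, and slot 0 with the operator's identity. A host-only sketch of those semantics using std::exclusive_scan from C++17 (the 5-element input is an assumption, not the lesson's data):

    #include <algorithm>
    #include <iostream>
    #include <limits>
    #include <numeric>

    int main()
    {
      int array_h[5] = {3, 1, 4, 1, 5};  // assumed sample input

      // Exclusive scan, in place: slot i gets max(array_h[0..i-1]); slot 0 gets
      // the identity for maximum, i.e. the smallest representable int.
      std::exclusive_scan(std::begin(array_h), std::end(array_h), std::begin(array_h),
                          std::numeric_limits<int>::min(),
                          [](int a, int b) { return std::max(a, b); });

      for (int v : array_h) { std::cout << v << " "; }  // prints INT_MIN 3 3 4 4
      std::cout << std::endl;
      return 0;
    }
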
Changes to the corresponding solution file for 07_raja_atomic.cpp
@@ -40,35 +40,37 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement pi approximation to run on CUDA device
{
constexpr std::size_t CUDA_BLOCK_SIZE{256};

// TODO: Define CUDA execution policy and atomic policy
// TODO: Define CUDA execution policy, using the CUDA block size defined
// above, and define the atomic policy
using EXEC_POL = RAJA::cuda_exec<CUDA_BLOCK_SIZE>;
using ATOMIC_POL = RAJA::cuda_atomic;

pi_h[0] = 0.0;

auto device_allocator = rm.getAllocator("DEVICE");

// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above and use the Umpire memset operation to initialize the data
double* pi_d{nullptr};

// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above
pi_d = static_cast<double*>(device_allocator.allocate(1*sizeof(double)));

// TODO: Use the Umpire memset operation to initialize the data
rm.memset(pi_d, 0);

// TODO: Write a RAJA CUDA kernel to approximate pi
RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=] __device__ (int i) {
double x = (double(i) + 0.5) * dx;
RAJA::atomicAdd<ATOMIC_POL>( pi_d, dx / (1.0 + x * x) );
});

// TODO: Copy result back to 'pi_h' to print result

// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'pi_h' so that the result can be printed below
rm.copy(pi_h, pi_d, 1*sizeof(double));
pi_h[0] *= 4.0;

@@ -77,8 +79,7 @@ int main()

device_allocator.deallocate(pi_d);
}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

host_allocator.deallocate(pi_h);

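In the solution above, RAJA::atomicAdd<ATOMIC_POL> is what lets every CUDA thread safely accumulate into the single location pi_d. The same pattern can be exercised without a GPU by swapping in RAJA's sequential policies; a sketch, assuming a RAJA installation is available (seq_exec and seq_atomic are standard RAJA policies, but this host variant is not part of the lesson):

    #include <iomanip>
    #include <iostream>

    #include "RAJA/RAJA.hpp"

    int main()
    {
      constexpr int N{1000000};  // assumed bin count
      const double dx{1.0 / N};

      double pi{0.0};
      double* pi_p{&pi};  // the lambda captures this pointer by value

      // Same kernel body as the CUDA solution, but run serially on the host.
      RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=](int i) {
        double x = (double(i) + 0.5) * dx;
        RAJA::atomicAdd<RAJA::seq_atomic>(pi_p, dx / (1.0 + x * x));
      });

      std::cout << std::setprecision(20) << 4.0 * pi << std::endl;
      return 0;
    }
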
Changes to the corresponding solution file for 07_raja_scan.cpp
@@ -55,8 +55,7 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement RAJA scan to run on CUDA device
{
constexpr int M{20};
@@ -73,8 +72,8 @@ int main()
}

// TODO: Create a device memory allocator, allocate array 'array_d'
// on the device, and initialize the array by copying the values from
// 'array_h' above.
// on the device, and initialize the device array by using the
// Umpire copy operation to copy the values from 'array_h'.
auto device_allocator = rm.getAllocator("DEVICE");
array_d = static_cast<int*>(device_allocator.allocate(M*sizeof(int)));
rm.copy(array_d, array_h, M*sizeof(int));
@@ -85,8 +84,8 @@ int main()
RAJA::exclusive_scan_inplace<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
RAJA::make_span(array_d, M), RAJA::operators::maximum<int>{});

// TODO: Copy the results of your scan operation back to the array
// 'array_h' so they can be printing in the loop below.
// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'array_h' so that the result can be printed below
rm.copy(array_h, array_d, M*sizeof(int));

std::cout << "Output (exclusive (CUDA) in-place): ";
@@ -96,8 +95,7 @@ int main()
std::cout << std::endl;

}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

return 0;
}
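Both solutions lean on the same Umpire lifecycle: fetch the singleton ResourceManager, allocate through a named allocator, and move or initialize bytes with copy and memset, whose argument orders are (destination, source, bytes) and (pointer, value) respectively. A condensed sketch of that lifecycle, assuming an Umpire build with a DEVICE resource available:

    #include "umpire/ResourceManager.hpp"

    int main()
    {
      auto& rm = umpire::ResourceManager::getInstance();
      auto host_allocator = rm.getAllocator("HOST");
      auto device_allocator = rm.getAllocator("DEVICE");

      double* data_h = static_cast<double*>(host_allocator.allocate(sizeof(double)));
      double* data_d = static_cast<double*>(device_allocator.allocate(sizeof(double)));

      rm.memset(data_d, 0);                     // zero the device allocation
      rm.copy(data_h, data_d, sizeof(double));  // copy(dst, src, bytes): device -> host

      device_allocator.deallocate(data_d);
      host_allocator.deallocate(data_h);
      return 0;
    }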