diff --git a/Intro_Tutorial/lessons/07_raja_algs/07_raja_atomic.cpp b/Intro_Tutorial/lessons/07_raja_algs/07_raja_atomic.cpp
index 1ac3b25..b20eb7e 100644
--- a/Intro_Tutorial/lessons/07_raja_algs/07_raja_atomic.cpp
+++ b/Intro_Tutorial/lessons/07_raja_algs/07_raja_atomic.cpp
@@ -40,25 +40,28 @@ int main()
   }
 #endif
 
-#if defined(COMPILE)
-#if defined(RAJA_ENABLE_CUDA)
+#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
   // TODO: Implement pi approximation to run on CUDA device
   {
     constexpr std::size_t CUDA_BLOCK_SIZE{256};
 
-    // TODO: Define CUDA execution policy and atomic policy
-    using EXEC_POL = ???;
+    // TODO: Define CUDA execution policy, using the CUDA block size defined
+    // above, and define the atomic policy
+    using EXEC_POL = ???;
     using ATOMIC_POL = ???;
 
     pi_h[0] = 0.0;
 
     auto device_allocator = rm.getAllocator("DEVICE");
 
-    // TODO: Allocate device data for 'pi_d' using the device allocator
-    // defined above and use the Umpire memset operation to initialize the data
     double* pi_d{nullptr};
-    rm.memset(???, ???);
+    // TODO: Allocate device data for 'pi_d' using the device allocator
+    // defined above
+    pi_d = ???;
+
+    // TODO: Use the Umpire memset operation to initialize the data
+    rm.memset( ??? , ??? );
 
     // TODO: Write a RAJA CUDA kernel to approximate pi
     RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=] __device__ (int i) {
@@ -66,8 +69,9 @@ int main()
       ???
     });
 
-    // TODO: Copy result back to 'pi_h' to print result
-    rm.copy(???, ???, ???);
+    // TODO: Use the Umpire copy operation to copy the result in device memory
+    // to the host array 'pi_h' so that the result can be printed below
+    rm.copy( ??? , ??? , ??? );
 
     pi_h[0] *= 4.0;
     std::cout << "CUDA pi approximation " << " = "
@@ -75,8 +79,7 @@ int main()
     device_allocator.deallocate(pi_d);
   }
 
-#endif // if defined(RAJA_ENABLE_CUDA)
-#endif // if defined(COMPILE)
+#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
 
   host_allocator.deallocate(pi_h);
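For reference while filling in the TODOs above, the completed CUDA block looks roughly like the following. This is a sketch that mirrors the solution file 07_raja_atomic_solution.cpp later in this patch; it assumes rm, pi_h, N, and dx are defined earlier in the lesson file, as they are in the tutorial.

```cpp
// Sketch of the filled-in exercise; mirrors the solution file in this patch.
// Assumes rm (Umpire ResourceManager), pi_h, N, and dx come from earlier in
// the lesson file.
using EXEC_POL   = RAJA::cuda_exec<CUDA_BLOCK_SIZE>;
using ATOMIC_POL = RAJA::cuda_atomic;

double* pi_d = static_cast<double*>(device_allocator.allocate(1*sizeof(double)));
rm.memset(pi_d, 0);  // zero the one device double before accumulating

// Midpoint-rule quadrature of 1/(1+x^2) on [0,1]; every thread adds its
// strip atomically, and the final *= 4.0 on the host yields pi.
RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=] __device__ (int i) {
  double x = (double(i) + 0.5) * dx;
  RAJA::atomicAdd<ATOMIC_POL>(pi_d, dx / (1.0 + x * x));
});

rm.copy(pi_h, pi_d, 1*sizeof(double));  // device result back to host
```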
diff --git a/Intro_Tutorial/lessons/07_raja_algs/07_raja_scan.cpp b/Intro_Tutorial/lessons/07_raja_algs/07_raja_scan.cpp
index 027355e..d0f089f 100644
--- a/Intro_Tutorial/lessons/07_raja_algs/07_raja_scan.cpp
+++ b/Intro_Tutorial/lessons/07_raja_algs/07_raja_scan.cpp
@@ -55,8 +55,7 @@ int main()
   }
 #endif
 
-#if defined(COMPILE)
-#if defined(RAJA_ENABLE_CUDA)
+#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
   // TODO: Implement RAJA scan to run on CUDA device
   {
     constexpr int M{20};
@@ -73,21 +72,21 @@ int main()
     }
 
     // TODO: Create a device memory allocator, allocate array 'array_d'
-    // on the device, and initialize the array by copying the values from
-    // 'array_h' above.
+    // on the device, and initialize the device array by using the
+    // Umpire copy operation to copy the values from 'array_h'.
     auto device_allocator = ???;
     array_d = ???;
-    rm.copy(???, ???, ???);
+    rm.copy( ??? , ??? , ??? );
 
     // TODO: Write a RAJA operation to do an exclusive in-place scan on a
     // GPU using CUDA using the array 'array_d' and a maximum operation
     constexpr std::size_t CUDA_BLOCK_SIZE{128};
-    RAJA::exclusive_scan_inplace(
+    RAJA::exclusive_scan_inplace< ??? >(
       ???,
       RAJA::operators::maximum<int>{});
 
-    // TODO: Copy the results of your scan operation back to the array
-    // 'array_h' so they can be printing in the loop below.
-    rm.copy(???, ???, ???);
+    // TODO: Use the Umpire copy operation to copy the result in device memory
+    // to the host array 'array_h' so that the result can be printed below
+    rm.copy( ??? , ??? , ??? );
 
     std::cout << "Output (exclusive (CUDA) in-place): ";
     for (int i = 0; i < M; ++i) {
@@ -96,8 +95,7 @@ int main()
     std::cout << std::endl;
   }
 
-#endif // if defined(RAJA_ENABLE_CUDA)
-#endif // if defined(COMPILE)
+#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
 
   return 0;
 }

diff --git a/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_atomic_solution.cpp b/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_atomic_solution.cpp
index c370c9c..b772e87 100644
--- a/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_atomic_solution.cpp
+++ b/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_atomic_solution.cpp
@@ -40,26 +40,27 @@ int main()
   }
 #endif
 
-#if defined(COMPILE)
-#if defined(RAJA_ENABLE_CUDA)
+#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
   // TODO: Implement pi approximation to run on CUDA device
   {
     constexpr std::size_t CUDA_BLOCK_SIZE{256};
 
-    // TODO: Define CUDA execution policy and atomic policy
+    // TODO: Define CUDA execution policy, using the CUDA block size defined
+    // above, and define the atomic policy
     using EXEC_POL = RAJA::cuda_exec<CUDA_BLOCK_SIZE>;
     using ATOMIC_POL = RAJA::cuda_atomic;
 
     pi_h[0] = 0.0;
 
-    auto device_allocator = rm.getAllocator("DEVICE");
+    auto device_allocator = rm.getAllocator("DEVICE");
 
-    // TODO: Allocate device data for 'pi_d' using the device allocator
-    // defined above and use the Umpire memset operation to initialize the data
     double* pi_d{nullptr};
+    // TODO: Allocate device data for 'pi_d' using the device allocator
+    // defined above
     pi_d = static_cast<double*>(device_allocator.allocate(1*sizeof(double)));
 
+    // TODO: Use the Umpire memset operation to initialize the data
     rm.memset(pi_d, 0);
 
     // TODO: Write a RAJA CUDA kernel to approximate pi
@@ -67,8 +68,9 @@ int main()
       double x = (double(i) + 0.5) * dx;
       RAJA::atomicAdd<ATOMIC_POL>( pi_d, dx / (1.0 + x * x) );
     });
-
-    // TODO: Copy result back to 'pi_h' to print result
+
+    // TODO: Use the Umpire copy operation to copy the result in device memory
+    // to the host array 'pi_h' so that the result can be printed below
     rm.copy(pi_h, pi_d, 1*sizeof(double));
 
     pi_h[0] *= 4.0;
@@ -77,8 +79,7 @@ int main()
     device_allocator.deallocate(pi_d);
   }
 
-#endif // if defined(RAJA_ENABLE_CUDA)
-#endif // if defined(COMPILE)
+#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
 
   host_allocator.deallocate(pi_h);
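The scan solution below uses RAJA::exclusive_scan_inplace with a maximum operator on the device. To see what that operation produces independent of CUDA, here is a minimal host-only sketch; it uses RAJA::seq_exec instead of a CUDA policy, and the input values are made up for illustration.

```cpp
#include <iostream>
#include "RAJA/RAJA.hpp"

int main()
{
  // Hypothetical input, chosen only to illustrate the scan semantics.
  int a[6] = {3, 1, 4, 1, 5, 2};

  // Exclusive in-place scan with maximum: element i becomes the maximum of
  // the original a[0..i-1]; element 0 becomes the operator's identity
  // (the lowest representable int).
  RAJA::exclusive_scan_inplace<RAJA::seq_exec>(
      RAJA::make_span(a, 6),
      RAJA::operators::maximum<int>{});

  // Prints: <lowest int> 3 3 4 4 5
  for (int v : a) { std::cout << v << " "; }
  std::cout << std::endl;
  return 0;
}
```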
diff --git a/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_scan_solution.cpp b/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_scan_solution.cpp
index 549c934..14fc06a 100644
--- a/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_scan_solution.cpp
+++ b/Intro_Tutorial/lessons/07_raja_algs/solution/07_raja_scan_solution.cpp
@@ -55,8 +55,7 @@ int main()
   }
 #endif
 
-#if defined(COMPILE)
-#if defined(RAJA_ENABLE_CUDA)
+#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
   // TODO: Implement RAJA scan to run on CUDA device
   {
     constexpr int M{20};
@@ -73,8 +72,8 @@ int main()
     }
 
     // TODO: Create a device memory allocator, allocate array 'array_d'
-    // on the device, and initialize the array by copying the values from
-    // 'array_h' above.
+    // on the device, and initialize the device array by using the
+    // Umpire copy operation to copy the values from 'array_h'.
     auto device_allocator = rm.getAllocator("DEVICE");
     array_d = static_cast<int*>(device_allocator.allocate(M*sizeof(int)));
     rm.copy(array_d, array_h, M*sizeof(int));
@@ -85,8 +84,8 @@ int main()
     RAJA::exclusive_scan_inplace<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
       RAJA::make_span(array_d, M),
       RAJA::operators::maximum<int>{});
 
-    // TODO: Copy the results of your scan operation back to the array
-    // 'array_h' so they can be printing in the loop below.
+    // TODO: Use the Umpire copy operation to copy the result in device memory
+    // to the host array 'array_h' so that the result can be printed below
     rm.copy(array_h, array_d, M*sizeof(int));
     std::cout << "Output (exclusive (CUDA) in-place): ";
@@ -96,8 +95,7 @@ int main()
     std::cout << std::endl;
   }
 
-#endif // if defined(RAJA_ENABLE_CUDA)
-#endif // if defined(COMPILE)
+#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
 
   return 0;
 }
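All four files reference names that sit outside these hunks (rm, host_allocator, pi_h, array_h, N, dx). For completeness, a minimal sketch of the surrounding setup for the atomic lesson, assuming the standard Umpire API used throughout the tutorial; the value of N here is an assumption, not taken from the lesson source.

```cpp
#include "RAJA/RAJA.hpp"
#include "umpire/ResourceManager.hpp"

int main()
{
  // Umpire resource manager and host allocator used by the hunks above.
  auto& rm = umpire::ResourceManager::getInstance();
  auto host_allocator = rm.getAllocator("HOST");

  constexpr int N{512 * 512};        // number of quadrature bins (assumed)
  const double dx = 1.0 / double(N); // width of one quadrature strip

  // One-element host array that receives the pi approximation.
  double* pi_h = static_cast<double*>(host_allocator.allocate(1*sizeof(double)));

  // ... host/OpenMP/CUDA blocks from the patch go here ...

  host_allocator.deallocate(pi_h);
  return 0;
}
```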