25 changes: 14 additions & 11 deletions Intro_Tutorial/lessons/07_raja_algs/07_raja_atomic.cpp
@@ -40,43 +40,46 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement pi approximation to run on CUDA device
{
constexpr std::size_t CUDA_BLOCK_SIZE{256};

// TODO: Define CUDA execution policy and atomic policy
using EXEC_POL = ???<CUDA_BLOCK_SIZE>;
// TODO: Define CUDA execution policy, using the CUDA block size defined
// above, and define the atomic policy
using EXEC_POL = ???;
using ATOMIC_POL = ???;

pi_h[0] = 0.0;

auto device_allocator = rm.getAllocator("DEVICE");

// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above and use the Umpire memset operation to initialize the data
double* pi_d{nullptr};

rm.memset(???, ???);
// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above
pi_d = ???;

// TODO: Use the Umpire memset operation to initialize the data
rm.memset( ??? , ??? );

// TODO: Write a RAJA CUDA kernel to approximate pi
RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=] __device__ (int i) {
double x = (double(i) + 0.5) * dx;
???
});

// TODO: Copy result back to 'pi_h' to print result
rm.copy(???, ???, ???);
// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'pi_h' so that the result can be printed below
rm.copy( ??? , ??? , ??? );
pi_h[0] *= 4.0;

std::cout << "CUDA pi approximation " << " = "
<< std::setprecision(20) << pi_h[0] << std::endl;

device_allocator.deallocate(pi_d);
}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

host_allocator.deallocate(pi_h);

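For reference, the kernel being filled in above approximates pi with the midpoint rule applied to pi = 4 * integral from 0 to 1 of 1/(1 + x^2) dx: each index i contributes dx / (1 + x_i^2), with x_i at the center of its bin. A minimal host-only C++ sketch of the same sum (the bin count N here is an assumption; the lesson defines its own N and dx):

    #include <iomanip>
    #include <iostream>

    int main()
    {
      constexpr int N{1000000};   // assumed bin count; the lesson sets its own N
      const double dx{1.0 / N};   // width of one midpoint-rule bin

      double pi{0.0};
      for (int i = 0; i < N; ++i) {
        double x = (double(i) + 0.5) * dx;  // bin midpoint, as in the lesson kernel
        pi += dx / (1.0 + x * x);           // the term the CUDA kernel adds atomically
      }
      pi *= 4.0;

      std::cout << std::setprecision(20) << pi << std::endl;
      return 0;
    }
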
20 changes: 9 additions & 11 deletions Intro_Tutorial/lessons/07_raja_algs/07_raja_scan.cpp
@@ -55,8 +55,7 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement RAJA scan to run on CUDA device
{
constexpr int M{20};
@@ -73,21 +72,21 @@
}

// TODO: Create a device memory allocator, allocate array 'array_d'
// on the device, and initialize the array by copying the values from
// 'array_h' above.
// on the device, and initialize the device array by using the
// Umpire copy operation to copy the values from 'array_h'.
auto device_allocator = ???;
array_d = ???;
rm.copy(???, ???, ???);
rm.copy( ??? , ??? , ??? );

// TODO: Write a RAJA operation to do an exclusive in-place scan on a
// GPU using CUDA using the array 'array_d' and a maximum operation
constexpr std::size_t CUDA_BLOCK_SIZE{128};
RAJA::exclusive_scan_inplace<???>(
RAJA::exclusive_scan_inplace< ??? >(
???, RAJA::operators::maximum<int>{});

// TODO: Copy the results of your scan operation back to the array
// 'array_h' so they can be printing in the loop below.
rm.copy(???, ???, ???);
// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'array_h' so that the result can be printed below
rm.copy( ??? , ??? , ??? );

std::cout << "Output (exclusive (CUDA) in-place): ";
for (int i = 0; i < M; ++i) {
@@ -96,8 +95,7 @@ int main()
std::cout << std::endl;

}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

return 0;
}
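For intuition about the operation being requested above: an exclusive in-place scan with a maximum operator overwrites slot i with the maximum of all elements before position i, and slot 0 with the operator's identity. A host-only sketch of those semantics using std::exclusive_scan from C++17 (the 5-element input is an assumption, not the lesson's data):

    #include <algorithm>
    #include <iostream>
    #include <limits>
    #include <numeric>

    int main()
    {
      int array_h[5] = {3, 1, 4, 1, 5};  // assumed sample input

      // Exclusive scan, in place: slot i gets max(array_h[0..i-1]); slot 0 gets
      // the identity for maximum, i.e. the smallest representable int.
      std::exclusive_scan(std::begin(array_h), std::end(array_h), std::begin(array_h),
                          std::numeric_limits<int>::min(),
                          [](int a, int b) { return std::max(a, b); });

      for (int v : array_h) { std::cout << v << " "; }  // prints INT_MIN 3 3 4 4
      std::cout << std::endl;
      return 0;
    }
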
Changes to the corresponding solution file for 07_raja_atomic.cpp
@@ -40,35 +40,37 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement pi approximation to run on CUDA device
{
constexpr std::size_t CUDA_BLOCK_SIZE{256};

// TODO: Define CUDA execution policy and atomic policy
// TODO: Define CUDA execution policy, using the CUDA block size defined
// above, and define the atomic policy
using EXEC_POL = RAJA::cuda_exec<CUDA_BLOCK_SIZE>;
using ATOMIC_POL = RAJA::cuda_atomic;

pi_h[0] = 0.0;

auto device_allocator = rm.getAllocator("DEVICE");

// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above and use the Umpire memset operation to initialize the data
double* pi_d{nullptr};

// TODO: Allocate device data for 'pi_d' using the device allocator
// defined above
pi_d = static_cast<double*>(device_allocator.allocate(1*sizeof(double)));

// TODO: Use the Umpire memset operation to initialize the data
rm.memset(pi_d, 0);

// TODO: Write a RAJA CUDA kernel to approximate pi
RAJA::forall<EXEC_POL>(RAJA::TypedRangeSegment<int>(0, N), [=] __device__ (int i) {
double x = (double(i) + 0.5) * dx;
RAJA::atomicAdd<ATOMIC_POL>( pi_d, dx / (1.0 + x * x) );
});

// TODO: Copy result back to 'pi_h' to print result

// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'pi_h' so that the result can be printed below
rm.copy(pi_h, pi_d, 1*sizeof(double));
pi_h[0] *= 4.0;

@@ -77,8 +79,7 @@ int main()

device_allocator.deallocate(pi_d);
}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

host_allocator.deallocate(pi_h);

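In the solution above, RAJA::atomicAdd<ATOMIC_POL> is what lets every CUDA thread safely accumulate into the single location pi_d. The same pattern can be exercised without a GPU by swapping in RAJA's sequential policies; a sketch, assuming a RAJA installation is available (seq_exec and seq_atomic are standard RAJA policies, but this host variant is not part of the lesson):

    #include <iomanip>
    #include <iostream>

    #include "RAJA/RAJA.hpp"

    int main()
    {
      constexpr int N{1000000};  // assumed bin count
      const double dx{1.0 / N};

      double pi{0.0};
      double* pi_p{&pi};  // the lambda captures this pointer by value

      // Same kernel body as the CUDA solution, but run serially on the host.
      RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N), [=](int i) {
        double x = (double(i) + 0.5) * dx;
        RAJA::atomicAdd<RAJA::seq_atomic>(pi_p, dx / (1.0 + x * x));
      });

      std::cout << std::setprecision(20) << 4.0 * pi << std::endl;
      return 0;
    }
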
Changes to the corresponding solution file for 07_raja_scan.cpp
@@ -55,8 +55,7 @@ int main()
}
#endif

#if defined(COMPILE)
#if defined(RAJA_ENABLE_CUDA)
#if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)
// TODO: Implement RAJA scan to run on CUDA device
{
constexpr int M{20};
@@ -73,8 +72,8 @@ int main()
}

// TODO: Create a device memory allocator, allocate array 'array_d'
// on the device, and initialize the array by copying the values from
// 'array_h' above.
// on the device, and initialize the device array by using the
// Umpire copy operation to copy the values from 'array_h'.
auto device_allocator = rm.getAllocator("DEVICE");
array_d = static_cast<int*>(device_allocator.allocate(M*sizeof(int)));
rm.copy(array_d, array_h, M*sizeof(int));
@@ -85,8 +84,8 @@ int main()
RAJA::exclusive_scan_inplace<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
RAJA::make_span(array_d, M), RAJA::operators::maximum<int>{});

// TODO: Copy the results of your scan operation back to the array
// 'array_h' so they can be printing in the loop below.
// TODO: Use the Umpire copy operation to copy the result in device memory
// to the host array 'array_h' so that the result can be printed below
rm.copy(array_h, array_d, M*sizeof(int));

std::cout << "Output (exclusive (CUDA) in-place): ";
@@ -96,8 +95,7 @@ int main()
std::cout << std::endl;

}
#endif // if defined(RAJA_ENABLE_CUDA)
#endif // if defined(COMPILE)
#endif // if defined(COMPILE) && defined(RAJA_ENABLE_CUDA)

return 0;
}
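Both solutions lean on the same Umpire lifecycle: fetch the singleton ResourceManager, allocate through a named allocator, and move or initialize bytes with copy and memset, whose argument orders are (destination, source, bytes) and (pointer, value) respectively. A condensed sketch of that lifecycle, assuming an Umpire build with a DEVICE resource available:

    #include "umpire/ResourceManager.hpp"

    int main()
    {
      auto& rm = umpire::ResourceManager::getInstance();
      auto host_allocator = rm.getAllocator("HOST");
      auto device_allocator = rm.getAllocator("DEVICE");

      double* data_h = static_cast<double*>(host_allocator.allocate(sizeof(double)));
      double* data_d = static_cast<double*>(device_allocator.allocate(sizeof(double)));

      rm.memset(data_d, 0);                     // zero the device allocation
      rm.copy(data_h, data_d, sizeof(double));  // copy(dst, src, bytes): device -> host

      device_allocator.deallocate(data_d);
      host_allocator.deallocate(data_h);
      return 0;
    }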