diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91e92bf..d9762ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,3 +26,4 @@ endif()
 add_subdirectory(tpl)
 add_subdirectory(Intro_Tutorial)
 add_subdirectory(Intermediate_Tutorial)
+add_subdirectory(Profile_Demo)
diff --git a/Profile_Demo/CMakeLists.txt b/Profile_Demo/CMakeLists.txt
new file mode 100644
index 0000000..84739cd
--- /dev/null
+++ b/Profile_Demo/CMakeLists.txt
@@ -0,0 +1,6 @@
+if (ENABLE_CUDA)
+  blt_add_executable(
+    NAME profile_raja
+    SOURCES profile_raja.cpp
+    DEPENDS_ON RAJA umpire cuda)
+endif()
diff --git a/Profile_Demo/README.md b/Profile_Demo/README.md
new file mode 100644
index 0000000..8b77a81
--- /dev/null
+++ b/Profile_Demo/README.md
@@ -0,0 +1,23 @@
+# Basic RAJA profiling with Caliper
+
+In this example, we explore profiling RAJA kernels using the Caliper library developed at LLNL.
+Below are example build commands you can use to configure Caliper and RAJA for profiling on NVIDIA GPUs.
+
+Building Caliper on an NVIDIA platform:
+``cmake -DCMAKE_INSTALL_PREFIX=${caliper_path} -DWITH_NVTX=ON -DWITH_CUPTI=ON ../``
+
+Building RAJA:
+``cmake -DENABLE_CUDA=ON -DRAJA_ENABLE_RUNTIME_PLUGINS=ON -DRAJA_ENABLE_CALIPER=ON -Dcaliper_DIR=${caliper_path}/build/share/cmake/caliper -DCMAKE_CUDA_FLAGS="--expt-extended-lambda" ../ && make profile_raja -j``
+
+Once the suite is built, you can invoke the following command to profile a set of basic linear algebra kernels:
+
+``CALI_CONFIG=runtime-report ./bin/profile_raja 1024``
+
+This example provides three different kernel policies, allowing users to observe runtime performance differences between them.
+To switch between policies, uncomment the desired ``#define`` at the top of ``profile_raja.cpp``.
+
+For more information on Caliper, we refer the reader to the following pages:
+
+- [RAJA-Caliper Quick Start Documentation](https://raja.readthedocs.io/en/develop/sphinx/user_guide/profiling_with_caliper.html)
+- [Caliper GitHub](https://github.com/LLNL/Caliper)
+- [Caliper Documentation](https://software.llnl.gov/Caliper/)
\ No newline at end of file
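
[Editor note] For readers new to Caliper, the sketch below shows the manual annotations that the plugin added in this PR automates for every named RAJA kernel. It is a minimal sketch, not part of the PR: the `daxpy` loop and region name are illustrative, and it assumes only an installed Caliper visible to the compiler.

```cpp
// Manual Caliper regions: what CaliperPlugin::preLaunch/postLaunch (below)
// do automatically around each RAJA kernel that carries a RAJA::Name.
#include <caliper/cali.h>

#include <cstddef>
#include <vector>

int main()
{
  std::vector<double> x(1024, 1.0), y(1024, 2.0);

  CALI_MARK_BEGIN("daxpy");   // plugin equivalent: preLaunch
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] += 3.0 * x[i];
  }
  CALI_MARK_END("daxpy");     // plugin equivalent: postLaunch

  return 0;
}
```

Running this under ``CALI_CONFIG=runtime-report`` prints the time spent in the ``daxpy`` region, the same kind of report the README's command produces for the demo's kernels.
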
diff --git a/Profile_Demo/caliper-plugin.cpp b/Profile_Demo/caliper-plugin.cpp
new file mode 100644
index 0000000..63c3017
--- /dev/null
+++ b/Profile_Demo/caliper-plugin.cpp
@@ -0,0 +1,37 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include "RAJA/util/PluginStrategy.hpp"
+
+#include <caliper/cali.h>
+#include <string>
+
+class CaliperPlugin : public RAJA::util::PluginStrategy
+{
+public:
+  // Open a Caliper region named after the kernel before it launches.
+  void preLaunch(const RAJA::util::PluginContext& p) override
+  {
+    if (!p.kernel_name.empty()) CALI_MARK_BEGIN(p.kernel_name.c_str());
+  }
+
+  // Close the matching region once the launch returns.
+  void postLaunch(const RAJA::util::PluginContext& p) override
+  {
+    if (!p.kernel_name.empty()) CALI_MARK_END(p.kernel_name.c_str());
+  }
+};
+
+// Entry point used when the plugin is loaded dynamically.
+extern "C" RAJA::util::PluginStrategy *RAJAGetPlugin()
+{
+  return new CaliperPlugin;
+}
+
+// Registration used when the plugin is linked statically.
+static RAJA::util::PluginRegistry::add<CaliperPlugin> P(
+    "Caliper", "Enables Caliper Profiling");
diff --git a/Profile_Demo/profile_raja.cpp b/Profile_Demo/profile_raja.cpp
new file mode 100644
index 0000000..68faadc
--- /dev/null
+++ b/Profile_Demo/profile_raja.cpp
@@ -0,0 +1,165 @@
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "RAJA/RAJA.hpp"
+#include "umpire/Umpire.hpp"
+
+// Compile the Caliper plugin into this translation unit so its static
+// registration runs at program startup.
+#include "caliper-plugin.cpp"
+
+// Uncomment exactly one of the following to select a kernel policy.
+#define DIRECT_POLICY
+//#define LOOP_POLICY
+//#define GLOBAL_POLICY
+
+constexpr int max_threads = 1024;
+constexpr bool async = false;
+using forall_pol = RAJA::cuda_exec<max_threads, async>;
+using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async>>;
+
+void init(double *A, double *B, double *C, int m, int n) {
+
+  RAJA::forall<forall_pol>(RAJA::RangeSegment(0, m * n),
+    RAJA::Name("init"),
+    [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
+      A[i] = 1.0;
+      B[i] = 1.0;
+      C[i] = 0.0;
+    });
+}
+
+void matrix_add(const double *A, const double *B, double *C, int m, int n) {
+
+  RAJA::forall<forall_pol>
+    (RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_add"),
+     [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
+      C[i] = A[i] + B[i];
+    });
+}
+
+void matrix_scalar_mult(const double *A, double *B, double scalar, int m, int n) {
+
+  RAJA::forall<forall_pol>
+    (RAJA::RangeSegment(0, m * n), RAJA::Name("matrix_scalar_mult"),
+     [=] RAJA_HOST_DEVICE (RAJA::Index_type i) {
+      B[i] = scalar * A[i];
+    });
+}
+
+void matrix_multiply(const double *A, const double *B, double *C, int m, int n, int p) {
+
+  // A: m x n, B: n x p, C: m x p
+  auto v_A = RAJA::make_permuted_view<RAJA::layout_right>(A, m, n);
+  auto v_B = RAJA::make_permuted_view<RAJA::layout_right>(B, n, p);
+  auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, m, p);
+
+#if defined(DIRECT_POLICY)
+  // One team per row, one thread per column; both loops map directly.
+  const int threads = p;
+  const int teams = m;
+
+  RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};
+
+  using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
+  using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
+#endif
+
+#if defined(LOOP_POLICY)
+  // Fixed thread count; the inner loop strides across columns.
+  const int threads = 256;
+  const int teams = m;
+
+  RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};
+
+  using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
+  using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
+#endif
+
+#if defined(GLOBAL_POLICY)
+  // 2D grid of 16x16 teams; each loop maps to a global thread index.
+  const int threads = 16;
+  const int teams_x = (n - 1)/threads + 1;
+  const int teams_y = (m - 1)/threads + 1;
+
+  RAJA::LaunchParams params{RAJA::Teams(teams_x, teams_y), RAJA::Threads(threads, threads)};
+
+  using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;
+  using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>;
+#endif
+
+  RAJA::launch<launch_pol>
+    (params, RAJA::Name("matrix_multiply"),
+     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
+
+      RAJA::loop<loop0_pol>(ctx, RAJA::RangeSegment(0, m), [&] (int i) {
+        RAJA::loop<loop1_pol>(ctx, RAJA::RangeSegment(0, p), [&] (int j) {
+
+          double dot = 0.0;
+          for (int k = 0; k < n; k++) {
+            dot += v_A(i, k) * v_B(k, j);
+          }
+          v_C(i, j) = dot;
+        });
+      });
+    });
+}
+
+bool check_matrix_multiply(const double *C, const int n)
+{
+  bool pass = true;
+  auto v_C = RAJA::make_permuted_view<RAJA::layout_right>(C, n, n);
+
+  // A and B are all ones, so every entry of C = A * B should equal n.
+  for (int r = 0; r < n; ++r) {
+    for (int c = 0; c < n; ++c) {
+      if (v_C(r, c) != n) { pass = false; }
+    }
+  }
+
+  return pass;
+}
+
+int main(int argc, char *argv[])
+{
+  if (argc != 2) {
+    std::cout << "Usage: ./profile_raja N" << std::endl;
+    return 1;
+  }
+
+  const int n = std::stoi(argv[1]);
+
+  // Unified memory so kernels run on the device and the check runs on the host.
+  auto &rm = umpire::ResourceManager::getInstance();
+  auto allocator = rm.getAllocator("UM");
+
+  double *A, *B, *C;
+  A = static_cast<double *>(allocator.allocate(n*n*sizeof(double)));
+  B = static_cast<double *>(allocator.allocate(n*n*sizeof(double)));
+  C = static_cast<double *>(allocator.allocate(n*n*sizeof(double)));
+
+  init(A, B, C, n, n);
+
+  matrix_add(A, B, C, n, n);
+
+  matrix_scalar_mult(A, C, 2.0, n, n);
+
+  matrix_multiply(A, B, C, n, n, n);
+
+  bool pass = check_matrix_multiply(C, n);
+
+  if (!pass) {
+    throw std::runtime_error("matrix_multiply did not pass");
+  }
+
+  std::cout << "Matrix multiply passed" << std::endl;
+
+  allocator.deallocate(A);
+  allocator.deallocate(B);
+  allocator.deallocate(C);
+
+  return 0;
+}
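
[Editor note] The plugin is compiled directly into the executable (`profile_raja.cpp` does `#include "caliper-plugin.cpp"`), so the static `PluginRegistry::add` registration runs before `main`. The same hooks generalize to other tools. Below is a minimal, hypothetical second plugin that just counts kernel launches, using only the `PluginStrategy` API already shown in this PR; the class and variable names are illustrative, and it assumes the same RAJA configuration (`RAJA_ENABLE_RUNTIME_PLUGINS=ON`, `RAJA_ENABLE_CALIPER=ON` for `kernel_name`) as this demo.

```cpp
// counter-plugin.cpp (hypothetical): count RAJA kernel launches via the same
// preLaunch/postLaunch hooks the Caliper plugin uses.
#include "RAJA/util/PluginStrategy.hpp"

#include <iostream>

class CounterPlugin : public RAJA::util::PluginStrategy
{
public:
  // Bump the counter before each kernel launch.
  void preLaunch(const RAJA::util::PluginContext&) override { ++count_; }

  // Report the running count and the kernel's name after each launch.
  void postLaunch(const RAJA::util::PluginContext& p) override
  {
    std::cout << "launch " << count_ << ": " << p.kernel_name << "\n";
  }

private:
  int count_ = 0;
};

// Same static-registration pattern as the Caliper plugin.
static RAJA::util::PluginRegistry::add<CounterPlugin> reg(
    "Counter", "Counts RAJA kernel launches");
```
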