diff --git a/Profile_Demo/profile_raja.cpp b/Profile_Demo/profile_raja.cpp
index 68faadc..3d02d91 100644
--- a/Profile_Demo/profile_raja.cpp
+++ b/Profile_Demo/profile_raja.cpp
@@ -6,16 +6,26 @@
 #include "caliper-plugin.cpp"
 
+// Basic profiling:
+// CALI_CONFIG=runtime-report ./bin/profile_raja 1024
+//
+// Show CUDA activity:
+// CALI_CONFIG=cuda-activity-report,show_kernels ./bin/profile_raja 1024
+//
+// Generate an nsys profile:
+// CALI_SERVICES_ENABLE=nvtx,cuda nsys profile -o my_profile ./bin/profile_raja 1024
+
 //Uncomment for policy selection
 #define DIRECT_POLICY
 //#define LOOP_POLICY
 //#define GLOBAL_POLICY
 
-constexpr int max_threads = 1024;
+constexpr int cuda_threads = 256;
+constexpr int launch_max_threads = 256; //TODO add to cuda_launch_t
 constexpr bool async = false;
-using forall_pol = RAJA::cuda_exec<max_threads, async>;
-using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, max_threads>>;
+using forall_pol = RAJA::cuda_exec<cuda_threads, async>;
+using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async>>;
 
 void init(double *A, double *B, double *C, int m, int n)
 {
@@ -58,7 +68,10 @@ void matrix_multiply(const double *A, const double *B, double *C, int m, int n,
 
   RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};
 
+  //Rows: if (int i = blockIdx.x; i < m)
   using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
+
+  //Cols: if (int j = threadIdx.x; j < p)
   using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
 #endif
@@ -68,19 +81,25 @@
 
   RAJA::LaunchParams params{RAJA::Teams(teams), RAJA::Threads(threads)};
 
+  //Rows: if (int i = blockIdx.x; i < m; i += gridDim.x)
   using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;
+
+  //Cols: if (int j = threadIdx.x; j < p; j += blockDim.x)
   using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
 #endif
 
 #if defined(GLOBAL_POLICY)
-  const int threads = 16;
+  constexpr int threads = 16;
   const int teams_x = (n - 1)/threads + 1;
   const int teams_y = (m - 1)/threads + 1;
 
   RAJA::LaunchParams params{RAJA::Teams(teams_x, teams_y), RAJA::Threads(threads, threads)};
 
-  using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>;
-  using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;
+  //Rows: int i = threadIdx.y + blockIdx.y * blockDim.y
+  using loop1_pol = RAJA::LoopPolicy<RAJA::cuda_global_size_y_direct<threads>>;
+
+  //Cols: int j = threadIdx.x + blockIdx.x * blockDim.x
+  using loop0_pol = RAJA::LoopPolicy<RAJA::cuda_global_size_x_direct<threads>>;
 #endif
 
   RAJA::launch<launch_pol>
@@ -128,27 +147,38 @@ int main(int argc, char* argv[])
 
   int n = std::atoi(argv[1]);
   std::cout<<"Using matrix size "<<n<<" x "<<n<<std::endl;
 
   auto& rm = umpire::ResourceManager::getInstance();
-  auto allocator = rm.getAllocator("UM");
-
-  double *A, *B, *C;
+  auto host_allocator = rm.getAllocator("HOST");
+  auto device_allocator = rm.getAllocator("DEVICE");
+
+  double *h_A, *h_B, *h_C;
+  double *d_A, *d_B, *d_C;
 
+  h_A = static_cast<double*>(host_allocator.allocate(n*n*sizeof(double)));
+  h_B = static_cast<double*>(host_allocator.allocate(n*n*sizeof(double)));
+  h_C = static_cast<double*>(host_allocator.allocate(n*n*sizeof(double)));
+
+  d_A = static_cast<double*>(device_allocator.allocate(n*n*sizeof(double)));
+  d_B = static_cast<double*>(device_allocator.allocate(n*n*sizeof(double)));
+  d_C = static_cast<double*>(device_allocator.allocate(n*n*sizeof(double)));
 
-  A = static_cast<double*>(allocator.allocate(n*n*sizeof(double)));
-  B = static_cast<double*>(allocator.allocate(n*n*sizeof(double)));
-  C = static_cast<double*>(allocator.allocate(n*n*sizeof(double)));
+  init(d_A, d_B, d_C, n, n);
 
-  init(A, B, C, n, n);
+  matrix_add(d_A, d_B, d_C, n, n);
 
-  matrix_add(A, B, C, n, n);
+  matrix_scalar_mult(d_A, d_C, 2.0, n, n);
 
-  matrix_scalar_mult(A, C, 2.0, n, n);
+  matrix_multiply(d_A, d_B, d_C, n, n, n);
 
-  matrix_multiply(A, B, C, n, n, n);
+  rm.copy(h_C, d_C, n*n*sizeof(double));
 
-  bool pass = check_matrix_multiply(C, n);
+  bool pass = check_matrix_multiply(h_C, n);
 
   if(!pass) {
     throw std::runtime_error("matrix_multiply did not pass");
@@ -156,10 +186,13 @@ int main(int argc, char* argv[])
 
   std::cout<<"Matrix multiply passed"<<std::endl;
 
-  allocator.deallocate(A);
-  allocator.deallocate(B);
-  allocator.deallocate(C);
+  host_allocator.deallocate(h_A);
+  host_allocator.deallocate(h_B);
+  host_allocator.deallocate(h_C);
+  device_allocator.deallocate(d_A);
+  device_allocator.deallocate(d_B);
+  device_allocator.deallocate(d_C);
 
   return 0;
 }
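
Note on how the selected policies are consumed (not shown in the diff): the sketch below is a minimal version of RAJA's canonical launch pattern, assuming matrix_multiply takes extents (m, n, p) with row-major storage; the policy aliases, params, and extents come from the diff, while the kernel body and indexing are illustrative assumptions, not the file's actual implementation.

    // Hypothetical kernel body: loop1_pol maps rows, loop0_pol maps columns,
    // so the same loop nest compiles to each of the three thread mappings.
    RAJA::launch<launch_pol>(params,
      [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
        RAJA::loop<loop1_pol>(ctx, RAJA::RangeSegment(0, m), [&] (int i) {
          RAJA::loop<loop0_pol>(ctx, RAJA::RangeSegment(0, p), [&] (int j) {
            double dot = 0.0;
            for (int k = 0; k < n; ++k) {
              dot += A[k + i*n] * B[j + k*p];  // assumed row-major layout
            }
            C[j + i*p] = dot;
          });
        });
      });

Whichever policy block is compiled in, only the LoopPolicy aliases change; the loop nest stays the same, which is what makes the DIRECT, LOOP, and GLOBAL mappings directly comparable under the Caliper and nsys runs listed at the top of the file.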