From 179288fea039f460c844fb1d57a0d0ddbfc45191 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 3 Feb 2025 13:48:11 -0800 Subject: [PATCH 1/6] test with direct policies --- src/apps/FEM_MACROS.hpp | 6 ++++++ src/apps/MASS3DPA-Hip.cpp | 16 ++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index d3c2c9e0d..04f11a164 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -24,6 +24,12 @@ for (int i = threadIdx.k; i < N; i += blockDim.k) #endif +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#define GPU_FOREACH_THREAD_DIRECT(i, k, N) \ + for (int i = threadIdx.k; i Date: Mon, 3 Feb 2025 14:02:51 -0800 Subject: [PATCH 2/6] clean up pass --- src/apps/FEM_MACROS.hpp | 3 ++- src/apps/MASS3DPA-Hip.cpp | 24 ++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 04f11a164..07772c2dd 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -26,7 +26,8 @@ #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define GPU_FOREACH_THREAD_DIRECT(i, k, N) \ - for (int i = threadIdx.k; i Date: Mon, 3 Feb 2025 14:23:45 -0800 Subject: [PATCH 3/6] add direct macros -- for testing --- src/apps/FEM_MACROS.hpp | 4 +--- src/apps/MASS3DPA-Hip.cpp | 24 ++++++++---------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 07772c2dd..a28ca445b 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -26,9 +26,7 @@ #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define GPU_FOREACH_THREAD_DIRECT(i, k, N) \ - for (int i = threadIdx.k; i < N; i += blockDim.k) -//if(int i = threadIdx.k < N) - // + if(int i = threadIdx.k; i < N) #endif #if defined(RAJA_ENABLE_SYCL) diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index 90ab5aec6..dc07d9bb3 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -35,46 +35,38 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, GPU_FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - //GPU_FOREACH_THREAD_DIRECT(dx, x, MPA_Q1D) { - {int dx = threadIdx.x; + GPU_FOREACH_THREAD_DIRECT(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - //GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) { - {int qx = threadIdx.x; + GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - //GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D) - {int qy = threadIdx.y; - //GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) - {int qx = threadIdx.x; + GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - //GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D) - {int qy = threadIdx.y; - //GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) - {int qx = threadIdx.x; + GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); GPU_FOREACH_THREAD(d, y, MPA_D1D) { - //GPU_FOREACH_THREAD_DIRECT(q, x, MPA_Q1D) - {int q = threadIdx.x; + GPU_FOREACH_THREAD_DIRECT(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - //GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D) - {int qy = threadIdx.y; + GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D){ GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } From 9d810b6ddb7b68f07220aaf74d41ce9db4fa795f Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 4 Feb 2025 10:45:34 -0800 Subject: [PATCH 4/6] use direct methods --- src/apps/DIFFUSION3DPA-Hip.cpp | 52 +++++++++++++++++----------------- src/apps/FEM_MACROS.hpp | 2 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index e43c5ff15..31005c178 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -23,7 +23,7 @@ namespace rajaperf { namespace apps { template < size_t block_size > - __launch_bounds__(block_size) +__launch_bounds__(block_size) __global__ void Diffusion3DPA(const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { @@ -32,73 +32,73 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } } } if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } } __syncthreads(); if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(d, y, DPA_D1D) { - GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(d, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index a28ca445b..a9c5c5e36 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -26,7 +26,7 @@ #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define GPU_FOREACH_THREAD_DIRECT(i, k, N) \ - if(int i = threadIdx.k; i < N) + if(int i = threadIdx.k; i < N) #endif #if defined(RAJA_ENABLE_SYCL) From 6bacf6bb89113e4cdf24da320c5f6bcff7fbd7b7 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 4 Feb 2025 10:47:29 -0800 Subject: [PATCH 5/6] revert mass --- src/apps/MASS3DPA-Hip.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/apps/MASS3DPA-Hip.cpp b/src/apps/MASS3DPA-Hip.cpp index dc07d9bb3..9d2068f2a 100644 --- a/src/apps/MASS3DPA-Hip.cpp +++ b/src/apps/MASS3DPA-Hip.cpp @@ -35,38 +35,38 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, GPU_FOREACH_THREAD(dx, x, MPA_D1D){ MASS3DPA_1 } - GPU_FOREACH_THREAD_DIRECT(dx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(dx, x, MPA_Q1D) { MASS3DPA_2 } } __syncthreads(); GPU_FOREACH_THREAD(dy, y, MPA_D1D) { - GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_3 } } __syncthreads(); - GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_4 } } __syncthreads(); - GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D) { - GPU_FOREACH_THREAD_DIRECT(qx, x, MPA_Q1D) { + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { + GPU_FOREACH_THREAD(qx, x, MPA_Q1D) { MASS3DPA_5 } } __syncthreads(); GPU_FOREACH_THREAD(d, y, MPA_D1D) { - GPU_FOREACH_THREAD_DIRECT(q, x, MPA_Q1D) { + GPU_FOREACH_THREAD(q, x, MPA_Q1D) { MASS3DPA_6 } } __syncthreads(); - GPU_FOREACH_THREAD_DIRECT(qy, y, MPA_Q1D){ + GPU_FOREACH_THREAD(qy, y, MPA_Q1D) { GPU_FOREACH_THREAD(dx, x, MPA_D1D) { MASS3DPA_7 } From c3100c4450271b64e343ef7d972585de7418b0d0 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 4 Feb 2025 10:55:00 -0800 Subject: [PATCH 6/6] use direct for raja too --- src/apps/DIFFUSION3DPA-Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 31005c178..e26ad28c1 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -144,13 +144,13 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { RAJA::LoopPolicy; using inner_x = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_y = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using inner_z = - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) {