diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 39b1a5d7f..7f9ff68d5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -76,6 +76,8 @@ blt_add_executable( apps/MASS3DEA-Seq.cpp apps/MASS3DPA.cpp apps/MASS3DPA-Seq.cpp + apps/MASS3DPA_ATOMIC.cpp + apps/MASS3DPA_ATOMIC-Seq.cpp apps/MASSVEC3DPA.cpp apps/MASSVEC3DPA-Seq.cpp apps/MATVEC_3D_STENCIL.cpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index d5e3814cd..29e54a32c 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -90,6 +90,12 @@ blt_add_library( MASS3DPA-Seq.cpp MASS3DPA-OMP.cpp MASS3DPA-Sycl.cpp + MASS3DPA_ATOMIC.cpp + MASS3DPA_ATOMIC-Cuda.cpp + MASS3DPA_ATOMIC-Hip.cpp + MASS3DPA_ATOMIC-Seq.cpp + MASS3DPA_ATOMIC-OMP.cpp + MASS3DPA_ATOMIC-Sycl.cpp MASSVEC3DPA.cpp MASSVEC3DPA-Cuda.cpp MASSVEC3DPA-Hip.cpp diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index f495f8b4f..4b51f81c3 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -10,8 +10,8 @@ /// Action of 3D diffusion matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. -/// Reference implementation -/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_convection_pa.cpp +/// Reference implementation - MFEM-v4.9 +/// https://github.com/mfem/mfem/blob/v4.9/fem/integ/bilininteg_convection_kernels.hpp /// /// /// for(int e = 0; e < NE; ++e) { diff --git a/src/apps/DIFFUSION3DPA-Cuda.cpp b/src/apps/DIFFUSION3DPA-Cuda.cpp index 94d52e7a2..c08df076d 100644 --- a/src/apps/DIFFUSION3DPA-Cuda.cpp +++ b/src/apps/DIFFUSION3DPA-Cuda.cpp @@ -32,73 +32,73 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } } } if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } } __syncthreads(); if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(d, y, DPA_D1D) { - GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(d, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(q, x, DPA_Q1D) { DIFFUSION3DPA_6; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } diff --git a/src/apps/DIFFUSION3DPA-Hip.cpp b/src/apps/DIFFUSION3DPA-Hip.cpp index 5c0e73eab..e59cdf358 100644 --- a/src/apps/DIFFUSION3DPA-Hip.cpp +++ b/src/apps/DIFFUSION3DPA-Hip.cpp @@ -28,81 +28,82 @@ __global__ void Diffusion3DPA(const Real_ptr Basis, const Real_ptr dBasis, const Real_ptr D, const Real_ptr X, Real_ptr Y, bool symmetric) { - const Index_type e = hipBlockIdx_x; + const Index_type e = blockIdx.x; DIFFUSION3DPA_0_GPU; - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_1; } } } if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_2; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_3; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_4; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(qx, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_5; } } } __syncthreads(); if (threadIdx.z == 0) { - GPU_FOREACH_THREAD(d, y, DPA_D1D) { - GPU_FOREACH_THREAD(q, x, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) { DIFFUSION3DPA_6; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(qy, y, DPA_Q1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_7; } } } __syncthreads(); - GPU_FOREACH_THREAD(qz, z, DPA_Q1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_8; } } } __syncthreads(); - GPU_FOREACH_THREAD(dz, z, DPA_D1D) { - GPU_FOREACH_THREAD(dy, y, DPA_D1D) { - GPU_FOREACH_THREAD(dx, x, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) { DIFFUSION3DPA_9; } } } + } template < size_t block_size > @@ -248,7 +249,7 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](Index_type qx) { - DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::loop @@ -262,9 +263,9 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { + [&](Index_type dy) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + [&](Index_type qx) { DIFFUSION3DPA_6; @@ -320,7 +321,7 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](Index_type dx) { - DIFFUSION3DPA_9; + DIFFUSION3DPA_9; } // lambda (dx) ); // RAJA::loop diff --git a/src/apps/DIFFUSION3DPA-OMP.cpp b/src/apps/DIFFUSION3DPA-OMP.cpp index ebbef55ef..73833938b 100644 --- a/src/apps/DIFFUSION3DPA-OMP.cpp +++ b/src/apps/DIFFUSION3DPA-OMP.cpp @@ -38,70 +38,71 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_1 } } } - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_2 } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + + DIFFUSION3DPA_3 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(qy,y,DPA_Q1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_4 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; + CPU_FOREACH(qz,z,DPA_Q1D) { + CPU_FOREACH(qy,y,DPA_Q1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_5 } } } - CPU_FOREACH(d, y, DPA_D1D) { - CPU_FOREACH(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_6 } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; + CPU_FOREACH(qz,z,DPA_Q1D) { + CPU_FOREACH(qy,y,DPA_Q1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_7 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; + CPU_FOREACH(qz,z,DPA_Q1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_8 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_9 } } } @@ -221,7 +222,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), [&](Index_type qx) { - DIFFUSION3DPA_5; + DIFFUSION3DPA_5; } // lambda (qx) ); // RAJA::loop @@ -235,11 +236,11 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { + [&](Index_type dy) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + [&](Index_type qx) { - DIFFUSION3DPA_6; + DIFFUSION3DPA_6; } // lambda (q) ); // RAJA::loop @@ -257,7 +258,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), [&](Index_type dx) { - DIFFUSION3DPA_7; + DIFFUSION3DPA_7; } // lambda (dx) ); // RAJA::loop diff --git a/src/apps/DIFFUSION3DPA-Seq.cpp b/src/apps/DIFFUSION3DPA-Seq.cpp index 8ebf3e0b0..b81badf1d 100644 --- a/src/apps/DIFFUSION3DPA-Seq.cpp +++ b/src/apps/DIFFUSION3DPA-Seq.cpp @@ -35,70 +35,71 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { DIFFUSION3DPA_0_CPU; - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_1; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_1 } } } - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_2; + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_2 } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_3; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + + DIFFUSION3DPA_3 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_4; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(qy,y,DPA_Q1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_4 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(qx, x, DPA_Q1D) { - DIFFUSION3DPA_5; + CPU_FOREACH(qz,z,DPA_Q1D) { + CPU_FOREACH(qy,y,DPA_Q1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_5 } } } - CPU_FOREACH(d, y, DPA_D1D) { - CPU_FOREACH(q, x, DPA_Q1D) { - DIFFUSION3DPA_6; + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(qx,x,DPA_Q1D) { + DIFFUSION3DPA_6 } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(qy, y, DPA_Q1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_7; + CPU_FOREACH(qz,z,DPA_Q1D) { + CPU_FOREACH(qy,y,DPA_Q1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_7 } } } - CPU_FOREACH(qz, z, DPA_Q1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_8; + CPU_FOREACH(qz,z,DPA_Q1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_8 } } } - CPU_FOREACH(dz, z, DPA_D1D) { - CPU_FOREACH(dy, y, DPA_D1D) { - CPU_FOREACH(dx, x, DPA_D1D) { - DIFFUSION3DPA_9; + CPU_FOREACH(dz,z,DPA_D1D) { + CPU_FOREACH(dy,y,DPA_D1D) { + CPU_FOREACH(dx,x,DPA_D1D) { + DIFFUSION3DPA_9 } } } @@ -233,9 +234,9 @@ void DIFFUSION3DPA::runSeqVariant(VariantID vid) { RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](Index_type RAJA_UNUSED_ARG(dz)) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_D1D), - [&](Index_type d) { + [&](Index_type dy) { RAJA::loop(ctx, RAJA::RangeSegment(0, DPA_Q1D), - [&](Index_type q) { + [&](Index_type qx) { DIFFUSION3DPA_6; diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index ecb96696a..83e4ed16a 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -6,12 +6,13 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// clang-format off /// /// Action of 3D diffusion matrix via partial assembly /// /// Based on MFEM's/CEED algorithms. -/// Reference implementation -/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_diffusion_pa.cpp +/// Reference implementation - MFEM-v4.9 +/// https://github.com/mfem/mfem/blob/v4.9/fem/integ/bilininteg_diffusion_kernels.hpp /// /// for (int e = 0; e < NE; ++e) { /// @@ -41,420 +42,372 @@ /// double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); /// double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); /// -/// for(int dz=0;dz MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - RAJA_TEAM_SHARED double sm0[3][MDQ*MDQ*MDQ]; \ - RAJA_TEAM_SHARED double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); - -#define DIFFUSION3DPA_0_CPU \ - constexpr int MQ1 = DPA_Q1D; \ - constexpr int MD1 = DPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - double sm0[3][MDQ*MDQ*MDQ]; \ - double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); - -#define DIFFUSION3DPA_1 \ - s_X[dz][dy][dx] = DPA_X(dx,dy,dz,e); - -#define DIFFUSION3DPA_2 \ - const Index_type i = DPA_qi(qx,dy,DPA_Q1D); \ - const Index_type j = DPA_dj(qx,dy,DPA_D1D); \ - const Index_type k = DPA_qk(qx,dy,DPA_Q1D); \ - const Index_type l = DPA_dl(qx,dy,DPA_D1D); \ - B[i][j] = DPA_b(qx,dy); \ - G[k][l] = DPA_g(qx,dy) * DPA_sign(qx,dy); \ - -#define DIFFUSION3DPA_3 \ - Real_type u = 0.0, v = 0.0; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dx = 0; dx < DPA_D1D; ++dx) \ - { \ - const Index_type i = DPA_qi(qx,dx,DPA_Q1D); \ - const Index_type j = DPA_dj(qx,dx,DPA_D1D); \ - const Index_type k = DPA_qk(qx,dx,DPA_Q1D); \ - const Index_type l = DPA_dl(qx,dx,DPA_D1D); \ - const Real_type s = DPA_sign(qx,dx); \ - const Real_type coords = s_X[dz][dy][dx]; \ - u += coords * B[i][j]; \ - v += coords * G[k][l] * s; \ - } \ - DDQ0[dz][dy][qx] = u; \ - DDQ1[dz][dy][qx] = v; - -#define DIFFUSION3DPA_4 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dy = 0; dy < DPA_D1D; ++dy) \ - { \ - const Index_type i = DPA_qi(qy,dy,DPA_Q1D); \ - const Index_type j = DPA_dj(qy,dy,DPA_D1D); \ - const Index_type k = DPA_qk(qy,dy,DPA_Q1D); \ - const Index_type l = DPA_dl(qy,dy,DPA_D1D); \ - const Real_type s = DPA_sign(qy,dy); \ - u += DDQ1[dz][dy][qx] * B[i][j]; \ - v += DDQ0[dz][dy][qx] * G[k][l] * s; \ - w += DDQ0[dz][dy][qx] * B[i][j]; \ - } \ - DQQ0[dz][qy][qx] = u; \ - DQQ1[dz][qy][qx] = v; \ - DQQ2[dz][qy][qx] = w; - -#define DIFFUSION3DPA_5 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MD1) \ - for (Index_type dz = 0; dz < DPA_D1D; ++dz) \ - { \ - const Index_type i = DPA_qi(qz,dz,DPA_Q1D); \ - const Index_type j = DPA_dj(qz,dz,DPA_D1D); \ - const Index_type k = DPA_qk(qz,dz,DPA_Q1D); \ - const Index_type l = DPA_dl(qz,dz,DPA_D1D); \ - const Real_type s = DPA_sign(qz,dz); \ - u += DQQ0[dz][qy][qx] * B[i][j]; \ - v += DQQ1[dz][qy][qx] * B[i][j]; \ - w += DQQ2[dz][qy][qx] * G[k][l] * s; \ - } \ - const Real_type O11 = DPA_d(qx,qy,qz,0,e); \ - const Real_type O12 = DPA_d(qx,qy,qz,1,e); \ - const Real_type O13 = DPA_d(qx,qy,qz,2,e); \ - const Real_type O21 = symmetric ? O12 : DPA_d(qx,qy,qz,3,e); \ - const Real_type O22 = symmetric ? DPA_d(qx,qy,qz,3,e) : DPA_d(qx,qy,qz,4,e); \ - const Real_type O23 = symmetric ? DPA_d(qx,qy,qz,4,e) : DPA_d(qx,qy,qz,5,e); \ - const Real_type O31 = symmetric ? O13 : DPA_d(qx,qy,qz,6,e); \ - const Real_type O32 = symmetric ? O23 : DPA_d(qx,qy,qz,7,e); \ - const Real_type O33 = symmetric ? DPA_d(qx,qy,qz,5,e) : DPA_d(qx,qy,qz,8,e); \ - const Real_type gX = u; \ - const Real_type gY = v; \ - const Real_type gZ = w; \ - QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ); \ - QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ); \ - QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ); - -#define DIFFUSION3DPA_6 \ - const Index_type i = DPA_qi(q,d,DPA_Q1D); \ - const Index_type j = DPA_dj(q,d,DPA_D1D); \ - const Index_type k = DPA_qk(q,d,DPA_Q1D); \ - const Index_type l = DPA_dl(q,d,DPA_D1D); \ - Bt[j][i] = DPA_b(q,d); \ - Gt[l][k] = DPA_g(q,d) * DPA_sign(q,d); - -#define DIFFUSION3DPA_7 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qx = 0; qx < DPA_Q1D; ++qx) \ - { \ - const Index_type i = DPA_qi(qx,dx,DPA_Q1D); \ - const Index_type j = DPA_dj(qx,dx,DPA_D1D); \ - const Index_type k = DPA_qk(qx,dx,DPA_Q1D); \ - const Index_type l = DPA_dl(qx,dx,DPA_D1D); \ - const Real_type s = DPA_sign(qx,dx); \ - u += QQQ0[qz][qy][qx] * Gt[l][k] * s; \ - v += QQQ1[qz][qy][qx] * Bt[j][i]; \ - w += QQQ2[qz][qy][qx] * Bt[j][i]; \ - } \ - QQD0[qz][qy][dx] = u; \ - QQD1[qz][qy][dx] = v; \ - QQD2[qz][qy][dx] = w; - -#define DIFFUSION3DPA_8 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(DPA_Q1D) \ - for (Index_type qy = 0; qy < DPA_Q1D; ++qy) \ - { \ - const Index_type i = DPA_qi(qy,dy,DPA_Q1D); \ - const Index_type j = DPA_dj(qy,dy,DPA_D1D); \ - const Index_type k = DPA_qk(qy,dy,DPA_Q1D); \ - const Index_type l = DPA_dl(qy,dy,DPA_D1D); \ - const Real_type s = DPA_sign(qy,dy); \ - u += QQD0[qz][qy][dx] * Bt[j][i]; \ - v += QQD1[qz][qy][dx] * Gt[l][k] * s; \ - w += QQD2[qz][qy][dx] * Bt[j][i]; \ - } \ - QDD0[qz][dy][dx] = u; \ - QDD1[qz][dy][dx] = v; \ - QDD2[qz][dy][dx] = w; - -#define DIFFUSION3DPA_9 \ - Real_type u = 0.0, v = 0.0, w = 0.0; \ - RAJAPERF_UNROLL(MQ1) \ - for (Index_type qz = 0; qz < DPA_Q1D; ++qz) \ - { \ - const Index_type i = DPA_qi(qz,dz,DPA_Q1D); \ - const Index_type j = DPA_dj(qz,dz,DPA_D1D); \ - const Index_type k = DPA_qk(qz,dz,DPA_Q1D); \ - const Index_type l = DPA_dl(qz,dz,DPA_D1D); \ - const Real_type s = DPA_sign(qz,dz); \ - u += QDD0[qz][dy][dx] * Bt[j][i]; \ - v += QDD1[qz][dy][dx] * Bt[j][i]; \ - w += QDD2[qz][dy][dx] * Gt[l][k] * s; \ - } \ - DPA_Y(dx,dy,dz,e) += (u + v + w); - -namespace rajaperf -{ +#define DPA_qi(q, d, Q) (((q) <= (d)) ? (q) : (Q) - 1 - (q)) +#define DPA_dj(q, d, D) (((q) <= (d)) ? (d) : (D) - 1 - (d)) +#define DPA_qk(q, d, Q) (((q) <= (d)) ? (Q) - 1 - (q) : (q)) +#define DPA_dl(q, d, D) (((q) <= (d)) ? (D) - 1 - (d) : (d)) +#define DPA_sign(q, d) (((q) <= (d)) ? -1.0 : 1.0) + +#define DIFFUSION3DPA_0_GPU \ + constexpr Index_type MQ1 = DPA_Q1D; \ + constexpr Index_type MD1 = DPA_D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_type sBG[MQ1 * MD1]; \ + Real_type(*B)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*G)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*Bt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + Real_type(*Gt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + RAJA_TEAM_SHARED Real_type sm0[3][MDQ * MDQ * MDQ]; \ + RAJA_TEAM_SHARED Real_type sm1[3][MDQ * MDQ * MDQ]; \ + Real_type(*s_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); \ + Real_type(*DDQ0)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 0); \ + Real_type(*DDQ1)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 1); \ + Real_type(*DQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 0); \ + Real_type(*DQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 1); \ + Real_type(*DQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 2); \ + Real_type(*QQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 0); \ + Real_type(*QQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 1); \ + Real_type(*QQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 2); \ + Real_type(*QQD0)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 0); \ + Real_type(*QQD1)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 1); \ + Real_type(*QQD2)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 2); \ + Real_type(*QDD0)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 0); \ + Real_type(*QDD1)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 1); \ + Real_type(*QDD2)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); + +#define DIFFUSION3DPA_0_CPU \ + constexpr Index_type MQ1 = DPA_Q1D; \ + constexpr Index_type MD1 = DPA_D1D; \ + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_type sBG[MQ1 * MD1]; \ + Real_type(*B)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*G)[MD1] = (Real_type(*)[MD1])sBG; \ + Real_type(*Bt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + Real_type(*Gt)[MQ1] = (Real_type(*)[MQ1])sBG; \ + Real_type sm0[3][MDQ * MDQ * MDQ]; \ + Real_type sm1[3][MDQ * MDQ * MDQ]; \ + Real_type(*s_X)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); \ + Real_type(*DDQ0)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 0); \ + Real_type(*DDQ1)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])(sm0 + 1); \ + Real_type(*DQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 0); \ + Real_type(*DQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 1); \ + Real_type(*DQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm1 + 2); \ + Real_type(*QQQ0)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 0); \ + Real_type(*QQQ1)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 1); \ + Real_type(*QQQ2)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])(sm0 + 2); \ + Real_type(*QQD0)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 0); \ + Real_type(*QQD1)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 1); \ + Real_type(*QQD2)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])(sm1 + 2); \ + Real_type(*QDD0)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 0); \ + Real_type(*QDD1)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 1); \ + Real_type(*QDD2)[MD1][MD1] = (Real_type(*)[MD1][MD1])(sm0 + 2); + +#define DIFFUSION3DPA_1 s_X[dz][dy][dx] = DPA_X(dx, dy, dz, e); + +#define DIFFUSION3DPA_2 \ + B[qx][dy] = DPA_b(qx, dy); \ + G[qx][dy] = DPA_g(qx, dy); + +#define DIFFUSION3DPA_3 \ + Real_type u = 0.0, v = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dx = 0; dx < DPA_D1D; ++dx) { \ + const Real_type coords = s_X[dz][dy][dx]; \ + u += coords * B[qx][dx]; \ + v += coords * G[qx][dx]; \ + } \ + DDQ0[dz][dy][qx] = u; \ + DDQ1[dz][dy][qx] = v; + +#define DIFFUSION3DPA_4 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dy = 0; dy < DPA_D1D; ++dy) { \ + u += DDQ1[dz][dy][qx] * B[qy][dy]; \ + v += DDQ0[dz][dy][qx] * G[qy][dy]; \ + w += DDQ0[dz][dy][qx] * B[qy][dy]; \ + } \ + DQQ0[dz][qy][qx] = u; \ + DQQ1[dz][qy][qx] = v; \ + DQQ2[dz][qy][qx] = w; + +#define DIFFUSION3DPA_5 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MD1) \ + for (Index_type dz = 0; dz < DPA_D1D; ++dz) { \ + u += DQQ0[dz][qy][qx] * B[qz][dz]; \ + v += DQQ1[dz][qy][qx] * B[qz][dz]; \ + w += DQQ2[dz][qy][qx] * G[qz][dz]; \ + } \ + const Real_type O11 = DPA_d(qx, qy, qz, 0, e); \ + const Real_type O12 = DPA_d(qx, qy, qz, 1, e); \ + const Real_type O13 = DPA_d(qx, qy, qz, 2, e); \ + const Real_type O21 = symmetric ? O12 : DPA_d(qx, qy, qz, 3, e); \ + const Real_type O22 = \ + symmetric ? DPA_d(qx, qy, qz, 3, e) : DPA_d(qx, qy, qz, 4, e); \ + const Real_type O23 = \ + symmetric ? DPA_d(qx, qy, qz, 4, e) : DPA_d(qx, qy, qz, 5, e); \ + const Real_type O31 = symmetric ? O13 : DPA_d(qx, qy, qz, 6, e); \ + const Real_type O32 = symmetric ? O23 : DPA_d(qx, qy, qz, 7, e); \ + const Real_type O33 = \ + symmetric ? DPA_d(qx, qy, qz, 5, e) : DPA_d(qx, qy, qz, 8, e); \ + const Real_type gX = u; \ + const Real_type gY = v; \ + const Real_type gZ = w; \ + QQQ0[qz][qy][qx] = (O11 * gX) + (O12 * gY) + (O13 * gZ); \ + QQQ1[qz][qy][qx] = (O21 * gX) + (O22 * gY) + (O23 * gZ); \ + QQQ2[qz][qy][qx] = (O31 * gX) + (O32 * gY) + (O33 * gZ); + +#define DIFFUSION3DPA_6 \ + Bt[dy][qx] = DPA_b(qx, dy); \ + Gt[dy][qx] = DPA_g(qx, dy); + +#define DIFFUSION3DPA_7 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qx = 0; qx < DPA_Q1D; ++qx) { \ + u += QQQ0[qz][qy][qx] * Gt[dx][qx]; \ + v += QQQ1[qz][qy][qx] * Bt[dx][qx]; \ + w += QQQ2[qz][qy][qx] * Bt[dx][qx]; \ + } \ + QQD0[qz][qy][dx] = u; \ + QQD1[qz][qy][dx] = v; \ + QQD2[qz][qy][dx] = w; + +#define DIFFUSION3DPA_8 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(DPA_Q1D) \ + for (Index_type qy = 0; qy < DPA_Q1D; ++qy) { \ + u += QQD0[qz][qy][dx] * Bt[dy][qy]; \ + v += QQD1[qz][qy][dx] * Gt[dy][qy]; \ + w += QQD2[qz][qy][dx] * Bt[dy][qy]; \ + } \ + QDD0[qz][dy][dx] = u; \ + QDD1[qz][dy][dx] = v; \ + QDD2[qz][dy][dx] = w; + +#define DIFFUSION3DPA_9 \ + Real_type u = 0.0, v = 0.0, w = 0.0; \ + RAJAPERF_UNROLL(MQ1) \ + for (Index_type qz = 0; qz < DPA_Q1D; ++qz) { \ + u += QDD0[qz][dy][dx] * Bt[dz][qz]; \ + v += QDD1[qz][dy][dx] * Bt[dz][qz]; \ + w += QDD2[qz][dy][dx] * Gt[dz][qz]; \ + } \ + DPA_Y(dx, dy, dz, e) += (u + v + w); + +namespace rajaperf { class RunParams; -namespace apps -{ +namespace apps { -class DIFFUSION3DPA : public KernelBase -{ +class DIFFUSION3DPA : public KernelBase { public: - - DIFFUSION3DPA(const RunParams& params); + DIFFUSION3DPA(const RunParams ¶ms); ~DIFFUSION3DPA(); @@ -471,12 +424,9 @@ class DIFFUSION3DPA : public KernelBase void runSeqVariant(VariantID vid); void runOpenMPVariant(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); - template < size_t work_group_size > - void runSyclVariantImpl(VariantID vid); + template void runCudaVariantImpl(VariantID vid); + template void runHipVariantImpl(VariantID vid); + template void runSyclVariantImpl(VariantID vid); private: static const size_t default_gpu_block_size = DPA_Q1D * DPA_Q1D * DPA_Q1D; diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 03e98d84d..1b050ba60 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -24,6 +24,9 @@ #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define GPU_FOREACH_THREAD(i, k, N) \ for (Index_type i = threadIdx.k; i < N; i += blockDim.k) + +#define GPU_FOREACH_THREAD_DIRECT(i, k, N) \ + if(const int i=hipThreadIdx_ ##k; i + +namespace rajaperf { +namespace apps { + +template < size_t block_size > + __launch_bounds__(block_size) +__global__ void Mass3DPA_Atomic(const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, const Index_ptr ElemToDoF, Real_ptr Y) { + + const Index_type e = blockIdx.x; + + MASS3DPA_ATOMIC_0_GPU; + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_1; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, 1) { + GPU_FOREACH_THREAD_DIRECT(d, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(q, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_2; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_3; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_4; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_6; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_7; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_8; + } + } + } + +} + +template < size_t block_size > +void MASS3DPA_ATOMIC::runCudaVariantImpl(VariantID vid) { + setBlockSize(block_size); + + const Index_type run_reps = getRunReps(); + + auto res{getCudaResource()}; + + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_CUDA: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + dim3 nthreads_per_block(mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::Q1D); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel( (Mass3DPA_Atomic), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, ElemToDoF, Y ); + + } + stopTimer(); + + break; + } + + case RAJA_CUDA: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy>; + + using inner_y = RAJA::LoopPolicy>; + + using inner_z = RAJA::LoopPolicy>; + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + + MASS3DPA_ATOMIC_0_GPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA_ATOMIC : Unknown Cuda variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Cuda, Base_CUDA, RAJA_CUDA) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/MASS3DPA_ATOMIC-Hip.cpp b/src/apps/MASS3DPA_ATOMIC-Hip.cpp new file mode 100644 index 000000000..fc3e1d343 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-Hip.cpp @@ -0,0 +1,303 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t block_size > + __launch_bounds__(block_size) +__global__ void Mass3DPA_Atomic(const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, const Index_ptr ElemToDoF, Real_ptr Y) { + + const Index_type e = blockIdx.x; + + MASS3DPA_ATOMIC_0_GPU; + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_1; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, 1) { + GPU_FOREACH_THREAD_DIRECT(d, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(q, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_2; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_3; + } + } + } + + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_4; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qx, x, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(qy, y, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_6; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(qz, z, mpa3d_at::Q1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_7; + } + } + } + + GPU_FOREACH_THREAD_DIRECT(dz, z, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dy, y, mpa3d_at::D1D) { + GPU_FOREACH_THREAD_DIRECT(dx, x, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_8; + } + } + } + +} + +template < size_t block_size > +void MASS3DPA_ATOMIC::runHipVariantImpl(VariantID vid) { + setBlockSize(block_size); + + const Index_type run_reps = getRunReps(); + + auto res{getHipResource()}; + + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_HIP: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + dim3 nthreads_per_block(mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::Q1D); + constexpr size_t shmem = 0; + + RPlaunchHipKernel( (Mass3DPA_Atomic), + NE, nthreads_per_block, + shmem, res.get_stream(), + B, Bt, D, X, ElemToDoF, Y ); + + } + stopTimer(); + + break; + } + + case RAJA_HIP: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy>; + + using inner_y = RAJA::LoopPolicy>; + + using inner_z = RAJA::LoopPolicy>; + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + + MASS3DPA_ATOMIC_0_GPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA_ATOMIC : Unknown Hip variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Hip, Base_HIP, RAJA_HIP) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/apps/MASS3DPA_ATOMIC-OMP.cpp b/src/apps/MASS3DPA_ATOMIC-OMP.cpp new file mode 100644 index 000000000..c3098f68c --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-OMP.cpp @@ -0,0 +1,247 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + + +void MASS3DPA_ATOMIC::runOpenMPVariant(VariantID vid) { + +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + +#pragma omp parallel for + for (Index_type e = 0; e < NE; ++e) { + + MASS3DPA_ATOMIC_0_CPU; + + SHARED_LOOP_3D(dx, dy, dz, mpa3d_at::D1D, mpa3d_at::D1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_1; + } + + SHARED_LOOP_2D(q, d, mpa3d_at::Q1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_2; + } + + SHARED_LOOP_3D(qx, dy, dz, mpa3d_at::Q1D, mpa3d_at::D1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_3; + } + + SHARED_LOOP_3D(qx, qy, dz, mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_4; + } + + SHARED_LOOP_3D(qx, qy, qz, mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + + SHARED_LOOP_3D(dx, qy, qz, mpa3d_at::D1D, mpa3d_at::Q1D, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_6; + } + + SHARED_LOOP_3D(dx, dy, qz, mpa3d_at::D1D, mpa3d_at::D1D, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_7; + } + + SHARED_LOOP_3D(dx, dy, dz, mpa3d_at::D1D, mpa3d_at::D1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_8; + } + + } // element loop + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + + auto res{getHostResource()}; + + using launch_policy = RAJA::LaunchPolicy; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + //Grid is empty as the host does not need a compute grid to be specified + RAJA::launch( res, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASS3DPA_ATOMIC_0_CPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + return; + } + + default: + getCout() << "\n MASS3DPA_ATOMIC : Unknown OpenMP variant id = " << vid + << std::endl; + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +RAJAPERF_DEFAULT_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, OpenMP, Base_OpenMP, RAJA_OpenMP) + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MASS3DPA_ATOMIC-Seq.cpp b/src/apps/MASS3DPA_ATOMIC-Seq.cpp new file mode 100644 index 000000000..eaf712a75 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-Seq.cpp @@ -0,0 +1,247 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives for loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace apps { + + +void MASS3DPA_ATOMIC::runSeqVariant(VariantID vid) { + const Index_type run_reps = getRunReps(); + + MASS3DPA_ATOMIC_DATA_SETUP; + + switch (vid) { + + case Base_Seq: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) + { + + for (Index_type e = 0; e < NE; ++e) { + + MASS3DPA_ATOMIC_0_CPU; + + SHARED_LOOP_3D(dx, dy, dz, mpa3d_at::D1D, mpa3d_at::D1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_1; + } + + SHARED_LOOP_2D(q, d, mpa3d_at::Q1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_2; + } + + SHARED_LOOP_3D(qx, dy, dz, mpa3d_at::Q1D, mpa3d_at::D1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_3; + } + + SHARED_LOOP_3D(qx, qy, dz, mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_4; + } + + SHARED_LOOP_3D(qx, qy, qz, mpa3d_at::Q1D, mpa3d_at::Q1D, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_5; + } + + SHARED_LOOP_3D(dx, qy, qz, mpa3d_at::D1D, mpa3d_at::Q1D, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_6; + } + + SHARED_LOOP_3D(dx, dy, qz, mpa3d_at::D1D, mpa3d_at::D1D, mpa3d_at::Q1D) { + MASS3DPA_ATOMIC_7; + } + + SHARED_LOOP_3D(dx, dy, dz, mpa3d_at::D1D, mpa3d_at::D1D, mpa3d_at::D1D) { + MASS3DPA_ATOMIC_8; + } + + } // element loop + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case RAJA_Seq: { + + auto res{getHostResource()}; + + //Currently Teams requires two policies if compiled with a device + using launch_policy = RAJA::LaunchPolicy; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + RAJA::launch( res, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + + MASS3DPA_ATOMIC_0_CPU; + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_1; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type ) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type q) { + MASS3DPA_ATOMIC_2; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qx) { + MASS3DPA_ATOMIC_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_6; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_7; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa3d_at::D1D), + [&](Index_type dx) { + MASS3DPA_ATOMIC_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + return; + } +#endif // RUN_RAJA_SEQ + + default: + getCout() << "\n MASS3DPA_ATOMIC : Unknown Seq variant id = " << vid << std::endl; + } +} + +RAJAPERF_DEFAULT_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Seq, Base_Seq, RAJA_Seq) + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MASS3DPA_ATOMIC-Sycl.cpp b/src/apps/MASS3DPA_ATOMIC-Sycl.cpp new file mode 100644 index 000000000..c53959900 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC-Sycl.cpp @@ -0,0 +1,132 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// Uncomment to add compiler directives loop unrolling +//#define USE_RAJAPERF_UNROLL + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_SYCL) + +#include "common/SyclDataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +template < size_t work_group_size > +void MASS3DPA_ATOMIC::runSyclVariantImpl(VariantID vid) { + setBlockSize(work_group_size); + + const Index_type run_reps = getRunReps(); + + auto res{getSyclResource()}; + auto qu = res.get_queue(); + + MASS3DPA_ATOMIC_DATA_SETUP; + + const ::sycl::range<3> workGroupSize(1, MPA_Q1D, MPA_Q1D); + const ::sycl::range<3> gridSize(1, MPA_Q1D, MPA_Q1D*NE); + + switch (vid) { + + case Base_SYCL: { + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + qu->submit([&](::sycl::handler& h) { + + constexpr Index_type MQ1 = MPA_Q1D; + constexpr Index_type MD1 = MPA_D1D; + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + auto sDQ_vec = ::sycl::local_accessor(::sycl::range<1>(MQ1 * MD1), h); + auto sm0_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + auto sm1_vec = ::sycl::local_accessor(::sycl::range<1>(MDQ * MDQ * MDQ), h); + + h.parallel_for + (::sycl::nd_range<3>(gridSize, workGroupSize), + [=] (::sycl::nd_item<3> itm) { + + const Index_type e = itm.get_group(2); + + }); + }); + + } + stopTimer(); + + break; + } + + case RAJA_SYCL: { + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy>; + + using outer_x = RAJA::LoopPolicy; + + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + //Caclulate amount of shared memory needed + size_t shmem = 0; + { + constexpr Index_type MQ1 = MPA_Q1D; + constexpr Index_type MD1 = MPA_D1D; + constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; + + constexpr Index_type no_mats = 2; + shmem += MQ1 * MD1 * no_mats * MDQ * MDQ * MDQ * sizeof(Real_type); + } + + startTimer(); + // Loop counter increment uses macro to quiet C++20 compiler warning + for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { + + RAJA::launch( res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MPA_Q1D, MPA_Q1D), shmem), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + } // lambda (e) + ); // RAJA::loop + + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + + getCout() << "\n MASS3DPA_ATOMIC : Unknown Sycl variant id = " << vid << std::endl; + break; + } + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BOILERPLATE(MASS3DPA_ATOMIC, Sycl, Base_SYCL, RAJA_SYCL) + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_SYCL diff --git a/src/apps/MASS3DPA_ATOMIC.cpp b/src/apps/MASS3DPA_ATOMIC.cpp new file mode 100644 index 000000000..c0526eab2 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MASS3DPA_ATOMIC.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf { +namespace apps { + +MASS3DPA_ATOMIC::MASS3DPA_ATOMIC(const RunParams ¶ms) + : KernelBase(rajaperf::Apps_MASS3DPA_ATOMIC, params) { + + m_DOF_default = 1000000; + + m_P = mpa3d_at::D1D - 1; // polynomial order + + m_NE = m_DOF_default/pow(m_P,3); //approximate how many elements we need + + //How does this work?? + //getTargetProblemSize(); + //m_NE = std::max(static_cast( (getTargetProblemSize() + , Index_type(1)); + + //Construct the mesh + m_Nx = static_cast(std::cbrt(m_NE)); + m_Ny = m_Nx; + m_Nz = m_Ny; + m_NE = m_Nx * m_Ny * m_Nz; + + //compute true number of dofs + m_Tot_Dofs = (m_Nx * m_P + 1) * (m_Ny * m_P + 1) * (m_Nz * m_P + 1); + + setDefaultProblemSize(m_Tot_Dofs); + setDefaultReps(50); + + setActualProblemSize(m_Tot_Dofs); + + setItsPerRep(m_NE * mpa3d_at::D1D * mpa3d_at::D1D); + setKernelsPerRep(1); + + setBytesReadPerRep(2 * sizeof(Real_type) * mpa3d_at::Q1D * mpa3d_at::D1D + // B, Bt + 1 * sizeof(Index_type) * mpa3d_at::D1D * mpa3d_at::D1D * mpa3d_at::D1D * m_NE + //ElemToDoF + 1 * sizeof(Real_type) * m_Tot_Dofs + // X + 1 * sizeof(Real_type) * mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::Q1D * m_NE); // D + + setBytesWrittenPerRep(1 * sizeof(Real_type) * mpa3d_at::D1D * mpa3d_at::D1D * + mpa3d_at::D1D * m_NE); // Y + + setBytesAtomicModifyWrittenPerRep(m_Tot_Dofs); + + setFLOPsPerRep( + m_NE * + (2 * mpa3d_at::D1D * mpa3d_at::D1D * mpa3d_at::D1D * mpa3d_at::Q1D + + 2 * mpa3d_at::D1D * mpa3d_at::D1D * mpa3d_at::Q1D * mpa3d_at::Q1D + + 2 * mpa3d_at::D1D * mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::Q1D + + mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::Q1D + + 2 * mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::D1D + + 2 * mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::D1D * mpa3d_at::D1D + + 2 * mpa3d_at::Q1D * mpa3d_at::D1D * mpa3d_at::D1D * mpa3d_at::D1D + + mpa3d_at::D1D * mpa3d_at::D1D * mpa3d_at::D1D)); + + setChecksumConsistency(ChecksumConsistency::ConsistentPerVariantTuning); + + setComplexity(Complexity::N); + + setUsesFeature(Launch); + + addVariantTunings(); +} + +MASS3DPA_ATOMIC::~MASS3DPA_ATOMIC() { } + +void MASS3DPA_ATOMIC::setUp(VariantID vid, + size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + allocAndInitDataConst(m_B, Index_type(mpa3d_at::Q1D * mpa3d_at::D1D), + Real_type(1.0), vid); + allocAndInitDataConst(m_Bt, Index_type(mpa3d_at::Q1D * mpa3d_at::D1D), + Real_type(1.0), vid); + allocAndInitDataConst( + m_D, Index_type(mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::Q1D * m_NE), + Real_type(1.0), vid); + allocAndInitDataConst(m_X, Index_type(m_Tot_Dofs), Real_type(1.0), vid); + allocAndInitDataConst(m_Y, Index_type(m_Tot_Dofs), Real_type(0.0), vid); + + // Compute table elem to dof table size + const int ndof_per_elem = (m_P + 1) * (m_P + 1) * (m_P + 1); + const int total_size = ndof_per_elem * m_NE; + + auto a_elemToDoF = allocDataForInit(m_ElemToDoF, total_size, vid); + buildElemToDofTable(m_Nx, m_Ny, m_Nz, m_P, m_ElemToDoF); + +} + +void MASS3DPA_ATOMIC::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_Y, m_Tot_Dofs, vid); +} + +void MASS3DPA_ATOMIC::tearDown(VariantID vid, + size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + (void)vid; + + deallocData(m_B, vid); + deallocData(m_Bt, vid); + deallocData(m_D, vid); + deallocData(m_X, vid); + deallocData(m_Y, vid); + deallocData(m_ElemToDoF, vid); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/MASS3DPA_ATOMIC.hpp b/src/apps/MASS3DPA_ATOMIC.hpp new file mode 100644 index 000000000..fc52ec812 --- /dev/null +++ b/src/apps/MASS3DPA_ATOMIC.hpp @@ -0,0 +1,377 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Action of 3D mass matrix via partial assembly +/// +/// +/// for (int e = 0; e < NE; ++e) { +/// +/// constexpr int MQ1 = mpa3d_at::Q1D; +/// constexpr int MD1 = mpa3d_at::D1D; +/// +/// constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; +/// +/// double sm_B[MQ1][MD1]; +/// double sm_Bt[MD1][MQ1]; +/// +/// double sm0[MDQ * MDQ * MDQ]; +/// double sm1[MDQ * MDQ * MDQ]; +/// double(*sm_X)[MD1][MD1] = (double(*)[MD1][MD1])sm0; +/// double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; +/// double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; +/// double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; +/// double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; +/// double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; +/// +/// int thread_dofs[MD1 * MD1 * MD1]; +/// +/// for(int dz=0; dz MD1) ? MQ1 : MD1; \ + double sm_B[MQ1][MD1]; \ + double sm_Bt[MD1][MQ1]; \ + double sm0[MDQ * MDQ * MDQ]; \ + double sm1[MDQ * MDQ * MDQ]; \ + double (*sm_X)[MD1][MD1] = (double (*)[MD1][MD1])sm0; \ + double (*DDQ)[MD1][MQ1] = (double (*)[MD1][MQ1])sm1; \ + double (*DQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1])sm0; \ + double (*QQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1])sm1; \ + double (*QQD)[MQ1][MD1] = (double (*)[MQ1][MD1])sm0; \ + double (*QDD)[MD1][MD1] = (double (*)[MD1][MD1])sm1; \ + int thread_dofs[MD1 * MD1 * MD1]; + +#define MASS3DPA_ATOMIC_0_GPU \ + constexpr int MQ1 = mpa3d_at::Q1D; \ + constexpr int MD1 = mpa3d_at::D1D; \ + constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + RAJA_TEAM_SHARED double sm_B[MQ1][MD1]; \ + RAJA_TEAM_SHARED double sm_Bt[MD1][MQ1]; \ + RAJA_TEAM_SHARED double sm0[MDQ * MDQ * MDQ]; \ + RAJA_TEAM_SHARED double sm1[MDQ * MDQ * MDQ]; \ + double (*sm_X)[MD1][MD1] = (double (*)[MD1][MD1])sm0; \ + double (*DDQ)[MD1][MQ1] = (double (*)[MD1][MQ1])sm1; \ + double (*DQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1])sm0; \ + double (*QQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1])sm1; \ + double (*QQD)[MQ1][MD1] = (double (*)[MQ1][MD1])sm0; \ + double (*QDD)[MD1][MD1] = (double (*)[MD1][MD1])sm1; \ + RAJA_TEAM_SHARED int thread_dofs[MD1 * MD1 * MD1]; + +#define MASS3DPA_ATOMIC_1 \ + int j = dx + mpa3d_at::D1D * (dy + dz * mpa3d_at::D1D); \ + thread_dofs[j] = \ + ElemToDoF[j + mpa3d_at::D1D * mpa3d_at::D1D * mpa3d_at::D1D * e]; \ + sm_X[dz][dy][dx] = \ + X[thread_dofs[j]]; // missing dof_map for lexicographical ordering + +#define MASS3DPA_ATOMIC_2 \ + sm_B[q][d] = MPAT_B(q, d); \ + sm_Bt[d][q] = sm_B[q][d]; + +//flop counts +//2 * D1D +#define MASS3DPA_ATOMIC_3 \ + double u = 0.0; \ + for (int dx = 0; dx < mpa3d_at::D1D; ++dx) { \ + u += sm_X[dz][dy][dx] * sm_B[qx][dx]; \ + } \ + DDQ[dz][dy][qx] = u; + +//2 * D1D +#define MASS3DPA_ATOMIC_4 \ + double u = 0.0; \ + for (int dy = 0; dy < mpa3d_at::D1D; ++dy) { \ + u += DDQ[dz][dy][qx] * sm_B[qy][dy]; \ + } \ + DQQ[dz][qy][qx] = u; + +//2 * D1D + 1 +#define MASS3DPA_ATOMIC_5 \ + double u = 0.0; \ + for (int dz = 0; dz < mpa3d_at::D1D; ++dz) { \ + u += DQQ[dz][qy][qx] * sm_B[qz][dz]; \ + } \ + QQQ[qz][qy][qx] = u * MPAT_D(qx, qy, qz, e); + +//2 * Q1D +#define MASS3DPA_ATOMIC_6 \ + double u = 0.0; \ + for (int qx = 0; qx < mpa3d_at::Q1D; ++qx) { \ + u += QQQ[qz][qy][qx] * sm_Bt[dx][qx]; \ + } \ + QQD[qz][qy][dx] = u; + +//2 * Q1D +#define MASS3DPA_ATOMIC_7 \ + double u = 0.0; \ + for (int qy = 0; qy < mpa3d_at::Q1D; ++qy) { \ + u += QQD[qz][qy][dx] * sm_Bt[dy][qy]; \ + } \ + QDD[qz][dy][dx] = u; + +//2 * Q1D + 1 +#define MASS3DPA_ATOMIC_8 \ + double u = 0.0; \ + for (int qz = 0; qz < mpa3d_at::Q1D; ++qz) { \ + u += QDD[qz][dy][dx] * sm_Bt[dz][qz]; \ + } \ + const int j = dx + mpa3d_at::D1D * (dy + dz * mpa3d_at::D1D); \ + RAJA::atomicAdd(&Y[thread_dofs[j]], u); // atomic add + +namespace rajaperf { +class RunParams; + +namespace apps { + +// Helper function to get global node ID for structured 3D grid +inline int nodeID(int ix, int iy, int iz, int num_nodes_x, int num_nodes_y, + int num_nodes_z) { + return ix + num_nodes_x * (iy + num_nodes_y * iz); +} + +/** + * Build element-to-DOF connectivity for a structured 3D hex mesh + * with arbitrary polynomial order p and 1 DOF per node. + * + * Inputs: + * Nx, Ny, Nz : number of elements in x, y, z directions + * p : polynomial order (>=1) + * + * Outputs: + * elem_to_dofs : size = num_elems + * each entry is a vector of size (p+1)^3 + * containing the global DOF indices of that element + * + * Element numbering: + * elem_id = ex + Nx * (ey + Ny * ez) + */ +inline void +buildElemToDofTable(int Nx, int Ny, int Nz, int p, + Index_ptr elemToDof) // output buffer, must be preallocated +{ + const int num_nodes_x = Nx * p + 1; + const int num_nodes_y = Ny * p + 1; + const int num_nodes_z = Nz * p + 1; + + const int ndof_per_elem = (p + 1) * (p + 1) * (p + 1); + + // Loop over elements + for (int ez = 0; ez < Nz; ++ez) { + for (int ey = 0; ey < Ny; ++ey) { + for (int ex = 0; ex < Nx; ++ex) { + // Global element index (row in elemToDof) + int e = ex + Nx * (ey + Ny * ez); + + // Pointer to start of this element's DOF list + Index_ptr row = elemToDof + e * ndof_per_elem; + + int local = 0; + + // Loop over local nodes of the element + for (int kz = 0; kz <= p; ++kz) { + int iz = ez * p + kz; + for (int ky = 0; ky <= p; ++ky) { + int iy = ey * p + ky; + for (int kx = 0; kx <= p; ++kx) { + int ix = ex * p + kx; + + int nodeID = ix + num_nodes_x * (iy + num_nodes_y * iz); + + // Scalar DOF per node, so dofID == nodeID + int dofID = nodeID; + + row[local++] = dofID; + } + } + } + } + } + } +} + +class MASS3DPA_ATOMIC : public KernelBase { +public: + MASS3DPA_ATOMIC(const RunParams ¶ms); + + ~MASS3DPA_ATOMIC(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void defineSeqVariantTunings(); + void defineOpenMPVariantTunings(); + void defineCudaVariantTunings(); + void defineHipVariantTunings(); + void defineSyclVariantTunings(); + + void runSeqVariant(VariantID vid); + void runOpenMPVariant(VariantID vid); + + template void runCudaVariantImpl(VariantID vid); + template void runHipVariantImpl(VariantID vid); + template void runSyclVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = mpa3d_at::Q1D * mpa3d_at::Q1D * mpa3d_at::Q1D; + using gpu_block_sizes_type = integer::list_type; + + Real_ptr m_B; + Real_ptr m_Bt; + Real_ptr m_D; + Real_ptr m_X; + Real_ptr m_Y; + + Index_type m_Nx; // zones in x dimension + Index_type m_Ny; // zones in y dimension + Index_type m_Nz; // zones in z dimension + Index_type m_P; // polynomial order + Index_type m_Tot_Dofs; // total number of dofs + + Index_ptr m_ElemToDoF; + + Index_type m_NE; + Index_type m_DOF_default; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 04e3c7147..0a9d9c2c1 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -95,6 +95,7 @@ #include "apps/LTIMES_NOVIEW.hpp" #include "apps/MASS3DEA.hpp" #include "apps/MASS3DPA.hpp" +#include "apps/MASS3DPA_ATOMIC.hpp" #include "apps/MASSVEC3DPA.hpp" #include "apps/MATVEC_3D_STENCIL.hpp" #include "apps/NODAL_ACCUMULATION_3D.hpp" @@ -254,6 +255,7 @@ static const std::string KernelNames [] = std::string("Apps_LTIMES_NOVIEW"), std::string("Apps_MASS3DEA"), std::string("Apps_MASS3DPA"), + std::string("Apps_MASS3DPA_ATOMIC"), std::string("Apps_MASSVEC3DPA"), std::string("Apps_MATVEC_3D_STENCIL"), std::string("Apps_NODAL_ACCUMULATION_3D"), @@ -1108,6 +1110,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::MASS3DPA(run_params); break; } + case Apps_MASS3DPA_ATOMIC : { + kernel = new apps::MASS3DPA_ATOMIC(run_params); + break; + } case Apps_MASSVEC3DPA : { kernel = new apps::MASSVEC3DPA(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 892721f40..a993a5680 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -155,6 +155,7 @@ enum KernelID { Apps_LTIMES_NOVIEW, Apps_MASS3DEA, Apps_MASS3DPA, + Apps_MASS3DPA_ATOMIC, Apps_MASSVEC3DPA, Apps_MATVEC_3D_STENCIL, Apps_NODAL_ACCUMULATION_3D,