diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 03e98d84d..d6154f386 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -81,15 +81,17 @@ for (int ty = threadIdx.y; ty < Ny; ty += blockDim.y) \ for (int tx = threadIdx.x; tx < Nx; tx += blockDim.x) -#define GPU_SHARED_LOOP_2D_INC(tx, ty, Nx, Ny, runtime_blocks_size) \ +#define GPU_SHARED_LOOP_2D_INC(tx, ty, Nx, Ny, runtime_block_size_x, \ + runtime_block_size_y) \ if (threadIdx.z < 1) \ - for (int ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ - for (int tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) - -#define GPU_SHARED_LOOP_3D_INC(tx, ty, tz, Nx, Ny, Nz, runtime_blocks_size) \ - for (int tz = threadIdx.z; tz < Nz; tz += runtime_blocks_size) \ - for (int ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ - for (int tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) + for (int ty = threadIdx.y; ty < Ny; ty += runtime_block_size_y) \ + for (int tx = threadIdx.x; tx < Nx; tx += runtime_block_size_x) + +#define GPU_SHARED_LOOP_3D_INC(tx, ty, tz, Nx, Ny, Nz, runtime_block_size_x, \ + runtime_block_size_y, runtime_block_size_z) \ + for (int tz = threadIdx.z; tz < Nz; tz += runtime_block_size_z) \ + for (int ty = threadIdx.y; ty < Ny; ty += runtime_block_size_y) \ + for (int tx = threadIdx.x; tx < Nx; tx += runtime_block_size_x) #endif diff --git a/src/apps/MASSVEC3DPA-Cuda.cpp b/src/apps/MASSVEC3DPA-Cuda.cpp index 56bb6a3ab..c07d265ca 100644 --- a/src/apps/MASSVEC3DPA-Cuda.cpp +++ b/src/apps/MASSVEC3DPA-Cuda.cpp @@ -76,58 +76,133 @@ void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, - Real_ptr Y, - const Index_type runtime_block_size) + const Real_ptr D, const Real_ptr X, + Real_ptr Y, + const Index_type runtime_block_size_x, + const Index_type runtime_block_size_y, + const Index_type runtime_block_size_z) +{ + + const Index_type e = blockIdx.x; + + MASSVEC3DPA_0_GPU; + + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y) { + MASSVEC3DPA_1; + } + + for (Index_type c = 0; c < 3; ++c) { + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_2; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_3; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_4; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_5; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_6; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_7; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_8; + } + __syncthreads(); + + } // (c) dimension loop +} + +template +__launch_bounds__(block_size) __global__ +void MassVec3DPA_BLOCKDIM_COPY_LOOP_INC(const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, + Real_ptr Y) { const Index_type e = blockIdx.x; + const int bdx = blockDim.x; + const int bdy = blockDim.y; + const int bdz = blockDim.z; + MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, runtime_block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx, bdy) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + bdx, bdy, bdz) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + bdx, bdy, bdz) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - runtime_block_size) { + bdx, bdy, bdz) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + bdx, bdy, bdz) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + bdx, bdy, bdz) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - runtime_block_size) { + bdx, bdy, bdz) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + bdx, bdy, bdz) { MASSVEC3DPA_8; } __syncthreads(); @@ -146,49 +221,49 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -199,8 +274,7 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, - Real_ptr Y) + const Real_ptr D, const Real_ptr X, Real_ptr Y) { const Index_type e = blockIdx.x; @@ -250,6 +324,166 @@ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +void MASSVEC3DPA::runRAJAImpl(RESOURCE &res) +{ + + MASSVEC3DPA_DATA_SETUP; + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy< + RAJA::cuda_launch_t>; + + using outer_x = RAJA::LoopPolicy; + + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASSVEC3DPA_0_GPU + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type q) { + MASSVEC3DPA_1; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + for (Index_type c = 0; c < 3; ++c) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_2; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + } // c - dim loop + } // lambda (e) + ); // RAJA::loop + } // outer lambda (ctx) + ); // RAJA::launch +} + + template void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) { @@ -289,7 +523,9 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) RPlaunchCudaKernel((MassVec3DPA_ARGUMENT_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + X, Y, static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D)); } stopTimer(); @@ -301,7 +537,7 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); constexpr size_t shmem = 0; - RPlaunchCudaKernel((MassVec3DPA_COMPILE_LOOP_INC), NE, + RPlaunchCudaKernel((MassVec3DPA_BLOCKDIM_COPY_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, X, Y); } @@ -309,6 +545,20 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) } else if (tune_idx == 3) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel((MassVec3DPA_COMPILE_LOOP_INC), NE, + nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + X, Y); + } + stopTimer(); + + } else if (tune_idx == 4) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { @@ -345,157 +595,32 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); - } + } //tune_idx == 0 if (tune_idx == 1) { + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + runRAJAImpl(res); + + } // loop over kernel reps + stopTimer(); + } //tune_idx == 1 + + + if (tune_idx == 2) { + using inner_x = RAJA::LoopPolicy>; using inner_y = RAJA::LoopPolicy>; @@ -505,157 +630,13 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); } - if (tune_idx == 2) { + if (tune_idx == 3) { using inner_x = RAJA::LoopPolicy; @@ -666,149 +647,7 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -851,12 +690,14 @@ void MASSVEC3DPA::setCudaTuningDefinitions(VariantID vid) if (vid == Base_CUDA) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); addVariantTuningName(vid, "ARGUMENT_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } if (vid == RAJA_CUDA) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } diff --git a/src/apps/MASSVEC3DPA-Hip.cpp b/src/apps/MASSVEC3DPA-Hip.cpp index 6627f1b96..b29d3d463 100644 --- a/src/apps/MASSVEC3DPA-Hip.cpp +++ b/src/apps/MASSVEC3DPA-Hip.cpp @@ -78,56 +78,131 @@ __launch_bounds__(block_size) __global__ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y, - const Index_type runtime_block_size) + const Index_type runtime_block_size_x, + const Index_type runtime_block_size_y, + const Index_type runtime_block_size_z) { const Index_type e = blockIdx.x; MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, runtime_block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { + MASSVEC3DPA_8; + } + __syncthreads(); + + } // (c) dimension loop +} + +template +__launch_bounds__(block_size) __global__ +void MassVec3DPA_BLOCKDIM_COPY_LOOP_INC(const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, + Real_ptr Y) +{ + + const Index_type e = blockIdx.x; + + const int bdx = blockDim.x; + const int bdy = blockDim.y; + const int bdz = blockDim.z; + + MASSVEC3DPA_0_GPU; + + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx, bdy) { + MASSVEC3DPA_1; + } + + for (Index_type c = 0; c < 3; ++c) { + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + bdx, bdy, bdz) { + MASSVEC3DPA_2; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + bdx, bdy, bdz) { + MASSVEC3DPA_3; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + bdx, bdy, bdz) { + MASSVEC3DPA_4; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + bdx, bdy, bdz) { + MASSVEC3DPA_5; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + bdx, bdy, bdz) { + MASSVEC3DPA_6; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, + bdx, bdy, bdz) { + MASSVEC3DPA_7; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + bdx, bdy, bdz) { MASSVEC3DPA_8; } __syncthreads(); @@ -146,49 +221,49 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -249,6 +324,166 @@ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +void MASSVEC3DPA::runRAJAImpl(RESOURCE &res) +{ + + MASSVEC3DPA_DATA_SETUP; + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy< + RAJA::hip_launch_t>; + + using outer_x = RAJA::LoopPolicy; + + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASSVEC3DPA_0_GPU + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type q) { + MASSVEC3DPA_1; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + for (Index_type c = 0; c < 3; ++c) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_2; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + } // c - dim loop + } // lambda (e) + ); // RAJA::loop + } // outer lambda (ctx) + ); // RAJA::launch +} + + template void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) { @@ -288,7 +523,9 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) RPlaunchHipKernel((MassVec3DPA_ARGUMENT_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + X, Y, static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D)); } stopTimer(); @@ -300,7 +537,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); constexpr size_t shmem = 0; - RPlaunchHipKernel((MassVec3DPA_COMPILE_LOOP_INC), NE, + RPlaunchHipKernel((MassVec3DPA_BLOCKDIM_COPY_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, X, Y); } @@ -308,6 +545,20 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) } else if (tune_idx == 3) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + constexpr size_t shmem = 0; + + RPlaunchHipKernel((MassVec3DPA_COMPILE_LOOP_INC), NE, + nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + X, Y); + } + stopTimer(); + + } else if (tune_idx == 4) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { @@ -344,157 +595,32 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); - } + } //tune_idx == 0 if (tune_idx == 1) { + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + runRAJAImpl(res); + + } // loop over kernel reps + stopTimer(); + } //tune_idx == 1 + + + if (tune_idx == 2) { + using inner_x = RAJA::LoopPolicy>; using inner_y = RAJA::LoopPolicy>; @@ -504,157 +630,13 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); } - if (tune_idx == 2) { + if (tune_idx == 3) { using inner_x = RAJA::LoopPolicy; @@ -665,149 +647,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -850,12 +690,14 @@ void MASSVEC3DPA::setHipTuningDefinitions(VariantID vid) if (vid == Base_HIP) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); addVariantTuningName(vid, "ARGUMENT_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } if (vid == RAJA_HIP) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } diff --git a/src/apps/MASSVEC3DPA.hpp b/src/apps/MASSVEC3DPA.hpp index 4157d755d..3e4045190 100644 --- a/src/apps/MASSVEC3DPA.hpp +++ b/src/apps/MASSVEC3DPA.hpp @@ -269,6 +269,9 @@ class MASSVEC3DPA : public KernelBase { void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template + void runRAJAImpl(RESOURCE &res); + template void runCudaVariantImpl(VariantID vid, size_t tune_idx); template