From c463bf15da5466d02352787dd12b439efbb429ad Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 25 Nov 2025 07:24:28 -0800 Subject: [PATCH 1/3] add new variants --- src/apps/MASSVEC3DPA-Hip.cpp | 249 ++++++++++++++++++++++++++++++++++- 1 file changed, 246 insertions(+), 3 deletions(-) diff --git a/src/apps/MASSVEC3DPA-Hip.cpp b/src/apps/MASSVEC3DPA-Hip.cpp index 6627f1b96..5fae78e24 100644 --- a/src/apps/MASSVEC3DPA-Hip.cpp +++ b/src/apps/MASSVEC3DPA-Hip.cpp @@ -135,6 +135,72 @@ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +__launch_bounds__(block_size) __global__ +void MassVec3DPA_BLOCKDIM_COPY_LOOP_INC(const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, + Real_ptr Y, + const Index_type runtime_block_size) +{ + + const Index_type e = blockIdx.x; + + const int bdx = blockDim.x; //block size is the same for xyz... + //const int bdy = blockDim.y; + //const int bdz = blockDim.z; + + MASSVEC3DPA_0_GPU; + + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx) { + MASSVEC3DPA_1; + } + + for (Index_type c = 0; c < 3; ++c) { + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_2; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_3; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_4; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + bdx) { + MASSVEC3DPA_5; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + bdx) { + MASSVEC3DPA_6; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, + bdx) { + MASSVEC3DPA_7; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_8; + } + __syncthreads(); + + } // (c) dimension loop +} + template __launch_bounds__(block_size) __global__ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, @@ -294,6 +360,20 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) } else if (tune_idx == 2) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + constexpr size_t shmem = 0; + + RPlaunchHipKernel((MassVec3DPA_BLOCKDIM_COPY_LOOP_INC), NE, + nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + X, Y, static_cast(MVPA_Q1D)); + } + stopTimer(); + + } else if (tune_idx == 3) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { @@ -306,7 +386,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) } stopTimer(); - } else if (tune_idx == 3) { + } else if (tune_idx == 4) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { @@ -491,10 +571,171 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) } // loop over kernel reps stopTimer(); - } + } //tune_idx == 0 if (tune_idx == 1) { + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASSVEC3DPA_0_GPU + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type q) { + MASSVEC3DPA_1; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + for (Index_type c = 0; c < 3; ++c) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_2; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + } // c - dim loop + } // lambda (e) + ); // RAJA::loop + } // outer lambda (ctx) + ); // RAJA::launch + + } // loop over kernel reps + stopTimer(); + } //tune_idx == 1 + + + if (tune_idx == 2) { + using inner_x = RAJA::LoopPolicy>; using inner_y = RAJA::LoopPolicy>; @@ -654,7 +895,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) stopTimer(); } - if (tune_idx == 2) { + if (tune_idx == 3) { using inner_x = RAJA::LoopPolicy; @@ -850,12 +1091,14 @@ void MASSVEC3DPA::setHipTuningDefinitions(VariantID vid) if (vid == Base_HIP) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); addVariantTuningName(vid, "ARGUMENT_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } if (vid == RAJA_HIP) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } From 867d522f209e511fad3f99c5be82c290855a8933 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 2 Dec 2025 09:52:49 -0800 Subject: [PATCH 2/3] template cuda variant to reduce duplicate code --- src/apps/MASSVEC3DPA-Cuda.cpp | 710 +++++++++++++------------------- src/apps/MASSVEC3DPA-Hip.cpp | 740 ++++++++-------------------------- src/apps/MASSVEC3DPA.hpp | 3 + 3 files changed, 437 insertions(+), 1016 deletions(-) diff --git a/src/apps/MASSVEC3DPA-Cuda.cpp b/src/apps/MASSVEC3DPA-Cuda.cpp index 56bb6a3ab..bec28997b 100644 --- a/src/apps/MASSVEC3DPA-Cuda.cpp +++ b/src/apps/MASSVEC3DPA-Cuda.cpp @@ -76,9 +76,9 @@ void MassVec3DPA_BLOCKDIM_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, - Real_ptr Y, - const Index_type runtime_block_size) + const Real_ptr D, const Real_ptr X, + Real_ptr Y, + const Index_type runtime_block_size) { const Index_type e = blockIdx.x; @@ -135,6 +135,72 @@ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +__launch_bounds__(block_size) __global__ +void MassVec3DPA_BLOCKDIM_COPY_LOOP_INC(const Real_ptr B, const Real_ptr Bt, + const Real_ptr D, const Real_ptr X, + Real_ptr Y, + const Index_type runtime_block_size) +{ + + const Index_type e = blockIdx.x; + + const int bdx = blockDim.x; //block size is the same for xyz... + //const int bdy = blockDim.y; + //const int bdz = blockDim.z; + + MASSVEC3DPA_0_GPU; + + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx) { + MASSVEC3DPA_1; + } + + for (Index_type c = 0; c < 3; ++c) { + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_2; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_3; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_4; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, + bdx) { + MASSVEC3DPA_5; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, + bdx) { + MASSVEC3DPA_6; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, + bdx) { + MASSVEC3DPA_7; + } + __syncthreads(); + + GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, + bdx) { + MASSVEC3DPA_8; + } + __syncthreads(); + + } // (c) dimension loop +} + template __launch_bounds__(block_size) __global__ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, @@ -199,8 +265,7 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, - Real_ptr Y) + const Real_ptr D, const Real_ptr X, Real_ptr Y) { const Index_type e = blockIdx.x; @@ -250,6 +315,166 @@ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +void MASSVEC3DPA::runRAJAImpl(RESOURCE &res) +{ + + MASSVEC3DPA_DATA_SETUP; + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy< + RAJA::cuda_launch_t>; + + using outer_x = RAJA::LoopPolicy; + + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASSVEC3DPA_0_GPU + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type q) { + MASSVEC3DPA_1; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + for (Index_type c = 0; c < 3; ++c) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_2; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + } // c - dim loop + } // lambda (e) + ); // RAJA::loop + } // outer lambda (ctx) + ); // RAJA::launch +} + + template void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) { @@ -295,6 +520,20 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) } else if (tune_idx == 2) { + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + dim3 nthreads_per_block(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D); + constexpr size_t shmem = 0; + + RPlaunchCudaKernel((MassVec3DPA_BLOCKDIM_COPY_LOOP_INC), NE, + nthreads_per_block, shmem, res.get_stream(), B, Bt, D, + X, Y, static_cast(MVPA_Q1D)); + } + stopTimer(); + + } else if (tune_idx == 3) { + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { @@ -307,7 +546,7 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) } stopTimer(); - } else if (tune_idx == 3) { + } else if (tune_idx == 4) { startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { @@ -345,157 +584,32 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); - } + } //tune_idx == 0 if (tune_idx == 1) { + using inner_x = RAJA::LoopPolicy; + + using inner_y = RAJA::LoopPolicy; + + using inner_z = RAJA::LoopPolicy; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { + + runRAJAImpl(res); + + } // loop over kernel reps + stopTimer(); + } //tune_idx == 1 + + + if (tune_idx == 2) { + using inner_x = RAJA::LoopPolicy>; using inner_y = RAJA::LoopPolicy>; @@ -505,157 +619,13 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); } - if (tune_idx == 2) { + if (tune_idx == 3) { using inner_x = RAJA::LoopPolicy; @@ -666,149 +636,7 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -851,12 +679,14 @@ void MASSVEC3DPA::setCudaTuningDefinitions(VariantID vid) if (vid == Base_CUDA) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); addVariantTuningName(vid, "ARGUMENT_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } if (vid == RAJA_CUDA) { addVariantTuningName(vid, "BLOCKDIM_LOOP_INC"); + addVariantTuningName(vid, "BLOCKDIM_COPY_LOOP_INC"); addVariantTuningName(vid, "COMPILE_LOOP_INC"); addVariantTuningName(vid, "DIRECT"); } diff --git a/src/apps/MASSVEC3DPA-Hip.cpp b/src/apps/MASSVEC3DPA-Hip.cpp index 5fae78e24..34866e25a 100644 --- a/src/apps/MASSVEC3DPA-Hip.cpp +++ b/src/apps/MASSVEC3DPA-Hip.cpp @@ -315,6 +315,166 @@ void MassVec3DPA_DIRECT(const Real_ptr B, const Real_ptr Bt, } // (c) dimension loop } +template +void MASSVEC3DPA::runRAJAImpl(RESOURCE &res) +{ + + MASSVEC3DPA_DATA_SETUP; + + constexpr bool async = true; + + using launch_policy = RAJA::LaunchPolicy< + RAJA::hip_launch_t>; + + using outer_x = RAJA::LoopPolicy; + + RAJA::launch( + res, + RAJA::LaunchParams(RAJA::Teams(NE), + RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, NE), + [&](Index_type e) { + + MASSVEC3DPA_0_GPU + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type q) { + MASSVEC3DPA_1; + } // lambda (q) + ); // RAJA::loop + } // lambda (d) + ); // RAJA::loop + } // lambda () + ); // RAJA::loop + + for (Index_type c = 0; c < 3; ++c) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_2; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_3; + } // lambda (qx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_4; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qx) { + MASSVEC3DPA_5; + } // lambda (qx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_6; + } // lambda (dx) + ); // RAJA::loop + } // lambda (qy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), + [&](Index_type qz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_7; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (qz) + ); // RAJA::loop + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dz) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), + [&](Index_type dx) { + MASSVEC3DPA_8; + } // lambda (dx) + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } // lambda (dz) + ); // RAJA::loop + + ctx.teamSync(); + + } // c - dim loop + } // lambda (e) + ); // RAJA::loop + } // outer lambda (ctx) + ); // RAJA::launch +} + + template void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) { @@ -424,150 +584,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -584,150 +601,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -745,151 +619,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); @@ -906,149 +636,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1) { - RAJA::launch( - res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(MVPA_Q1D, MVPA_Q1D, MVPA_Q1D)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASSVEC3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), - [&](Index_type) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type q) { - MASSVEC3DPA_1; - } // lambda (q) - ); // RAJA::loop - } // lambda (d) - ); // RAJA::loop - } // lambda () - ); // RAJA::loop - - for (Index_type c = 0; c < 3; ++c) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_2; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_3; - } // lambda (qx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_4; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qx) { - MASSVEC3DPA_5; - } // lambda (qx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_6; - } // lambda (dx) - ); // RAJA::loop - } // lambda (qy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_Q1D), - [&](Index_type qz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_7; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (qz) - ); // RAJA::loop - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dz) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, MVPA_D1D), - [&](Index_type dx) { - MASSVEC3DPA_8; - } // lambda (dx) - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop - } // lambda (dz) - ); // RAJA::loop - - ctx.teamSync(); - - } // c - dim loop - } // lambda (e) - ); // RAJA::loop - } // outer lambda (ctx) - ); // RAJA::launch + runRAJAImpl(res); } // loop over kernel reps stopTimer(); diff --git a/src/apps/MASSVEC3DPA.hpp b/src/apps/MASSVEC3DPA.hpp index 4157d755d..3e4045190 100644 --- a/src/apps/MASSVEC3DPA.hpp +++ b/src/apps/MASSVEC3DPA.hpp @@ -269,6 +269,9 @@ class MASSVEC3DPA : public KernelBase { void setHipTuningDefinitions(VariantID vid); void setSyclTuningDefinitions(VariantID vid); + template + void runRAJAImpl(RESOURCE &res); + template void runCudaVariantImpl(VariantID vid, size_t tune_idx); template From be301b4009cbe74189e8f26d6ffc282967621ae6 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 2 Dec 2025 10:48:31 -0800 Subject: [PATCH 3/3] update to the FEM macros --- src/apps/FEM_MACROS.hpp | 18 ++++---- src/apps/MASSVEC3DPA-Cuda.cpp | 77 ++++++++++++++++++++--------------- src/apps/MASSVEC3DPA-Hip.cpp | 77 ++++++++++++++++++++--------------- 3 files changed, 98 insertions(+), 74 deletions(-) diff --git a/src/apps/FEM_MACROS.hpp b/src/apps/FEM_MACROS.hpp index 03e98d84d..d6154f386 100644 --- a/src/apps/FEM_MACROS.hpp +++ b/src/apps/FEM_MACROS.hpp @@ -81,15 +81,17 @@ for (int ty = threadIdx.y; ty < Ny; ty += blockDim.y) \ for (int tx = threadIdx.x; tx < Nx; tx += blockDim.x) -#define GPU_SHARED_LOOP_2D_INC(tx, ty, Nx, Ny, runtime_blocks_size) \ +#define GPU_SHARED_LOOP_2D_INC(tx, ty, Nx, Ny, runtime_block_size_x, \ + runtime_block_size_y) \ if (threadIdx.z < 1) \ - for (int ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ - for (int tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) - -#define GPU_SHARED_LOOP_3D_INC(tx, ty, tz, Nx, Ny, Nz, runtime_blocks_size) \ - for (int tz = threadIdx.z; tz < Nz; tz += runtime_blocks_size) \ - for (int ty = threadIdx.y; ty < Ny; ty += runtime_blocks_size) \ - for (int tx = threadIdx.x; tx < Nx; tx += runtime_blocks_size) + for (int ty = threadIdx.y; ty < Ny; ty += runtime_block_size_y) \ + for (int tx = threadIdx.x; tx < Nx; tx += runtime_block_size_x) + +#define GPU_SHARED_LOOP_3D_INC(tx, ty, tz, Nx, Ny, Nz, runtime_block_size_x, \ + runtime_block_size_y, runtime_block_size_z) \ + for (int tz = threadIdx.z; tz < Nz; tz += runtime_block_size_z) \ + for (int ty = threadIdx.y; ty < Ny; ty += runtime_block_size_y) \ + for (int tx = threadIdx.x; tx < Nx; tx += runtime_block_size_x) #endif diff --git a/src/apps/MASSVEC3DPA-Cuda.cpp b/src/apps/MASSVEC3DPA-Cuda.cpp index bec28997b..c07d265ca 100644 --- a/src/apps/MASSVEC3DPA-Cuda.cpp +++ b/src/apps/MASSVEC3DPA-Cuda.cpp @@ -78,56 +78,66 @@ __launch_bounds__(block_size) __global__ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y, - const Index_type runtime_block_size) + const Index_type runtime_block_size_x, + const Index_type runtime_block_size_y, + const Index_type runtime_block_size_z) { const Index_type e = blockIdx.x; MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, runtime_block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_8; } __syncthreads(); @@ -138,62 +148,61 @@ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ void MassVec3DPA_BLOCKDIM_COPY_LOOP_INC(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, - Real_ptr Y, - const Index_type runtime_block_size) + const Real_ptr D, const Real_ptr X, + Real_ptr Y) { const Index_type e = blockIdx.x; - const int bdx = blockDim.x; //block size is the same for xyz... - //const int bdy = blockDim.y; - //const int bdz = blockDim.z; + const int bdx = blockDim.x; + const int bdy = blockDim.y; + const int bdz = blockDim.z; MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx, bdy) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_8; } __syncthreads(); @@ -212,49 +221,49 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -514,7 +523,9 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) RPlaunchCudaKernel((MassVec3DPA_ARGUMENT_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + X, Y, static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D)); } stopTimer(); @@ -528,7 +539,7 @@ void MASSVEC3DPA::runCudaVariantImpl(VariantID vid, size_t tune_idx) RPlaunchCudaKernel((MassVec3DPA_BLOCKDIM_COPY_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + X, Y); } stopTimer(); diff --git a/src/apps/MASSVEC3DPA-Hip.cpp b/src/apps/MASSVEC3DPA-Hip.cpp index 34866e25a..b29d3d463 100644 --- a/src/apps/MASSVEC3DPA-Hip.cpp +++ b/src/apps/MASSVEC3DPA-Hip.cpp @@ -78,56 +78,66 @@ __launch_bounds__(block_size) __global__ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, const Real_ptr D, const Real_ptr X, Real_ptr Y, - const Index_type runtime_block_size) + const Index_type runtime_block_size_x, + const Index_type runtime_block_size_y, + const Index_type runtime_block_size_z) { const Index_type e = blockIdx.x; MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, runtime_block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, + runtime_block_size_x, runtime_block_size_y) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - runtime_block_size) { + runtime_block_size_x, runtime_block_size_y, + runtime_block_size_z) { MASSVEC3DPA_8; } __syncthreads(); @@ -138,62 +148,61 @@ void MassVec3DPA_ARGUMENT_LOOP_INC(const Real_ptr B, const Real_ptr Bt, template __launch_bounds__(block_size) __global__ void MassVec3DPA_BLOCKDIM_COPY_LOOP_INC(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, - Real_ptr Y, - const Index_type runtime_block_size) + const Real_ptr D, const Real_ptr X, + Real_ptr Y) { const Index_type e = blockIdx.x; - const int bdx = blockDim.x; //block size is the same for xyz... - //const int bdy = blockDim.y; - //const int bdz = blockDim.z; + const int bdx = blockDim.x; + const int bdy = blockDim.y; + const int bdz = blockDim.z; MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, bdx, bdy) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - bdx) { + bdx, bdy, bdz) { MASSVEC3DPA_8; } __syncthreads(); @@ -212,49 +221,49 @@ void MassVec3DPA_COMPILE_LOOP_INC(const Real_ptr B, const Real_ptr Bt, MASSVEC3DPA_0_GPU; - GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, block_size) { + GPU_SHARED_LOOP_2D_INC(q, d, MVPA_Q1D, MVPA_D1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_1; } for (Index_type c = 0; c < 3; ++c) { GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_2; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, dy, dz, MVPA_Q1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_3; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, dz, MVPA_Q1D, MVPA_Q1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_4; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(qx, qy, qz, MVPA_Q1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_5; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, qy, qz, MVPA_D1D, MVPA_Q1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_6; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, qz, MVPA_D1D, MVPA_D1D, MVPA_Q1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_7; } __syncthreads(); GPU_SHARED_LOOP_3D_INC(dx, dy, dz, MVPA_D1D, MVPA_D1D, MVPA_D1D, - block_size) { + MVPA_Q1D, MVPA_Q1D, MVPA_Q1D) { MASSVEC3DPA_8; } __syncthreads(); @@ -514,7 +523,9 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) RPlaunchHipKernel((MassVec3DPA_ARGUMENT_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + X, Y, static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D), + static_cast(MVPA_Q1D)); } stopTimer(); @@ -528,7 +539,7 @@ void MASSVEC3DPA::runHipVariantImpl(VariantID vid, size_t tune_idx) RPlaunchHipKernel((MassVec3DPA_BLOCKDIM_COPY_LOOP_INC), NE, nthreads_per_block, shmem, res.get_stream(), B, Bt, D, - X, Y, static_cast(MVPA_Q1D)); + X, Y); } stopTimer();