Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/apps/CONVECTION3DPA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
/// Action of 3D convection matrix via partial assembly
///
/// Based on MFEM's/CEED algorithms.
/// Reference implementation
/// https://github.com/mfem/mfem/blob/master/fem/bilininteg_convection_pa.cpp
/// Reference implementation - MFEM-v4.9
/// https://github.com/mfem/mfem/blob/v4.9/fem/integ/bilininteg_convection_kernels.hpp
///
///
/// for(int e = 0; e < NE; ++e) {
Expand Down
50 changes: 25 additions & 25 deletions src/apps/DIFFUSION3DPA-Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,73 +32,73 @@ __global__ void Diffusion3DPA(const Real_ptr Basis,

DIFFUSION3DPA_0_GPU;

GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_1;
}
}
}

if (threadIdx.z == 0) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_2;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_3;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_4;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_5;
}
}
}
__syncthreads();
if (threadIdx.z == 0) {
GPU_FOREACH_THREAD(d, y, DPA_D1D) {
GPU_FOREACH_THREAD(q, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(d, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(q, x, DPA_Q1D) {
DIFFUSION3DPA_6;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_7;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_8;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_9;
}
}
Expand Down
61 changes: 31 additions & 30 deletions src/apps/DIFFUSION3DPA-Hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,81 +28,82 @@ __global__ void Diffusion3DPA(const Real_ptr Basis,
const Real_ptr dBasis, const Real_ptr D,
const Real_ptr X, Real_ptr Y, bool symmetric) {

const Index_type e = hipBlockIdx_x;
const Index_type e = blockIdx.x;

DIFFUSION3DPA_0_GPU;

GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_1;
}
}
}

if (threadIdx.z == 0) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_2;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_3;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_4;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD(qx, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_5;
}
}
}
__syncthreads();
if (threadIdx.z == 0) {
GPU_FOREACH_THREAD(d, y, DPA_D1D) {
GPU_FOREACH_THREAD(q, x, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qx, x, DPA_Q1D) {
DIFFUSION3DPA_6;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(qy, y, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_7;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(qz, z, DPA_Q1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_8;
}
}
}
__syncthreads();
GPU_FOREACH_THREAD(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD(dx, x, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dz, z, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dy, y, DPA_D1D) {
GPU_FOREACH_THREAD_DIRECT(dx, x, DPA_D1D) {
DIFFUSION3DPA_9;
}
}
}

}

template < size_t block_size >
Expand Down Expand Up @@ -248,7 +249,7 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, DPA_Q1D),
[&](Index_type qx) {

DIFFUSION3DPA_5;
DIFFUSION3DPA_5;

} // lambda (qx)
); // RAJA::loop<inner_x>
Expand All @@ -262,9 +263,9 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) {
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1),
[&](Index_type RAJA_UNUSED_ARG(dz)) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, DPA_D1D),
[&](Index_type d) {
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, DPA_Q1D),
[&](Index_type q) {
[&](Index_type qx) {

DIFFUSION3DPA_6;

Expand Down Expand Up @@ -320,7 +321,7 @@ void DIFFUSION3DPA::runHipVariantImpl(VariantID vid) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, DPA_D1D),
[&](Index_type dx) {

DIFFUSION3DPA_9;
DIFFUSION3DPA_9;

} // lambda (dx)
); // RAJA::loop<inner_x>
Expand Down
79 changes: 40 additions & 39 deletions src/apps/DIFFUSION3DPA-OMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,70 +38,71 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) {

DIFFUSION3DPA_0_CPU;

CPU_FOREACH(dz, z, DPA_D1D) {
CPU_FOREACH(dy, y, DPA_D1D) {
CPU_FOREACH(dx, x, DPA_D1D) {
DIFFUSION3DPA_1;
CPU_FOREACH(dz,z,DPA_D1D) {
CPU_FOREACH(dy,y,DPA_D1D) {
CPU_FOREACH(dx,x,DPA_D1D) {
DIFFUSION3DPA_1
}
}
}

CPU_FOREACH(dy, y, DPA_D1D) {
CPU_FOREACH(qx, x, DPA_Q1D) {
DIFFUSION3DPA_2;
CPU_FOREACH(dy,y,DPA_D1D) {
CPU_FOREACH(qx,x,DPA_Q1D) {
DIFFUSION3DPA_2
}
}

CPU_FOREACH(dz, z, DPA_D1D) {
CPU_FOREACH(dy, y, DPA_D1D) {
CPU_FOREACH(qx, x, DPA_Q1D) {
DIFFUSION3DPA_3;
CPU_FOREACH(dz,z,DPA_D1D) {
CPU_FOREACH(dy,y,DPA_D1D) {
CPU_FOREACH(qx,x,DPA_Q1D) {

DIFFUSION3DPA_3
}
}
}

CPU_FOREACH(dz, z, DPA_D1D) {
CPU_FOREACH(qy, y, DPA_Q1D) {
CPU_FOREACH(qx, x, DPA_Q1D) {
DIFFUSION3DPA_4;
CPU_FOREACH(dz,z,DPA_D1D) {
CPU_FOREACH(qy,y,DPA_Q1D) {
CPU_FOREACH(qx,x,DPA_Q1D) {
DIFFUSION3DPA_4
}
}
}

CPU_FOREACH(qz, z, DPA_Q1D) {
CPU_FOREACH(qy, y, DPA_Q1D) {
CPU_FOREACH(qx, x, DPA_Q1D) {
DIFFUSION3DPA_5;
CPU_FOREACH(qz,z,DPA_Q1D) {
CPU_FOREACH(qy,y,DPA_Q1D) {
CPU_FOREACH(qx,x,DPA_Q1D) {
DIFFUSION3DPA_5
}
}
}

CPU_FOREACH(d, y, DPA_D1D) {
CPU_FOREACH(q, x, DPA_Q1D) {
DIFFUSION3DPA_6;
CPU_FOREACH(dy,y,DPA_D1D) {
CPU_FOREACH(qx,x,DPA_Q1D) {
DIFFUSION3DPA_6
}
}

CPU_FOREACH(qz, z, DPA_Q1D) {
CPU_FOREACH(qy, y, DPA_Q1D) {
CPU_FOREACH(dx, x, DPA_D1D) {
DIFFUSION3DPA_7;
CPU_FOREACH(qz,z,DPA_Q1D) {
CPU_FOREACH(qy,y,DPA_Q1D) {
CPU_FOREACH(dx,x,DPA_D1D) {
DIFFUSION3DPA_7
}
}
}

CPU_FOREACH(qz, z, DPA_Q1D) {
CPU_FOREACH(dy, y, DPA_D1D) {
CPU_FOREACH(dx, x, DPA_D1D) {
DIFFUSION3DPA_8;
CPU_FOREACH(qz,z,DPA_Q1D) {
CPU_FOREACH(dy,y,DPA_D1D) {
CPU_FOREACH(dx,x,DPA_D1D) {
DIFFUSION3DPA_8
}
}
}

CPU_FOREACH(dz, z, DPA_D1D) {
CPU_FOREACH(dy, y, DPA_D1D) {
CPU_FOREACH(dx, x, DPA_D1D) {
DIFFUSION3DPA_9;
CPU_FOREACH(dz,z,DPA_D1D) {
CPU_FOREACH(dy,y,DPA_D1D) {
CPU_FOREACH(dx,x,DPA_D1D) {
DIFFUSION3DPA_9
}
}
}
Expand Down Expand Up @@ -221,7 +222,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, DPA_Q1D),
[&](Index_type qx) {

DIFFUSION3DPA_5;
DIFFUSION3DPA_5;

} // lambda (qx)
); // RAJA::loop<inner_x>
Expand All @@ -235,11 +236,11 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) {
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1),
[&](Index_type RAJA_UNUSED_ARG(dz)) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, DPA_D1D),
[&](Index_type d) {
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, DPA_Q1D),
[&](Index_type q) {
[&](Index_type qx) {

DIFFUSION3DPA_6;
DIFFUSION3DPA_6;

} // lambda (qx)
); // RAJA::loop<inner_x>
Expand All @@ -257,7 +258,7 @@ void DIFFUSION3DPA::runOpenMPVariant(VariantID vid) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, DPA_D1D),
[&](Index_type dx) {

DIFFUSION3DPA_7;
DIFFUSION3DPA_7;

} // lambda (dx)
); // RAJA::loop<inner_x>
Expand Down
Loading