diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 598131623..4c7e18add 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -97,6 +97,9 @@ blt_add_executable( basic/INIT_VIEW1D_OFFSET.cpp basic/INIT_VIEW1D_OFFSET-Seq.cpp basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp + basic/MAT_FUSED_MUL_ADD.cpp + basic/MAT_FUSED_MUL_ADD-Seq.cpp + basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp basic/MAT_MAT_SHARED.cpp basic/MAT_MAT_SHARED-Seq.cpp basic/MAT_MAT_SHARED-OMPTarget.cpp diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index ceeb1a502..8d2cee87b 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -56,6 +56,12 @@ blt_add_library( INIT_VIEW1D_OFFSET-Cuda.cpp INIT_VIEW1D_OFFSET-OMP.cpp INIT_VIEW1D_OFFSET-OMPTarget.cpp + MAT_FUSED_MUL_ADD.cpp + MAT_FUSED_MUL_ADD-Seq.cpp + MAT_FUSED_MUL_ADD-Hip.cpp + MAT_FUSED_MUL_ADD-Cuda.cpp + MAT_FUSED_MUL_ADD-OMP.cpp + MAT_FUSED_MUL_ADD-OMPTarget.cpp MAT_MAT_SHARED.cpp MAT_MAT_SHARED-Seq.cpp MAT_MAT_SHARED-Hip.cpp diff --git a/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp new file mode 100644 index 000000000..87966915c --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Cuda.cpp @@ -0,0 +1,155 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA \ + const Index_type N = m_N; \ + allocAndInitCudaDeviceData(A, m_A, N); \ + allocAndInitCudaDeviceData(B, m_B, N); \ + allocAndInitCudaDeviceData(D, m_D, N); + + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_A, A, N); \ + getCudaDeviceData(m_B, B, N); \ + getCudaDeviceData(m_D, D, N); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(D); + + +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + Index_type N){ +constexpr Index_type Ne = 16; +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + MAT_FUSED_MUL_ADD_BODY; +} +} +template < Index_type block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void mat_fused_lam(Index_type N, Lambda body) +{ +constexpr Index_type Ne = 16; +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } +} +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runCudaVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + constexpr Index_type Ne = 16; + const Index_type N_Elem = N/(Ne*Ne); + + constexpr Index_type block_x = gpu_block_size::sqrt(block_size); + constexpr Index_type block_y = gpu_block_size::sqrt(block_size); + dim3 blockDim(block_x, block_y); + dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(1)); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + + if (vid == Base_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + mat_fused_mul_add<<>>(A, B, D, N); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == Lambda_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto mat_fused_lamda = + [=] __device__ (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }; + mat_fused_lam + <<>>(N, mat_fused_lamda); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_CUDA; + + startTimer(); + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, N_Elem); + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<2, RAJA::cuda_block_z_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct, + RAJA::statement::For<1, RAJA::cuda_thread_y_direct, + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { + MAT_FUSED_MUL_ADD_BODY; + }); + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_CUDA; + + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Cuda variant id = " << vid + << std::endl; + } +} + +RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MAT_FUSED_MUL_ADD, Cuda) + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp new file mode 100644 index 000000000..b8a0d286b --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Hip.cpp @@ -0,0 +1,304 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include "common/HipDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_HIP \ + const Index_type N = m_N; \ + allocAndInitHipDeviceData(A, m_A, N); \ + allocAndInitHipDeviceData(B, m_B, N); \ + allocAndInitHipDeviceData(D, m_D, N); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP \ + getHipDeviceData(m_A, A, N); \ + getHipDeviceData(m_B, B, N); \ + getHipDeviceData(m_D, D, N); \ + deallocHipDeviceData(A); \ + deallocHipDeviceData(B); \ + deallocHipDeviceData(D); + +__global__ void mat_fused_mul_add_builtin(const Real_ptr A, const Real_ptr B, Real_ptr D, const Index_type N){ + constexpr Index_type Ne = 16; + const Index_type N_Elem = N/(Ne*Ne); + for(Index_type ii = 0; ii != N_Elem; ++ii){ + using real4 = __attribute__((__vector_size__(4 * sizeof(Real_type)))) Real_type; + real4 result = {0}; + + Index_type a_idx = Ne * threadIdx.x + threadIdx.y + ii*(Ne*Ne); + Index_type b_idx = threadIdx.x + Ne * threadIdx.y + ii*(Ne*Ne); + + for(Index_type i = 0; i < 4; ++i){ + Real_type a = A[a_idx]; + Real_type b = B[b_idx]; + +#ifdef __gfx90a__ +#if defined(RP_USE_DOUBLE) + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, result, 0, 0, 0); +#elif defined(RP_USE_FLOAT) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#endif +#endif + +#ifdef __gfx908__ +#if defined(RP_USE_FLOAT) + result = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, result, 0, 0, 0); +#endif +#endif + a_idx += 4; // move two columns to the right + b_idx += 4*Ne; // move two rows down + } + + RAJA_UNROLL_COUNT(4) + for(Index_type i = 0; i < 4; ++i){ + const Index_type d_idx = threadIdx.x + + Ne * (threadIdx.y + 4 * i); + D[d_idx + ii*(Ne*Ne)] = result[i]; + } +} +} +//Reference for cases with no hardware support +template < Index_type block_size > +__launch_bounds__(block_size) +__global__ void mat_fused_mul_add(const Real_ptr A, const Real_ptr B, Real_ptr D, + const Index_type N){ +constexpr Index_type Ne = 16; +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + MAT_FUSED_MUL_ADD_BODY; +} +} +template < Index_type block_size, typename Lambda > +__launch_bounds__(block_size) +__global__ void mat_fused_lam(const Index_type N, Lambda body) +{ +constexpr Index_type Ne = 16; +const Index_type N_Elem = N/(Ne*Ne); +for(Index_type ii = 0; ii != N_Elem; ++ii){ + Index_type col = threadIdx.x + blockIdx.x * blockDim.x; + Index_type row = threadIdx.y + blockIdx.y * blockDim.y; + body(ii,col,row); + } +} +void MAT_FUSED_MUL_ADD::runHipVariantBuiltin(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + const Index_type N = m_N; + constexpr Index_type Ne = m_Ne; + const Index_type N_Elem = N/(Ne*Ne); + + dim3 gridDim (1, 1, 1); + dim3 blockDimBuiltin(Ne, 4, 1); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + + if (vid == Base_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + hipLaunchKernelGGL((mat_fused_mul_add_builtin), dim3(gridDim), dim3(blockDimBuiltin), 0, 0, A, B, D, iend); + hipErrchk( hipGetLastError() ); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} +template < size_t block_size > +void MAT_FUSED_MUL_ADD::runHipVariantImpl(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + const Index_type N = m_N; + constexpr Index_type Ne = m_Ne; + const Index_type N_Elem = N/(Ne*Ne); + + constexpr Index_type block_x = gpu_block_size::sqrt(block_size); + constexpr Index_type block_y = gpu_block_size::sqrt(block_size); + dim3 blockDim(block_x, block_y); + dim3 gridDim(static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(RAJA_DIVIDE_CEILING_INT(Ne, block_size)), + static_cast(1)); + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + + if (vid == Base_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + hipLaunchKernelGGL((mat_fused_mul_add), dim3(gridDim), dim3(blockDim), 0, 0, A, B, D, iend); + hipErrchk( hipGetLastError() ); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + } else if (vid == Lambda_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + auto mat_fused_lamda = + [=] __device__ (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }; + hipLaunchKernelGGL((mat_fused_lam), + dim3(gridDim), dim3(blockDim), 0, 0, + iend, mat_fused_lamda); + hipErrchk( hipGetLastError() ); + } + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else if (vid == RAJA_HIP) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_HIP; + + startTimer(); + RAJA::RangeSegment ii_range(0, N_Elem); + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + + using EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<2, RAJA::hip_block_z_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_direct, + RAJA::statement::For<1, RAJA::hip_thread_y_direct, + RAJA::statement::For<0, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + RAJA::kernel(RAJA::make_tuple(row_range, col_range, ii_range), + [=] RAJA_DEVICE (Index_type row, Index_type col, Index_type ii) { + MAT_FUSED_MUL_ADD_BODY; + }); + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_HIP; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid + << std::endl; + } +} +namespace{ + +bool builtinSupported() +{ + std::string hipArch = getHipArch(); +#if defined(RP_USE_DOUBLE) + if (hipArch=="gfx90a") + return true; +#endif +#if defined(RP_USE_FLOAT) + if (hipArch=="gfx90a" || hipArch=="gfx908") + return true; +#endif +return false; +} +} +void MAT_FUSED_MUL_ADD::runHipVariant(VariantID vid, size_t tune_idx) +{ + bool builtin_supported = builtinSupported(); + + size_t t = 0; + if ( vid == Base_HIP && builtin_supported) { + + if (tune_idx == t) { + + runHipVariantBuiltin(vid); + + } + + t += 1; + } + if ( (vid == Base_HIP) || (vid == RAJA_HIP) || (vid == Lambda_HIP)){ + + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0 || + run_params.validGPUBlockSize(block_size)) { + + if (tune_idx == t) { + + runHipVariantImpl(vid); + + } + + t += 1; + + } + + }); + } + else { + + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown Hip variant id = " << vid << std::endl; + + } + +} + +void MAT_FUSED_MUL_ADD::setHipTuningDefinitions(VariantID vid) +{ + bool builtin_supported = builtinSupported(); + if ( vid == Base_HIP ) { + + if (builtin_supported) { + addVariantTuningName(vid, "builtin"); + } + } + seq_for(gpu_block_sizes_type{}, [&](auto block_size) { + + if (run_params.numValidGPUBlockSize() == 0u || + run_params.validGPUBlockSize(block_size)) { + + addVariantTuningName(vid, "block_"+std::to_string(block_size)); + } + + }); + +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_HIP diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp new file mode 100644 index 000000000..6b417faae --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMP.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { +#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP) + + const Index_type run_reps = getRunReps(); + const Index_type iend = getActualProblemSize(); + const Index_type N = m_N; + const Index_type Ne = m_Ne; + const Index_type N_Elem = (N/(Ne*Ne)); + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + + switch (vid) { + + case Base_OpenMP: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for + for(Index_type ii = 0; ii < N_Elem; ++ii){ + for(Index_type row = 0; row < Ne; ++row){ + for(Index_type col = 0; col < Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY; + } + } + } + + + } + stopTimer(); + + break; + } + + case Lambda_OpenMP: { + auto mat_fused_base_lam = [=](Index_type ii, Index_type row, Index_type col){ + MAT_FUSED_MUL_ADD_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for + for(Index_type ii = 0; ii < N_Elem; ++ii){ + for(Index_type row = 0; row < Ne; ++row){ + for(Index_type col = 0; col < Ne; ++col){ + mat_fused_base_lam(ii, row, col); + } + } + } + + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP: { + + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, N_Elem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall( ii_range, [=](int ii) { + RAJA::forall( row_range, [=](int row) { + RAJA::forall( col_range, [=](int col) { + MAT_FUSED_MUL_ADD_BODY; + }); + }); + }); + } // loop over kernel reps + stopTimer(); + + break; + } + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } + +#else + RAJA_UNUSED_VAR(vid); +#endif +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp new file mode 100644 index 000000000..18e7a140e --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-OMPTarget.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + + // + // Define threads per team for target execution + // + const size_t threads_per_team = 256; + +#define MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ + const Index_type N = m_N; \ + constexpr Index_type Ne = m_Ne; \ + const Index_type N_Elem = (N/(Ne*Ne); \ + allocAndInitOpenMPDeviceData(A, m_A, N, did, hid); \ + allocAndInitOpenMPDeviceData(B, m_B, N, did, hid); \ + allocAndInitOpenMPDeviceData(D, m_D, N, did, hid); + +#define MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_A, A, N, hid, did); \ + getOpenMPDeviceData(m_B, B, N, hid, did); \ + getOpenMPDeviceData(m_D, D, N, hid, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(B, did); \ + deallocOpenMPDeviceData(D, did); + + +void MAT_FUSED_MUL_ADD::runOpenMPTargetVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + + if ( vid == Base_OpenMPTarget ) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp target is_device_ptr(A, B, D) device( did ) + #pragma omp teams distribute parallel for schedule(static, 1) collapse(2) + for(Index_type ii = 0; ii != N_Elem; ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY; + } + } + } + } + + stopTimer(); + + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + MAT_FUSED_MUL_ADD_DATA_SETUP_OMP_TARGET; + + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, N_Elem); + + + using EXEC_POL = + RAJA::KernelPolicy< + + RAJA::statement::For<0, RAJA::seq_exec, // ii + RAJA::statement::Collapse, // row, col + RAJA::statement::Lambda<0> + > + > + >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::kernel( RAJA::make_tuple(ii_range, + row_range, + col_range), + [=] (Index_type ii, Index_type row, Index_type col) { + MAT_FUSED_MUL_ADD_BODY; + }); + MAT_FUSED_MUL_ADD_DATA_TEARDOWN_OMP_TARGET; + + } else { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown OMP Target variant id = " << vid << std::endl; + } +} + } + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp new file mode 100644 index 000000000..14f783f66 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD-Seq.cpp @@ -0,0 +1,98 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include + +namespace rajaperf { +namespace basic { + +void MAT_FUSED_MUL_ADD::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + const Index_type run_reps = getRunReps(); + const Index_type N = m_N; + constexpr Index_type Ne = m_Ne; + const Index_type N_Elem = N/(Ne*Ne); + + MAT_FUSED_MUL_ADD_DATA_SETUP; + + MAT_FUSED_MUL_ADD_DATA_INIT; + switch (vid) { + + case Base_Seq: { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for(Index_type ii = 0; ii != N_Elem; ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + MAT_FUSED_MUL_ADD_BODY; + } + } + } + + } // number of iterations + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq: { + + auto mat_fused_lam = [=](Index_type ii, Index_type row, Index_type col){ + MAT_FUSED_MUL_ADD_BODY; + }; + + startTimer(); + for (Index_type irep = 0; irep < run_reps; ++irep) { + for(Index_type ii = 0; ii != N_Elem; ++ii){ + for(Index_type row = 0; row != Ne; ++row){ + for(Index_type col = 0; col != Ne; ++col){ + mat_fused_lam(ii,row,col); + } + } + } + + } // irep + stopTimer(); + + break; + } + + case RAJA_Seq: { + RAJA::RangeSegment row_range(0, Ne); + RAJA::RangeSegment col_range(0, Ne); + RAJA::RangeSegment ii_range(0, N_Elem); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall( ii_range, [=](int ii) { + RAJA::forall( row_range, [=](int row) { + RAJA::forall( col_range, [=](int col) { + MAT_FUSED_MUL_ADD_BODY; + }); + }); + }); + } // loop over kernel reps + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default: { + getCout() << "\n MAT_FUSED_MUL_ADD : Unknown variant id = " << vid + << std::endl; + } + } +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.cpp b/src/basic/MAT_FUSED_MUL_ADD.cpp new file mode 100644 index 000000000..f7e40d3a8 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.cpp @@ -0,0 +1,84 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MAT_FUSED_MUL_ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf { +namespace basic { + +MAT_FUSED_MUL_ADD::MAT_FUSED_MUL_ADD(const RunParams ¶ms) + : KernelBase(rajaperf::Basic_MAT_FUSED_MUL_ADD, params) +{ + m_N_default = 1000; + setDefaultProblemSize(m_N_default*m_N_default); + setDefaultReps(5); + + //Make sure problem target size is divisible by 16*16 + m_N = RAJA_DIVIDE_CEILING_INT(Index_type(getTargetProblemSize()),Index_type(m_Ne*m_Ne))*Index_type(m_Ne*m_Ne); + setActualProblemSize(m_N); + + setItsPerRep(getActualProblemSize()); + setKernelsPerRep(1); + + setBytesPerRep(2*m_N*sizeof(Real_type)); + setFLOPsPerRep(2*m_N*m_Ne); + + + checksum_scale_factor = 1e-6 * + ( static_cast(getDefaultProblemSize()) / + getActualProblemSize() ); + + + + setVariantDefined(Base_Seq); + setVariantDefined(Lambda_Seq); + setVariantDefined(RAJA_Seq); + + setVariantDefined(Base_OpenMP); + setVariantDefined(Lambda_OpenMP); + setVariantDefined(RAJA_OpenMP); + + setVariantDefined(Base_CUDA); + setVariantDefined(Lambda_CUDA); + setVariantDefined(RAJA_CUDA); + + setVariantDefined(Base_HIP); + setVariantDefined(Lambda_HIP); + setVariantDefined(RAJA_HIP); +} + +MAT_FUSED_MUL_ADD::~MAT_FUSED_MUL_ADD() {} + +void MAT_FUSED_MUL_ADD::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + + allocAndInitDataConst(m_A, m_N, 1.0, vid); + allocAndInitDataConst(m_B, m_N, 1.0, vid); + allocAndInitDataConst(m_D, m_N, 0.0, vid); + +} + +void MAT_FUSED_MUL_ADD::updateChecksum(VariantID vid, size_t tune_idx) { + checksum[vid][tune_idx] += calcChecksum(m_D, m_N, checksum_scale_factor ); +} + +void MAT_FUSED_MUL_ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) { + (void)vid; + deallocData(m_A); + deallocData(m_B); + deallocData(m_D); + +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/MAT_FUSED_MUL_ADD.hpp b/src/basic/MAT_FUSED_MUL_ADD.hpp new file mode 100644 index 000000000..01aae2601 --- /dev/null +++ b/src/basic/MAT_FUSED_MUL_ADD.hpp @@ -0,0 +1,91 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-20, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Compute D = A x B + C, where +// Inputs: +// A: N_Elem x (Ne x Ne) matrices +// B: N_Elem x (Ne x Ne) matrices +// Ouput: +// D: N/(Ne*Ne) Ne x Ne matrices +// All square row-major matrices, C is ignored. +//for(Index_type ii = 0; ii != N_Elem; ++ii){ +// for(Index_type row = 0; row != Ne; ++row){ +// for(Index_type col = 0; col != Ne; ++col){ +// MAT_FUSED_MUL_ADD_BODY; +// } +// } +//} +#ifndef RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP +#define RAJAPerf_Basic_MAT_FUSED_MUL_ADD_HPP + +#include "RAJA/RAJA.hpp" +#include "common/KernelBase.hpp" + +#define MAT_FUSED_MUL_ADD_DATA_INIT \ +for(Index_type ii = 0; ii != N_Elem; ++ii){ \ + for(Index_type i = 0; i != Ne*Ne; ++i){ m_A[i+(ii*Ne*Ne)] = i; } \ + for(Index_type i = 0; i != Ne*Ne; ++i){ m_B[i+(ii*Ne*Ne)] = (Ne*Ne) - 1 - i; } \ +} + +#define MAT_FUSED_MUL_ADD_DATA_SETUP \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + Real_ptr D = m_D; + +#define MAT_FUSED_MUL_ADD_BODY \ + Real_type dot = 0; \ + for (Index_type k = 0; k < Ne; ++k) { \ + dot += A[row*Ne + k + ii*(Ne*Ne)] * B[k*Ne + col + ii*(Ne*Ne)]; \ + } \ + D[row*Ne + col + ii*(Ne*Ne)] = dot; \ + +namespace rajaperf { +class RunParams; + +namespace basic { + +class MAT_FUSED_MUL_ADD : public KernelBase { +public: + MAT_FUSED_MUL_ADD(const RunParams ¶ms); + + ~MAT_FUSED_MUL_ADD(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + void runHipVariantBuiltin(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type; + + Real_ptr m_A; + Real_ptr m_B; + Real_ptr m_D; + + Index_type m_N; + Index_type m_N_default; + static constexpr Index_type m_Ne = 16; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/HipDataUtils.hpp b/src/common/HipDataUtils.hpp index a3871d31e..782fd7a1f 100644 --- a/src/common/HipDataUtils.hpp +++ b/src/common/HipDataUtils.hpp @@ -178,6 +178,15 @@ void deallocHipPinnedData(T& pptr) pptr = nullptr; } +static inline std::string getHipArch() +{ + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + std::string gcnArchName(devProp.gcnArchName); + std::string hipArch = gcnArchName.substr(0, 7); + if(hipArch.back() == ':' ) hipArch.pop_back(); + return hipArch; +} } // closing brace for rajaperf namespace diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 9f66f4bf3..32c0eedfc 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -25,6 +25,7 @@ #include "basic/INIT3.hpp" #include "basic/INIT_VIEW1D.hpp" #include "basic/INIT_VIEW1D_OFFSET.hpp" +#include "basic/MAT_FUSED_MUL_ADD.hpp" #include "basic/MAT_MAT_SHARED.hpp" #include "basic/MULADDSUB.hpp" #include "basic/NESTED_INIT.hpp" @@ -159,6 +160,7 @@ static const std::string KernelNames [] = std::string("Basic_INIT3"), std::string("Basic_INIT_VIEW1D"), std::string("Basic_INIT_VIEW1D_OFFSET"), + std::string("Basic_MAT_FUSED_MUL_ADD"), std::string("Basic_MAT_MAT_SHARED"), std::string("Basic_MULADDSUB"), std::string("Basic_NESTED_INIT"), @@ -537,6 +539,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::INIT_VIEW1D_OFFSET(run_params); break; } + case Basic_MAT_FUSED_MUL_ADD : { + kernel = new basic::MAT_FUSED_MUL_ADD(run_params); + break; + } case Basic_MAT_MAT_SHARED : { kernel = new basic::MAT_MAT_SHARED(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 61a6f3bef..15641ce01 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -78,6 +78,7 @@ enum KernelID { Basic_INIT3, Basic_INIT_VIEW1D, Basic_INIT_VIEW1D_OFFSET, + Basic_MAT_FUSED_MUL_ADD, Basic_MAT_MAT_SHARED, Basic_MULADDSUB, Basic_NESTED_INIT,