diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 274497683..932fc15bd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -64,7 +64,7 @@ jobs: build-dir: build options: ENABLE_WARNINGS_AS_ERRORS=Off - BLT_CXX_STD=c++17 + BLT_CXX_STD=c++20 CMAKE_BUILD_TYPE=Release PERFSUITE_RUN_SHORT_TEST=On ${{ matrix.shared.args }} diff --git a/CMakeLists.txt b/CMakeLists.txt index aa09cfc50..0cdbf877b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,17 +20,17 @@ set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) include(CheckCXXCompilerFlag) if(NOT DEFINED BLT_CXX_STD) - if("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") + if("cxx_std_23" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++23 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") - elseif("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") + elseif("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") elseif("${CMAKE_CXX_COMPILER_ID}" IN_LIST COMPILERS_KNOWN_TO_CMAKE33) - set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") else() #cmake has no idea what to do, do it ourselves... 
- set(flag_var "c++17") + set(flag_var "c++20") CHECK_CXX_COMPILER_FLAG("-std=${flag_var}" COMPILER_SUPPORTS_${flag_var}) if(COMPILER_SUPPORTS_${flag_var}) set(BLT_CXX_STD ${flag_var} CACHE STRING "Version of C++ standard") @@ -41,8 +41,9 @@ if(NOT DEFINED BLT_CXX_STD) else() - #check BLT_CXX_STD is high enough by disallowing the only invalid option + #check BLT_CXX_STD is high enough by rejecting the known pre-c++20 values if(("${BLT_CXX_STD}" STREQUAL "c++98") OR ("${BLT_CXX_STD}" STREQUAL "c++11") OR - ("${BLT_CXX_STD}" STREQUAL "c++14")) - message(FATAL_ERROR "RAJA requires minimum C++ standard of c++17") + ("${BLT_CXX_STD}" STREQUAL "c++14") OR + ("${BLT_CXX_STD}" STREQUAL "c++17")) + message(FATAL_ERROR "RAJA requires minimum C++ standard of c++20") endif() endif(NOT DEFINED BLT_CXX_STD) @@ -247,7 +248,7 @@ set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE}) set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME}) if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD 20) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict --extended-lambda --expt-relaxed-constexpr") set(RAJAPERF_COMPILER "${CUDA_NVCC_EXECUTABLE}") diff --git a/Dockerfile b/Dockerfile index 86353abf0..e88a2a221 100644 --- a/Dockerfile +++ b/Dockerfile @@ -137,5 +137,5 @@ ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/intel/oneapi/setvars.sh 2>&1 > /dev/null && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=Off -DRAJA_ENABLE_SYCL=On -DBLT_CXX_STD=c++17 -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=Off -DRAJA_ENABLE_SYCL=On -DBLT_CXX_STD=c++20 -DRAJA_ENABLE_DESUL_ATOMICS=On .. 
&& \ make -j 16" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0f9a16593..35b102ec4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -9,7 +9,7 @@ jobs: # pool: # vmImage: 'windows-2019' # variables: -# CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' +# CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=20' # steps: # - checkout: self # clean: boolean diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh index 2482444ea..63e4c8813 100755 --- a/scripts/alcf-builds/sycl.sh +++ b/scripts/alcf-builds/sycl.sh @@ -24,7 +24,7 @@ cmake \ -DENABLE_TARGET_OPENMP=Off \ -DENABLE_ALL_WARNINGS=Off \ -DENABLE_SYCL=On \ - -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_STANDARD=20 \ -DCMAKE_LINKER=icpx \ "$@" \ .. diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 7470ffb5c..b00f25b2e 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -52,7 +52,7 @@ cmake \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_LINKER=clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -DENABLE_TESTS=On \ -DENABLE_EXAMPLES=On \ "$@" \ diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 73caddb1c..7d6dc6d7f 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -79,7 +79,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh index 409125ab7..038648a8c 100755 --- a/scripts/lc-builds/toss4_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -86,7 +86,7 @@ cmake \ -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=undefined 
-shared-libsan" \ -DCMAKE_HIP_FLAGS="-fsanitize=address -fsanitize=undefined -shared-libsan -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ -DCMAKE_EXE_LINKER_FLAGS="-L/opt/rocm-${COMP_HIP_VER}/lib/asan/ -L/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan -Wl,-rpath,/opt/rocm-${COMP_HIP_VER}/lib/asan/:/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan:/opt/rocm-${COMP_HIP_VER}/lib/llvm/lib/clang/${COMP_CLANG_MAJOR_VER}/lib/linux -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh index e3f746723..c506be0c5 100755 --- a/scripts/lc-builds/toss4_cce_hip.sh +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -53,7 +53,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES=${HIP_ARCH} \ -DGPU_TARGETS=${HIP_ARCH} \ -DAMDGPU_TARGETS=${HIP_ARCH} \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_clang-mpi_caliper.sh b/scripts/lc-builds/toss4_clang-mpi_caliper.sh index cc6522a37..5f840fa04 100755 --- a/scripts/lc-builds/toss4_clang-mpi_caliper.sh +++ b/scripts/lc-builds/toss4_clang-mpi_caliper.sh @@ -41,7 +41,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index d97228fdb..a21c01980 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -34,7 +34,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} 
\ diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index 47773adfd..73ae770b9 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -41,7 +41,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 197afac9b..5909722fe 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -104,7 +104,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh index a2442b4a2..f27a02f5a 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh @@ -111,7 +111,7 @@ cmake \ -DCMAKE_CXX_FLAGS="-fsanitize=address -shared-libsan" \ -DCMAKE_HIP_FLAGS="-fsanitize=address -shared-libsan -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ -DCMAKE_EXE_LINKER_FLAGS="-L/opt/rocm-${COMP_HIP_VER}/lib/asan/ -L/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan -Wl,-rpath,/opt/rocm-${COMP_HIP_VER}/lib/asan/:/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan:/opt/rocm-${COMP_HIP_VER}/lib/llvm/lib/clang/${COMP_CLANG_MAJOR_VER}/lib/linux -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh 
b/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh index 95fa3bb64..15b104dfc 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh @@ -108,7 +108,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_gcc-mpi_caliper.sh b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh index bf1f077ff..2b2ba87b8 100755 --- a/scripts/lc-builds/toss4_gcc-mpi_caliper.sh +++ b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh @@ -42,7 +42,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/gcc \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index d20f8fe6d..568e425ce 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -34,7 +34,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh index 31c33f325..0484031d8 100755 --- a/scripts/lc-builds/toss4_gcc_caliper.sh +++ b/scripts/lc-builds/toss4_gcc_caliper.sh @@ -41,7 +41,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git 
a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index 19b427627..327f6c08e 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -69,7 +69,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh index a0a90ce98..fcbd276e1 100755 --- a/scripts/lc-builds/toss4_icpc-classic.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -40,7 +40,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \ -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh index 07628ff2c..c5572e28e 100755 --- a/scripts/lc-builds/toss4_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -40,7 +40,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icpc \ -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index cd78adaa4..8b3c69fa9 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -42,7 +42,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=icpx \ -DCMAKE_C_COMPILER=icx \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ diff --git 
a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh index 3505c502f..4dd9ec443 100755 --- a/scripts/lc-builds/toss4_mvapich2_icpx.sh +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -50,7 +50,7 @@ cmake \ -DMPI_CXX_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx" \ -DCMAKE_CXX_COMPILER=icpx \ -DCMAKE_C_COMPILER=icx \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ diff --git a/scripts/ubuntu-builds/ubuntu_amdclang.sh b/scripts/ubuntu-builds/ubuntu_amdclang.sh index c82a45a39..4da4785b9 100755 --- a/scripts/ubuntu-builds/ubuntu_amdclang.sh +++ b/scripts/ubuntu-builds/ubuntu_amdclang.sh @@ -60,7 +60,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp index 2f163a61a..30dcc8192 100644 --- a/src/algorithm/ATOMIC.cpp +++ b/src/algorithm/ATOMIC.cpp @@ -78,5 +78,53 @@ void ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ATOMIC::setCountedAttributes() +{ + const size_t replication = getActualProblemSize(); + + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ATOMIC_DATA_SETUP(replication); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + 
RAJAPERF_COUNTERS_LOOP_BODY(ATOMIC_BODY(RAJAPERF_ATOMIC_ADD_COUNTING, i, ATOMIC_VALUE)); + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + ATOMIC_DATA_TEARDOWN(replication); + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp index 92348aea5..1800b5ce1 100644 --- a/src/algorithm/ATOMIC.hpp +++ b/src/algorithm/ATOMIC.hpp @@ -68,6 +68,7 @@ class ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 57ae5a6a6..969ae125e 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -146,5 +146,66 @@ void HISTOGRAM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(DataSpace::Host, m_counts_final); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +// Drives the HISTOGRAM loop through the RAJAPERF_COUNTERS_* macros, +// bracketed by setUp()/tearDown() with VariantID::Base_Seq and tune_idx 0. +// NOTE(review): presumably this derives the kernel's counted attributes; confirm in common/CountingMacros.hpp. +void HISTOGRAM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_SETUP_COUNTS; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_INIT_COUNTS; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + 
RAJAPERF_COUNTERS_LOOP_BODY(HISTOGRAM_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_FINALIZE_COUNTS; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_TEARDOWN_COUNTS; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 852b65fa6..7472e8eb2 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -90,6 +90,7 @@ class HISTOGRAM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index d5f28a5f3..8923bca6c 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -79,5 +79,47 @@ void MEMCPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MEMCPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MEMCPY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MEMCPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + 
} // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp index 90f506613..920788869 100644 --- a/src/algorithm/MEMCPY.hpp +++ b/src/algorithm/MEMCPY.hpp @@ -48,6 +48,7 @@ class MEMCPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 638abf20a..8ddccff64 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -78,5 +78,47 @@ void MEMSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MEMSET::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MEMSET_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MEMSET_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/MEMSET.hpp b/src/algorithm/MEMSET.hpp index 2719751bf..6c97ba489 100644 --- a/src/algorithm/MEMSET.hpp +++ b/src/algorithm/MEMSET.hpp @@ -48,6 +48,7 @@ class MEMSET : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID 
vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 8adb472a4..958f977b5 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -81,5 +81,55 @@ void REDUCE_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void REDUCE_SUM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type sum = m_sum_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(REDUCE_SUM_BODY); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_sum = sum; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index b0e504349..00260d734 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -52,6 +52,7 @@ class REDUCE_SUM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void 
runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index ba54a8ceb..f7d193614 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -83,5 +83,50 @@ void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void SCAN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + SCAN_PROLOGUE; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(SCAN_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index f4ad374ac..2a5574d65 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -56,6 +56,7 @@ class SCAN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index d476e1289..1af41e83e 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -104,5 +104,149 @@ void CONVECTION3DPA::tearDown(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void CONVECTION3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + CONVECTION3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx,x,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_1); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_2); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz,z,CPA_Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_4); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz,z,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) + { + 
RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_5); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_6); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_7); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx,x,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_8); + } + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index 688dd1649..4f147cf28 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -223,51 +223,51 @@ Index_type NE = m_NE; D[qx + CPA_Q1D * qy + CPA_Q1D * CPA_Q1D * qz + CPA_Q1D * CPA_Q1D * CPA_Q1D * d + CPA_VDIM * CPA_Q1D * CPA_Q1D * CPA_Q1D * e] #define CONVECTION3DPA_0_GPU \ - constexpr int max_D1D = CPA_D1D; \ - constexpr int max_Q1D = CPA_Q1D; \ - constexpr int max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; \ - RAJA_TEAM_SHARED double sm0[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm1[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm2[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm3[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm4[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm5[max_DQ*max_DQ*max_DQ]; \ - double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; \ - double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; \ - double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; \ - double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; \ - double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; \ - double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; \ - double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; \ - double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; + constexpr auto max_D1D = CPA_D1D; \ + constexpr auto max_Q1D = CPA_Q1D; \ + constexpr auto max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + RAJA_TEAM_SHARED Real_array3 sm2; \ + RAJA_TEAM_SHARED Real_array3 sm3; \ + RAJA_TEAM_SHARED Real_array3 sm4; \ + RAJA_TEAM_SHARED Real_array3 sm5; \ + Real_array3_ref u( sm0); \ + Real_array3_ref Bu(sm1); \ + Real_array3_ref Gu(sm2); \ + Real_array3_ref BBu(sm3); \ + Real_array3_ref GBu(sm4); \ + Real_array3_ref BGu(sm5); \ + Real_array3_ref GBBu(sm0); \ + Real_array3_ref BGBu(sm1); \ + Real_array3_ref BBGu(sm2); \ + Real_array3_ref DGu(sm3); \ + Real_array3_ref BDGu(sm4); \ + Real_array3_ref BBDGu(sm5); #define CONVECTION3DPA_0_CPU \ - constexpr int max_D1D = CPA_D1D; \ - constexpr int max_Q1D = CPA_Q1D; \ - constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; \ - double sm0[max_DQ*max_DQ*max_DQ]; \ - double sm1[max_DQ*max_DQ*max_DQ]; \ - double sm2[max_DQ*max_DQ*max_DQ]; \ - double sm3[max_DQ*max_DQ*max_DQ]; \ - double sm4[max_DQ*max_DQ*max_DQ]; \ - double sm5[max_DQ*max_DQ*max_DQ]; \ - double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; \ - double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; \ - double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; \ - double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; \ - double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; \ - double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; \ - double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; \ - double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; + constexpr auto max_D1D = CPA_D1D; \ + constexpr auto max_Q1D = CPA_Q1D; \ + constexpr auto max_DQ = (max_Q1D 
> max_D1D) ? max_Q1D : max_D1D; \ + Real_array3 sm0; \ + Real_array3 sm1; \ + Real_array3 sm2; \ + Real_array3 sm3; \ + Real_array3 sm4; \ + Real_array3 sm5; \ + Real_array3_ref u( sm0); \ + Real_array3_ref Bu(sm1); \ + Real_array3_ref Gu(sm2); \ + Real_array3_ref BBu(sm3); \ + Real_array3_ref GBu(sm4); \ + Real_array3_ref BGu(sm5); \ + Real_array3_ref GBBu(sm0); \ + Real_array3_ref BGBu(sm1); \ + Real_array3_ref BBGu(sm2); \ + Real_array3_ref DGu(sm3); \ + Real_array3_ref BDGu(sm4); \ + Real_array3_ref BBDGu(sm5); #define CONVECTION3DPA_1 \ u[dz][dy][dx] = CPA_X(dx,dy,dz,e); @@ -372,6 +372,7 @@ class CONVECTION3DPA : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index dcd7a29a5..5bb2aa6a1 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -113,5 +113,48 @@ void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_div, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DEL_DOT_VEC_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + DEL_DOT_VEC_2D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(DEL_DOT_VEC_2D_BODY_INDEX); + 
RAJAPERF_COUNTERS_LOOP_BODY(DEL_DOT_VEC_2D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 14bd78533..8e1e75738 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -112,6 +112,7 @@ class DEL_DOT_VEC_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 705622b30..a01caeb7f 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -103,5 +103,129 @@ void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DIFFUSION3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + DIFFUSION3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_1); + } + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, 
DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_2); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_4); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_5); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_6); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_7); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_8); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_9); + } + } + 
} + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index b03e90ea0..52fffdbba 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -254,58 +254,58 @@ const bool symmetric = true; (((q)<=(d)) ? -1.0 : 1.0) #define DIFFUSION3DPA_0_GPU \ - constexpr int MQ1 = DPA_Q1D; \ - constexpr int MD1 = DPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - RAJA_TEAM_SHARED double sm0[3][MDQ*MDQ*MDQ]; \ - RAJA_TEAM_SHARED double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + constexpr auto MQ1 = DPA_Q1D; \ + constexpr auto MD1 = DPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 sBG; \ + Real_array2_ref B(sBG); \ + Real_array2_ref G(sBG); \ + Real_array2_ref Bt(sBG); \ + Real_array2_ref Gt(sBG); \ + RAJA_TEAM_SHARED Real_array4<3, MDQ, MDQ, MDQ> sm0; \ + RAJA_TEAM_SHARED Real_array4<3, MDQ, MDQ, MDQ> sm1; \ + Real_array3_ref s_X(sm0[2]); \ + Real_array3_ref DDQ0(sm0[0]); \ + Real_array3_ref DDQ1(sm0[1]); \ + Real_array3_ref DQQ0(sm1[0]); \ + Real_array3_ref DQQ1(sm1[1]); \ + Real_array3_ref DQQ2(sm1[2]); \ + Real_array3_ref QQQ0(sm0[0]); \ + Real_array3_ref QQQ1(sm0[1]); \ + Real_array3_ref QQQ2(sm0[2]); \ + Real_array3_ref QQD0(sm1[0]); \ + Real_array3_ref QQD1(sm1[1]); \ + Real_array3_ref QQD2(sm1[2]); \ + Real_array3_ref QDD0(sm0[0]); \ + Real_array3_ref QDD1(sm0[1]); \ + Real_array3_ref QDD2(sm0[2]); #define DIFFUSION3DPA_0_CPU \ - constexpr int MQ1 = DPA_Q1D; \ - constexpr int MD1 = DPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - double sm0[3][MDQ*MDQ*MDQ]; \ - double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double 
(*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + constexpr auto MQ1 = DPA_Q1D; \ + constexpr auto MD1 = DPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_array2 sBG; \ + Real_array2_ref B(sBG); \ + Real_array2_ref G(sBG); \ + Real_array2_ref Bt(sBG); \ + Real_array2_ref Gt(sBG); \ + Real_array4<3, MDQ, MDQ, MDQ> sm0; \ + Real_array4<3, MDQ, MDQ, MDQ> sm1; \ + Real_array3_ref s_X(sm0[2]); \ + Real_array3_ref DDQ0(sm0[0]); \ + Real_array3_ref DDQ1(sm0[1]); \ + Real_array3_ref DQQ0(sm1[0]); \ + Real_array3_ref DQQ1(sm1[1]); \ + Real_array3_ref DQQ2(sm1[2]); \ + Real_array3_ref QQQ0(sm0[0]); \ + Real_array3_ref QQQ1(sm0[1]); \ + Real_array3_ref QQQ2(sm0[2]); \ + Real_array3_ref QQD0(sm1[0]); \ + Real_array3_ref QQD1(sm1[1]); \ + Real_array3_ref QQD2(sm1[2]); \ + Real_array3_ref QDD0(sm0[0]); \ + Real_array3_ref QDD1(sm0[1]); \ + Real_array3_ref QDD2(sm0[2]); #define DIFFUSION3DPA_1 \ s_X[dz][dy][dx] = DPA_X(dx,dy,dz,e); @@ -461,6 +461,7 @@ class DIFFUSION3DPA : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index cdcc75a7f..8a3865cdc 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -133,5 +133,67 @@ void ENERGY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vnewc, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ENERGY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ENERGY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY2); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY4); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY5); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 079ff07d2..0cdc06158 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -197,6 +197,7 @@ class ENERGY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index e250cb91e..99ecb6e11 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -88,5 +88,52 @@ void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_out, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types 
(Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIR::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIR_COEFF; + + FIR_DATA_SETUP; + + Real_type coeff[FIR_COEFFLEN]; + std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff)); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIR_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 11e5c8e2c..fbb4eec1d 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -72,6 +72,7 @@ class FIR : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 9cc621909..443efbf3e 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -106,5 +106,50 @@ void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_psidat, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void LTIMES_NOVIEW::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + 
RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + LTIMES_NOVIEW_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type z = 0; z < num_z; ++z )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type g = 0; g < num_g; ++g )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type m = 0; m < num_m; ++m )) { + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type d = 0; d < num_d; ++d )) { + RAJAPERF_COUNTERS_LOOP_BODY(LTIMES_NOVIEW_BODY); + } + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 8c0d4652b..811295c9e 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -60,6 +60,7 @@ class LTIMES_NOVIEW : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index 322ed9163..fb800bb91 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -93,5 +93,73 @@ void MASS3DEA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_M, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MASS3DEA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + MASS3DEA_DATA_SETUP; + ); + + 
RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, x, MEA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, y, MEA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_1); + } + } + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_2_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k1, x, MEA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k2, y, MEA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k3, z, MEA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i1, x, MEA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i2, y, MEA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i3, z, MEA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_4); + } + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index 1b3ca0074..465f8bf88 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -97,17 +97,17 @@ D[qx + MEA_Q1D * qy + MEA_Q1D * MEA_Q1D * qz + \ MEA_Q1D * MEA_Q1D * MEA_Q1D * e] -#define MASS3DEA_0 RAJA_TEAM_SHARED double s_B[MEA_Q1D][MEA_D1D]; +#define MASS3DEA_0 RAJA_TEAM_SHARED Real_array2 s_B; -#define MASS3DEA_0_CPU double s_B[MEA_Q1D][MEA_D1D]; +#define MASS3DEA_0_CPU Real_array2 s_B; #define MASS3DEA_1 s_B[q][d] = MEA_B(q, d); #define MASS3DEA_2 \ - RAJA_TEAM_SHARED double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; + RAJA_TEAM_SHARED Real_array3 s_D; #define MASS3DEA_2_CPU \ - double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; + Real_array3 s_D; #define MASS3DEA_3 s_D[k1][k2][k3] = MEA_D(k1, k2, k3, e); @@ -146,6 +146,7 @@ class MASS3DEA : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void 
updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 2c8ef5c39..22c5e3d42 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -99,5 +99,112 @@ void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MASS3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + MASS3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_1); + } + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_2); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_3); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, MPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_4); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, MPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, 
MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_5); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_6); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, MPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_7); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_8); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_9); + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 5fe683b5c..f6b6332b6 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -181,36 +181,36 @@ Index_type NE = m_NE; D[qx + MPA_Q1D * qy + MPA_Q1D * MPA_Q1D * qz + MPA_Q1D * MPA_Q1D * MPA_Q1D * e] #define MASS3DPA_0_CPU \ - constexpr int MQ1 = MPA_Q1D; \ - constexpr int MD1 = MPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - double sm0[MDQ * MDQ * MDQ]; \ - double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + constexpr auto MQ1 = MPA_Q1D; \ + constexpr auto MD1 = MPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_array2 sDQ; \ + Real_array2_ref Bsmem(sDQ); \ + Real_array2_ref Btsmem(sDQ); \ + Real_array3 sm0; \ + Real_array3 sm1; \ + Real_array3_ref Xsmem(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASS3DPA_0_GPU \ - constexpr int MQ1 = MPA_Q1D; \ - constexpr int MD1 = MPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - RAJA_TEAM_SHARED double sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + constexpr auto MQ1 = MPA_Q1D; \ + constexpr auto MD1 = MPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 sDQ; \ + Real_array2_ref Bsmem(sDQ); \ + Real_array2_ref Btsmem(sDQ); \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + Real_array3_ref Xsmem(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASS3DPA_1 \ RAJAPERF_UNROLL(MD1) \ @@ -357,6 +357,7 @@ class MASS3DPA : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp index 081a648af..7179dd0aa 100644 --- a/src/apps/MATVEC_3D_STENCIL.cpp +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -182,5 +182,48 @@ void MATVEC_3D_STENCIL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MATVEC_3D_STENCIL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_STENCIL_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(MATVEC_3D_STENCIL_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(MATVEC_3D_STENCIL_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } 
// end namespace apps } // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL.hpp b/src/apps/MATVEC_3D_STENCIL.hpp index e65ea1dad..9c73bb19c 100644 --- a/src/apps/MATVEC_3D_STENCIL.hpp +++ b/src/apps/MATVEC_3D_STENCIL.hpp @@ -131,6 +131,7 @@ class MATVEC_3D_STENCIL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 3ffcce23d..544cde71a 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -102,5 +102,48 @@ void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void NODAL_ACCUMULATION_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(NODAL_ACCUMULATION_3D_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(NODAL_ACCUMULATION_3D_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp 
b/src/apps/NODAL_ACCUMULATION_3D.hpp index c2ad0a1dc..c41abadd1 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -79,6 +79,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index a51f43729..f16cb9bf7 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -96,5 +96,51 @@ void PRESSURE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vnewc, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PRESSURE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PRESSURE_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PRESSURE_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PRESSURE_OPT_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index fb51c7e90..4929bf427 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -44,6 +44,13 @@ if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 
; \ if ( p_new[i] < pmin ) p_new[i] = pmin ; +#define PRESSURE_OPT_BODY2 \ + Real_type p = bvc[i] * e_old[i] ; \ + if ( fabs(p) < p_cut ) p = 0.0 ; \ + if ( vnewc[i] >= eosvmax ) p = 0.0 ; \ + if ( p < pmin ) p = pmin ; \ + p_new[i] = p; + #include "common/KernelBase.hpp" @@ -65,6 +72,7 @@ class PRESSURE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 11ef9030c..6f68bd530 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -108,5 +108,47 @@ void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vol, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void VOL3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + VOL3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin ; i < iend ; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(VOL3D_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 24715cbee..0ff178f19 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -136,6 +136,59 @@ \ vol[i] *= vnormq ; +#define VOL3D_OPT_BODY \ + Real_type x71 = x7[i] - x1[i] ; \ + 
Real_type x72 = x7[i] - x2[i] ; \ + Real_type x74 = x7[i] - x4[i] ; \ + Real_type x30 = x3[i] - x0[i] ; \ + Real_type x50 = x5[i] - x0[i] ; \ + Real_type x60 = x6[i] - x0[i] ; \ + \ + Real_type y71 = y7[i] - y1[i] ; \ + Real_type y72 = y7[i] - y2[i] ; \ + Real_type y74 = y7[i] - y4[i] ; \ + Real_type y30 = y3[i] - y0[i] ; \ + Real_type y50 = y5[i] - y0[i] ; \ + Real_type y60 = y6[i] - y0[i] ; \ + \ + Real_type z71 = z7[i] - z1[i] ; \ + Real_type z72 = z7[i] - z2[i] ; \ + Real_type z74 = z7[i] - z4[i] ; \ + Real_type z30 = z3[i] - z0[i] ; \ + Real_type z50 = z5[i] - z0[i] ; \ + Real_type z60 = z6[i] - z0[i] ; \ + \ + Real_type xps = x71 + x60 ; \ + Real_type yps = y71 + y60 ; \ + Real_type zps = z71 + z60 ; \ + \ + Real_type cyz = y72 * z30 - z72 * y30 ; \ + Real_type czx = z72 * x30 - x72 * z30 ; \ + Real_type cxy = x72 * y30 - y72 * x30 ; \ + Real_type v = xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x72 + x50 ; \ + yps = y72 + y50 ; \ + zps = z72 + z50 ; \ + \ + cyz = y74 * z60 - z74 * y60 ; \ + czx = z74 * x60 - x74 * z60 ; \ + cxy = x74 * y60 - y74 * x60 ; \ + v += xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x74 + x30 ; \ + yps = y74 + y30 ; \ + zps = z74 + z30 ; \ + \ + cyz = y71 * z50 - z71 * y50 ; \ + czx = z71 * x50 - x71 * z50 ; \ + cxy = x71 * y50 - y71 * x50 ; \ + v += xps * cyz + yps * czx + zps * cxy ; \ + \ + v *= vnormq ; \ + \ + vol[i] = v ; + #include "common/KernelBase.hpp" @@ -158,6 +211,7 @@ class VOL3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 775e4dc18..eef222050 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -104,5 +104,48 @@ void 
ZONAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ZONAL_ACCUMULATION_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + ZONAL_ACCUMULATION_3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(ZONAL_ACCUMULATION_3D_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(ZONAL_ACCUMULATION_3D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 758682764..34cded75c 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -75,6 +75,7 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 9b7577d05..b3518435d 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -85,5 +85,47 @@ void ARRAY_OF_PTRS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions 
past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ARRAY_OF_PTRS::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ARRAY_OF_PTRS_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ARRAY_OF_PTRS_BODY(x)); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index bbebbf25e..6d5f4c76c 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -64,6 +64,7 @@ class ARRAY_OF_PTRS : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index 5ce1685cd..0a0e57e86 100644 --- a/src/basic/COPY8.cpp +++ b/src/basic/COPY8.cpp @@ -117,5 +117,47 @@ void COPY8::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y7, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void COPY8::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, 
tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + COPY8_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(COPY8_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index 1afa1bcb9..de7a4a007 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -73,6 +73,7 @@ class COPY8 : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 2cdd97d99..82fcb0eae 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -85,5 +85,47 @@ void DAXPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DAXPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DAXPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 
tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 43c2f6a90..fead77739 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -46,6 +46,7 @@ class DAXPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index 87fa48d49..7785bb3c1 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -83,5 +83,47 @@ void DAXPY_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DAXPY_ATOMIC::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(RAJAPERF_ATOMIC_ADD_COUNTING(y[i], a * x[i]);); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index 4b8d91dcf..89b150ad2 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -46,6 +46,7 @@ class 
DAXPY_ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/EMPTY.cpp b/src/basic/EMPTY.cpp index f7532de8b..163faccc8 100644 --- a/src/basic/EMPTY.cpp +++ b/src/basic/EMPTY.cpp @@ -73,5 +73,47 @@ void EMPTY::tearDown(VariantID RAJAPERF_UNUSED_ARG(vid), size_t RAJAPERF_UNUSED_ { } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void EMPTY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + EMPTY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(EMPTY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/EMPTY.hpp b/src/basic/EMPTY.hpp index 5e7a1b156..61e4cd6b0 100644 --- a/src/basic/EMPTY.hpp +++ b/src/basic/EMPTY.hpp @@ -50,6 +50,7 @@ class EMPTY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index f3e264a02..ce66d528a 100644 
--- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -97,5 +97,47 @@ void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void IF_QUAD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + IF_QUAD_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(IF_QUAD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index 151b4ad8c..58dcb3b1d 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -63,6 +63,7 @@ class IF_QUAD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index d4ab1d659..7cb0c9f4d 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -83,5 +83,57 @@ void INDEXLIST::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_list, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include 
"common/CountingMacros.hpp" + +void INDEXLIST::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type count = 0; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INDEXLIST_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_len = count; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 7ec1414bf..774bd85b7 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -54,6 +54,7 @@ class INDEXLIST : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 733397f2e..08a715287 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -91,5 +91,77 @@ void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id deallocData(m_list, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INDEXLIST_3LOOP::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + 
RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + INDEXLIST_3LOOP_COUNTS_SETUP(DataSpace::Host); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type count = 0; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend+1; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY( + Index_type inc = counts[i]; + counts[i] = count; + count += inc; + ); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INDEXLIST_3LOOP_MAKE_LIST); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_len = counts[iend]; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + INDEXLIST_3LOOP_COUNTS_TEARDOWN(DataSpace::Host); + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index 5a9e1e7ab..f5e9e3661 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -72,6 +72,7 @@ class INDEXLIST_3LOOP : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index f18cfcbba..293d71ac8 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -92,5 +92,47 @@ void 
INIT3::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_in2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT3::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INIT3_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT3_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 872edc6c0..b6e966233 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -28,6 +28,12 @@ #define INIT3_BODY \ out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; +#define INIT3_OPT_BODY \ + Real_type tmp = - in1[i] - in2[i]; \ + out1[i] = tmp ; \ + out2[i] = tmp ; \ + out3[i] = tmp ; + #include "common/KernelBase.hpp" @@ -49,6 +55,7 @@ class INIT3 : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 3790de2b8..f458297c1 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -84,5 +84,47 @@ void INIT_VIEW1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_a, vid); } + +// Only 
define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT_VIEW1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INIT_VIEW1D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT_VIEW1D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 05daf479b..06a6c9c9e 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -60,6 +60,7 @@ class INIT_VIEW1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 3e028af6c..66a32aac9 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -84,5 +84,47 @@ void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune deallocData(m_a, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT_VIEW1D_OFFSET::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 
0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT_VIEW1D_OFFSET_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 01a6712d9..cf422cd6c 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -59,6 +59,7 @@ class INIT_VIEW1D_OFFSET : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index bad2c1cd0..297f81a35 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -39,7 +39,9 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, __syncthreads(); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } __syncthreads(); } @@ -128,7 +130,11 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + }; { Index_type tx = threadIdx.x; @@ -241,7 +247,9 @@ void 
MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index f2c29f04e..0d54e8c35 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -39,7 +39,9 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, __syncthreads(); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } __syncthreads(); } @@ -128,7 +130,11 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + }; { Index_type tx = threadIdx.x; @@ -240,7 +246,9 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index b4563cc52..31762b191 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -61,7 +61,9 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } } } @@ -118,7 +120,11 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } 
auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -215,7 +221,9 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index 1dabaef8f..8e03155cf 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -52,7 +52,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } } @@ -112,7 +114,11 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -125,7 +131,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(TL_SZ) }; + auto inner_x_4 = [&](Index_type tx) { + MAT_MAT_SHARED_BODY_4(TL_SZ) + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_4(tx); @@ -210,7 +218,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun [&](Index_type ty) { RAJA::loop(ctx, 
RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp index c4c15ae40..2e6a53203 100644 --- a/src/basic/MAT_MAT_SHARED-Sycl.cpp +++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp @@ -49,9 +49,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) qu->submit([&](::sycl::handler& h) { - ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); - ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); - ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); h.parallel_for (::sycl::nd_range<3>(gridSize, workGroupSize), @@ -70,7 +70,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) itm.barrier(::sycl::access::fence_space::local_space); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } itm.barrier(::sycl::access::fence_space::local_space); } @@ -90,7 +92,7 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) constexpr bool async = true; const int local_mats = 3; - constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(double); + constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(Real_type); using launch_policy = RAJA::LaunchPolicy>; @@ -118,12 +120,12 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) //We only support dynamic shared memory in Sycl //Thus requiring a different setup than other backends //which use static shared memory - double * As_ptr = ctx.getSharedMemory(tile_size * tile_size); - double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); - double * Cs_ptr = 
ctx.getSharedMemory(tile_size * tile_size); - double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; - double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; - double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; + Real_type * As_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type (*As)[tile_size] = (Real_type (*)[tile_size]) As_ptr; + Real_type (*Bs)[tile_size] = (Real_type (*)[tile_size]) Bs_ptr; + Real_type (*Cs)[tile_size] = (Real_type (*)[tile_size]) Cs_ptr; RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { @@ -154,7 +156,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 4c93b90eb..8574168ec 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -87,5 +87,88 @@ void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_C, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MAT_MAT_SHARED::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type N = m_N; + + MAT_MAT_SHARED_DATA_SETUP; + const Index_type Nx = RAJA_DIVIDE_CEILING_INT(N, TL_SZ); + const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, TL_SZ); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for 
(Index_type by = 0; by < Ny; ++by)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type bx = 0; bx < Nx; ++bx)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + //Work around for when compiling with CLANG and HIP + //See notes in MAT_MAT_SHARED.hpp + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(TL_SZ)); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_1(TL_SZ)); + } + } + + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k)) { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_2(TL_SZ)); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type n = 0; n < TL_SZ; ++n)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_3(TL_SZ)); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + } // Sequential loop + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_4(TL_SZ)); + } + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index f77408006..768960603 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -85,14 +85,14 @@ constexpr rajaperf::Index_type TL_SZ = 16; so it doesn't see these kind of problems. 
*/ #define MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(tile_size) \ - Real_type As[tile_size][tile_size]; \ - Real_type Bs[tile_size][tile_size]; \ - Real_type Cs[tile_size][tile_size]; + Real_array2 As; \ + Real_array2 Bs; \ + Real_array2 Cs; #define MAT_MAT_SHARED_BODY_0(tile_size) \ - RAJA_TEAM_SHARED Real_type As[tile_size][tile_size]; \ - RAJA_TEAM_SHARED Real_type Bs[tile_size][tile_size]; \ - RAJA_TEAM_SHARED Real_type Cs[tile_size][tile_size]; + RAJA_TEAM_SHARED Real_array2 As; \ + RAJA_TEAM_SHARED Real_array2 Bs; \ + RAJA_TEAM_SHARED Real_array2 Cs; #define MAT_MAT_SHARED_BODY_1(tile_size) \ Cs[ty][tx] = 0; @@ -110,7 +110,6 @@ constexpr rajaperf::Index_type TL_SZ = 16; Bs[ty][tx] = 0.0; #define MAT_MAT_SHARED_BODY_3(tile_size) \ - for (Index_type n = 0; n < tile_size; ++n) \ Cs[ty][tx] += As[ty][n] * Bs[n][tx]; #define MAT_MAT_SHARED_BODY_4(tile_size) \ @@ -133,6 +132,7 @@ class MAT_MAT_SHARED : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index aaed61b01..323d842e8 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -92,5 +92,47 @@ void MULADDSUB::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_in2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MULADDSUB::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = 
getActualProblemSize(); + + MULADDSUB_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MULADDSUB_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 260d07212..8c951280f 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -52,6 +52,7 @@ class MULADDSUB : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index bc3114929..f14ef648b 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -149,5 +149,65 @@ void MULTI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(DataSpace::Host, m_values_final); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MULTI_REDUCE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_SETUP_VALUES; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_INIT_VALUES; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < 
iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MULTI_REDUCE_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_FINALIZE_VALUES; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_TEARDOWN_VALUES; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 1b6c8e4fc..87f7fe031 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -89,6 +89,7 @@ class MULTI_REDUCE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index f0d98b337..f9ae4c717 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -93,5 +93,48 @@ void NESTED_INIT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_array, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void NESTED_INIT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + NESTED_INIT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 0; k < nk; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; ++j )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; ++i )) { + 
RAJAPERF_COUNTERS_LOOP_BODY(NESTED_INIT_BODY); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 40208565d..655bacf52 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -52,6 +52,7 @@ class NESTED_INIT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 8f9bb64f0..f50a7eaa9 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -81,5 +81,60 @@ void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PI_ATOMIC::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PI_ATOMIC_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + *pi = m_pi_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_type x = (Real_type(i) + 0.5) * dx; + RAJAPERF_ATOMIC_ADD_COUNTING(*pi, dx / (1.0 + x * x)); + ); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_pi_final = *pi * 4.0; + ); + + } + + } + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 3c8433b0d..ec4c11f0b 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -59,6 +59,7 @@ class PI_ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 1ad505ad0..20c81c8a6 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -81,5 +81,57 @@ void PI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PI_REDUCE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PI_REDUCE_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type pi = m_pi_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PI_REDUCE_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_pi = 4.0 * pi; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git 
a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index d585333c6..fae40b6fa 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -50,6 +50,7 @@ class PI_REDUCE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index e3ca630d4..8cadd2fbd 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -96,5 +96,61 @@ void REDUCE3_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vec, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void REDUCE3_INT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE3_INT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(REDUCE3_INT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_vsum = vsum; + m_vmin = vmin; + m_vmax = vmax; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 
ce8323a89..3ca502749 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -64,6 +64,7 @@ class REDUCE3_INT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 2608191dc..307763499 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -102,5 +102,64 @@ void REDUCE_STRUCT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void REDUCE_STRUCT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(REDUCE_STRUCT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + points.SetCenter(xsum/(points.N), ysum/(points.N)); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points = points; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + 
); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index d1d289f91..63db1d62d 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -80,6 +80,7 @@ class REDUCE_STRUCT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/TRAP_INT-func.hpp b/src/basic/TRAP_INT-func.hpp index 4d62fa1c0..ac1f8b49e 100644 --- a/src/basic/TRAP_INT-func.hpp +++ b/src/basic/TRAP_INT-func.hpp @@ -28,6 +28,20 @@ Real_type trap_int_func(Real_type x, denom = 1.0/sqrt(denom); return denom; } +/// +RAJA_INLINE +RAJA_HOST_DEVICE +Real_type trap_int_opt_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type xmxp = x - xp; + Real_type ymyp = y - yp; + Real_type denom = xmxp*xmxp + ymyp*ymyp; + denom = 1.0/sqrt(denom); + return denom; +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index a5a8fbfee..97ed1bfa3 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -92,5 +92,69 @@ void TRAP_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + + +} // end namespace basic +} // end namespace rajaperf + +// This shouldn't result in ODR violations as the argument types have changed +#include "TRAP_INT-func.hpp" + +namespace rajaperf +{ +namespace basic +{ + +void TRAP_INT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + 
RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + TRAP_INT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type sumx = m_sumx_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRAP_INT_OPT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_sumx += sumx * h; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 27a606695..ee191cccc 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -40,6 +40,10 @@ Real_type x = x0 + i*h; \ sumx += trap_int_func(x, y, xp, yp); +#define TRAP_INT_OPT_BODY \ + Real_type x = x0 + i*h; \ + sumx += trap_int_opt_func(x, y, xp, yp); + #include "common/KernelBase.hpp" @@ -61,6 +65,7 @@ class TRAP_INT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp index 2f883926e..e970758ee 100644 --- a/src/comm/HALO_PACKING.cpp +++ b/src/comm/HALO_PACKING.cpp @@ -109,5 +109,99 @@ void HALO_PACKING::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HALO_PACKING::setCountedAttributes() +{ + VariantID vid = 
VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr var = vars[v]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_PACK_BODY); + } + RAJAPERF_COUNTERS_LOOP_BODY( + buffer += len; + ); + } + + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr send_buffer = send_buffers[l]; Real_ptr pack_buffer = pack_buffers[l]; /* buffer was advanced past the packed data above, so copy from the start of the pack buffer */ + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(send_buffer[i] = pack_buffer[i]); + } + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + } + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr recv_buffer = recv_buffers[l]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(buffer[i] = recv_buffer[i]); + } + } + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr var = vars[v]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_UNPACK_BODY); + } + RAJAPERF_COUNTERS_LOOP_BODY( + buffer += len; + ); + } + } + RAJAPERF_COUNTERS_PAR_SYNC(); + +
} + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp index ab458280f..05cdd50ec 100644 --- a/src/comm/HALO_PACKING.hpp +++ b/src/comm/HALO_PACKING.hpp @@ -80,6 +80,7 @@ class HALO_PACKING : public HALO_base void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp index 6ffc5ac90..7a9607548 100644 --- a/src/comm/HALO_PACKING_FUSED.cpp +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -109,5 +109,137 @@ void HALO_PACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HALO_PACKING_FUSED::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_FUSED_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type pack_index = 0; + ); + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + 
Real_ptr var = vars[v]; + pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + ); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < pack_index; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY( + ptr_holder pack_ptrs = pack_ptr_holders[j]; + Real_ptr buffer = pack_ptrs.buffer; + Int_ptr list = pack_ptrs.list; + Real_ptr var = pack_ptrs.var; + Index_type len = pack_lens[j]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_PACK_BODY); + } + } + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Index_type len = pack_index_list_lengths[l]; + Real_ptr send_buffer = send_buffers[l]; + Real_ptr buffer = pack_buffers[l]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(send_buffer[i] = buffer[i]); + } + } + } + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type unpack_index = 0; + ); + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr recv_buffer = recv_buffers[l]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(buffer[i] = recv_buffer[i]); + } + } + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr var = vars[v]; + unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + ); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < 
unpack_index; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY( + ptr_holder unpack_ptrs = unpack_ptr_holders[j]; + Real_ptr buffer = unpack_ptrs.buffer; + Int_ptr list = unpack_ptrs.list; + Real_ptr var = unpack_ptrs.var; + Index_type len = unpack_lens[j]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_UNPACK_BODY); + } + } + RAJAPERF_COUNTERS_PAR_SYNC(); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp index 9d8c04994..7360f6f75 100644 --- a/src/comm/HALO_PACKING_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -58,9 +58,9 @@ Real_ptr_ptr recv_buffers = m_recv_buffers; #define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP \ - ptr_holder* pack_ptr_holders = nullptr; \ + RAJAPERF_WRAPPER(ptr_holder*) pack_ptr_holders = nullptr; \ Index_ptr pack_lens = nullptr; \ - ptr_holder* unpack_ptr_holders = nullptr; \ + RAJAPERF_WRAPPER(ptr_holder*) unpack_ptr_holders = nullptr; \ Index_ptr unpack_lens = nullptr; \ allocData(DataSpace::Host, pack_ptr_holders, num_neighbors * num_vars); \ allocData(DataSpace::Host, pack_lens, num_neighbors * num_vars); \ @@ -130,6 +130,7 @@ class HALO_PACKING_FUSED : public HALO_base void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp index 1e0567e8b..c3b70cd3f 100644 --- a/src/comm/HALO_base.hpp +++ b/src/comm/HALO_base.hpp @@ -126,7 +126,7 @@ class HALO_base : public KernelBase Index_type k_max; }; - static const int s_num_neighbors = 26; 
+ static inline constexpr int s_num_neighbors = 26; static const int s_boundary_offsets[s_num_neighbors][3]; static Index_type s_grid_dims_default[3]; diff --git a/src/common/CountingData.hpp b/src/common/CountingData.hpp new file mode 100644 index 000000000..27fff3f7b --- /dev/null +++ b/src/common/CountingData.hpp @@ -0,0 +1,1294 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingData_HPP +#define RAJAPerf_CountingData_HPP + +#include "common/RAJAPerfSuite.hpp" +#include "common/RPTypes.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace rajaperf +{ + +namespace counting +{ + +enum struct OpType : int +{ + fp64, + int32, + int64, + ptr, + other, + NumOpTypes // must be at the end of the valid values +}; + +template < typename T > +constexpr OpType getOpType() +{ + using decayed_T = std::decay_t; + if constexpr (std::is_floating_point_v && sizeof(decayed_T) == sizeof(double)) { + return OpType::fp64; + } else if constexpr (std::is_integral_v && sizeof(decayed_T) == sizeof(std::int32_t)) { + return OpType::int32; + } else if constexpr (std::is_integral_v && sizeof(decayed_T) == sizeof(std::int64_t)) { + return OpType::int64; + } else if constexpr (std::is_pointer_v) { + return OpType::ptr; + } else { + return OpType::other; + } +} + +constexpr const char* getOpTypeName(OpType ot) +{ + switch (ot) { + case OpType::int32: return "int32"; + case OpType::int64: return "int64"; + case OpType::ptr: return "ptr"; + case OpType::fp64: return "fp64"; + case OpType::other: return "other"; + 
default: throw std::invalid_argument("ot is not in OpType"); + } +} + +template < typename T > +const char* get_type_name() +{ + OpType ot = getOpType(); + if (ot != OpType::other) { + return getOpTypeName(ot); + } else { + return typeid(T).name(); + } +} + +enum struct Operation : int +{ + copy, + assign, + load, + store, + uplus, + uminus, + abs, + add, + sub, + mult, + div, + rem, + preinc, + predec, + postinc, + postdec, + atomic_add, + sqrt, + exp, + bit_not, + bit_and, + bit_or, + bit_xor, + bit_lsh, + bit_rsh, + eq, + ne, + lt, + le, + gt, + ge, + NumOperations, // must be at the end of the valid values + FLOP_begin = add, // used when counting what counts as a flop + FLOP_end = eq // used when counting what counts as a flop +}; + +constexpr const char* getOperationName(Operation op) +{ + switch (op) { + case Operation::copy: return "copy"; + case Operation::assign: return "assign"; + case Operation::load: return "load"; + case Operation::store: return "store"; + case Operation::uplus: return "uplus"; + case Operation::uminus: return "uminus"; + case Operation::abs: return "abs"; + case Operation::add: return "add"; + case Operation::sub: return "sub"; + case Operation::mult: return "mult"; + case Operation::div: return "div"; + case Operation::rem: return "rem"; + case Operation::preinc: return "preinc"; + case Operation::predec: return "predec"; + case Operation::postinc: return "postinc"; + case Operation::postdec: return "postdec"; + case Operation::atomic_add: return "atomic_add"; + case Operation::sqrt: return "sqrt"; + case Operation::exp: return "exp"; + case Operation::bit_not: return "bit_not"; + case Operation::bit_and: return "bit_and"; + case Operation::bit_or: return "bit_or"; + case Operation::bit_xor: return "bit_xor"; + case Operation::bit_lsh: return "bit_lsh"; + case Operation::bit_rsh: return "bit_rsh"; + case Operation::eq: return "eq"; + case Operation::ne: return "ne"; + case Operation::lt: return "lt"; + case Operation::le: return 
"le"; + case Operation::gt: return "gt"; + case Operation::ge: return "ge"; + default: throw std::invalid_argument("op is not in Operation"); + } +} + +enum struct ContextType : int +{ + exterior, + outer, + repetition, + cond, + outer_loop, + seq_loop, + par_loop, + team, + body, + par_sync, + team_sync, + NumContextTypes // must be at the end of the valid values +}; + +constexpr const char* getContextTypeName(ContextType ct) +{ + switch (ct) { + case ContextType::exterior: return "exterior"; + case ContextType::outer: return "outer"; + case ContextType::repetition: return "repetition"; + case ContextType::cond: return "cond"; + case ContextType::outer_loop: return "outer_loop"; + case ContextType::seq_loop: return "seq_loop"; + case ContextType::par_loop: return "par_loop"; + case ContextType::team: return "team"; + case ContextType::body: return "body"; + case ContextType::par_sync: return "par_sync"; + case ContextType::team_sync: return "team_sync"; + default: throw std::invalid_argument("Unknown ContextType"); + } +} + +enum struct MemoryAccess : int +{ + read, + write, + atomicModifyWrite, + NumMemoryAccesses // must be at the end of the valid values +}; + +constexpr const char* getMemoryAccessName(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "read"; + case MemoryAccess::write: return "write"; + case MemoryAccess::atomicModifyWrite: return "atomicModifyWrite"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +constexpr const char* getMemoryAccessNamePastTense(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "read"; + case MemoryAccess::write: return "written"; + case MemoryAccess::atomicModifyWrite: return "atomicModifyWritten"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +constexpr const char* getMemoryAccessNamePastTenseTitle(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "Read"; + case MemoryAccess::write: return "Written"; + case 
MemoryAccess::atomicModifyWrite: return "AtomicModifyWritten"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +enum struct AllocationGroup : int +{ + global, + team, + NumAllocationGroups // must be at the end of the valid values +}; + +constexpr const char* getAllocationGroupName(AllocationGroup ma) +{ + switch (ma) { + case AllocationGroup::global: return "global"; + case AllocationGroup::team: return "team"; + default: throw std::invalid_argument("Unknown AllocationGroup"); + } +} + +// Must be in order innermost to outermost, so loop must be before rep, etc. +enum struct CountingPoint : int +{ + team, + loop, + rep, + NumCountingPoints // must be at the end of the valid values +}; + +constexpr const char* getCountingPointName(CountingPoint ma) +{ + switch (ma) { + case CountingPoint::team: return "team"; + case CountingPoint::loop: return "loop"; + case CountingPoint::rep: return "rep"; + default: throw std::invalid_argument("Unknown CountingPoint"); + } +} + + +constexpr std::string get_spacing(Size_type depth) +{ + return std::string(depth*2, ' '); +} + +struct MemoryCounts +{ + Size_type touched = 0; + Size_type accessed[Size_type(MemoryAccess::NumMemoryAccesses)] = {0}; + + void add(MemoryCounts const& other_counts, Size_type multiplier = 1) + { + touched += other_counts.touched * multiplier; + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + accessed[a] += other_counts.accessed[a] * multiplier; + } + } +}; + +struct AddressTouches +{ + std::vector address_accessed[Size_type(MemoryAccess::NumMemoryAccesses)]; + + AddressTouches() = default; + + explicit AddressTouches(Size_type size, bool value = false) + { + resize(size, value); + } + + void resize(Size_type size, bool value = false) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + address_accessed[a].resize(size, value); + } + } + + Size_type size() const + { + return address_accessed[0].size(); + } + + void 
set_all(Size_type size, bool value) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + for (Size_type i = 0; i < size; ++i) { + address_accessed[a][i] = value; + } + } + } + + void count(Size_type size, + MemoryCounts& address_counts) const + { + for (Size_type i = 0; i < size; ++i) { + bool addr_touched = false; + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + bool addr_accessed = address_accessed[a][i]; + addr_touched = addr_touched || addr_accessed; + address_counts.accessed[a] += addr_accessed ? 1 : 0; + } + address_counts.touched += addr_touched ? 1 : 0; + } + } + + void combine(Size_type size, + AddressTouches const& other_touches) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + for (Size_type i = 0; i < size; ++i) { + address_accessed[a][i] = other_touches.address_accessed[a][i] || address_accessed[a][i]; + } + } + } + + void clear() + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + address_accessed[a].clear(); + address_accessed[a].shrink_to_fit(); + } + } +}; + +struct TouchCounts +{ + Size_type m_size = 0; + MemoryCounts total_counts; + MemoryCounts address_counts[Size_type(CountingPoint::NumCountingPoints)]; + AddressTouches address_touches[Size_type(CountingPoint::NumCountingPoints)]; + + TouchCounts() = default; + + TouchCounts(CountingPoint point, Size_type size) + { + resize(point, size); + } + + void resize(CountingPoint point, Size_type size) + { + for (Size_type p = Size_type(point); + p < Size_type(CountingPoint::NumCountingPoints); ++p) { + address_touches[p].resize(size); + } + m_size = size; + } + + Size_type size() const + { + return m_size; + } + + void set_all_accesses(CountingPoint point, bool value) + { + address_touches[Size_type(point)].set_all(m_size, value); + } + + void touch(CountingPoint point, MemoryAccess access, Size_type offset, + Size_type num_ops) + { + if (point < 
CountingPoint::NumCountingPoints) { + total_counts.touched += num_ops; + total_counts.accessed[Size_type(access)] += num_ops; + address_touches[Size_type(point)].address_accessed[Size_type(access)].at(offset) = true; + } + } + + void count(CountingPoint point) + { + address_touches[Size_type(point)].count(m_size, address_counts[Size_type(point)]); + } + + void combine_accesses(CountingPoint point, + TouchCounts const& other_touches, + CountingPoint other_point) + { + address_touches[Size_type(point)].combine( + m_size, other_touches.address_touches[Size_type(other_point)]); + } + + void clear_accesses(CountingPoint point) + { + address_touches[Size_type(point)].clear(); + } +}; + + +struct AllocationMetadata +{ + Index_type idx = std::numeric_limits::min(); + const void* ptr_ptr = nullptr; + std::source_location allocate_location; + AllocationGroup group; + + void* ptr = nullptr; + + std::string pointed_to_type_name; + Size_type element_size = 0; + Size_type size = 0; + + TouchCounts counts; + + AllocationMetadata(Index_type idx_, const void* ptr_ptr_, + std::source_location location, AllocationGroup group_, + std::string pointed_to_type_name_, void* ptr_, + Size_type size_, Size_type element_size_) + : idx(idx_) + , ptr_ptr(ptr_ptr_) + , allocate_location(location) + , group(group_) + , ptr(ptr_) + , pointed_to_type_name(std::move(pointed_to_type_name_)) + , element_size(element_size_) + , size(size_) + , counts(CountingPoint(0), size_) + { + } + + void allocate(void* ptr_) + { + ptr = ptr_; + } + + void deallocate() + { + ptr = nullptr; + } + + void print_allocation(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + str << spacing << pointed_to_type_name << "* allocation_" << idx + << " = " << getAllocationGroupName(group) << "_malloc(" + << size << " * " << element_size << ");\n"; + } + + void print_deallocation(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + 
spacing += "// "; + str << spacing << getAllocationGroupName(group) << "_free(" + << "allocation_" << idx << ");\n"; + } +}; + +struct Context +{ + Index_type idx = -1; + Size_type hit_count = 0; + ContextType type = ContextType::NumContextTypes; + const char* text = nullptr; + CountingPoint point = CountingPoint::NumCountingPoints; + Index_type point_depth = 0; + + Context* parent = nullptr; + // children are stored in order of increasing idx + std::vector> children; + std::vector child_idcs; + + Size_type operation_counters[Size_type(OpType::NumOpTypes)][Size_type(Operation::NumOperations)] = {{0}}; + + std::vector aloc_counts; + + MemoryCounts aloc_total_bytes; + MemoryCounts aloc_totals_bytes[Size_type(CountingPoint::NumCountingPoints)]; + + std::vector allocation_indices; + std::vector deallocation_indices; + + static constexpr CountingPoint get_point(Context* parent, ContextType type) + { + CountingPoint point = CountingPoint::NumCountingPoints; + if (type == ContextType::repetition) { + point = CountingPoint::rep; + } else if (type == ContextType::par_loop) { + point = CountingPoint::loop; + } else if (type == ContextType::team) { + point = CountingPoint::team; + } + if (parent) { + point = std::min(parent->point, point); + } + return point; + } + + // depth of 0 indicates this does not have a valid point + // depth of 1 indicates this is the first context with this point + // depths greater than 1 are children of of a context of this point + static constexpr Index_type get_depth(Context* parent, CountingPoint point) + { + Index_type depth = 0; + if (parent) { + if (point != parent->point) { + depth = 1; + } else if (parent->point_depth > 0) { + depth = parent->point_depth + 1; + } + } + return depth; + } + + Context(Index_type idx_, Context* parent_, ContextType type_, const char* text_, + std::vector> const& allocations) + : idx(idx_) + , type(type_) + , text(text_) + , point(get_point(parent_, type_)) + , point_depth(get_depth(parent_, get_point(parent_, 
type_))) + , parent(parent_) + , aloc_counts(allocations.size()) + { + if (type == ContextType::par_sync) { + if (point != CountingPoint::rep) { + throw std::runtime_error("par_sync must be in a repetition context"); + } + } else if (type == ContextType::team_sync) { + if (point != CountingPoint::team) { + throw std::runtime_error("team_sync must be in a team context"); + } + } + for (Size_type i = 0; i < allocations.size(); ++i) { + auto const& item = allocations[i]; + aloc_counts[i].resize(point, item->size); + } + } + + void update_allocations(std::vector> const& allocations) + { + for (Size_type i = 0; i < allocations.size(); ++i) { + auto const& item = allocations[i]; + if (i < aloc_counts.size()) { + if (item->size != aloc_counts[i].size()) { + throw std::runtime_error("Allocation record changed since last update"); + } + } else { + aloc_counts.resize(i+1); + aloc_counts[i].resize(point, item->size); + } + } + + for (auto& child_ptr : children) { + child_ptr->update_allocations(allocations); + } + } + + void add_allocation(AllocationMetadata const& item) + { + auto iter = std::ranges::find(allocation_indices, item.idx); + if (iter == allocation_indices.end()) { + allocation_indices.emplace_back(item.idx); + } + } + + void remove_allocation(AllocationMetadata const& item) + { + auto iter = std::ranges::find(deallocation_indices, item.idx); + if (iter == deallocation_indices.end()) { + deallocation_indices.emplace_back(item.idx); + } + } + + template < typename... Args > + Context* get_or_emplace_child(Index_type idx, Args&&... 
args) + { + using std::distance; + auto idx_iter = std::ranges::lower_bound(child_idcs, idx, std::ranges::less{}); + Size_type offset = distance(child_idcs.begin(), idx_iter); + auto iter = children.begin() + offset; + if (idx_iter == child_idcs.end() || *idx_iter != idx) { + idx_iter = child_idcs.emplace(idx_iter, idx); + iter = children.emplace(iter, std::make_unique(idx, this, std::forward(args)...)); + } + return iter->get(); + } + + void count_totals(AllocationMetadata& item) + { + aloc_total_bytes.add(aloc_counts[item.idx].total_counts, item.element_size); + item.counts.total_counts.add(aloc_counts[item.idx].total_counts, item.element_size); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + aloc_totals_bytes[p].add(aloc_counts[item.idx].address_counts[p], item.element_size); + } + } + + void clear() + { + for (Size_type i = 0; i < aloc_counts.size(); ++i) { + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + aloc_counts[i].clear_accesses(CountingPoint(p)); + } + + } + } + + + void print_header(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + str << spacing << "Line " << idx << " hit " << hit_count << " times\n"; + } + + void print_allocations(std::ostream& str, Size_type depth, + std::vector> const& allocations) const + { + for (Index_type const& allocation_idx : allocation_indices) { + allocations[allocation_idx]->print_allocation(str, depth); + } + for (Index_type const& allocation_idx : deallocation_indices) { + allocations[allocation_idx]->print_deallocation(str, depth); + } + } + + void print_allocation_counts(std::ostream& str, Size_type depth, + std::string_view name, + MemoryCounts const& mem_counts) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + if (mem_counts.touched) { + str << spacing + << name + << " touched " + << mem_counts.touched << "\n"; + } + + for (Size_type a = 0; a < 
Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + + if (mem_counts.accessed[a]) { + str << spacing + << name + << " " << getMemoryAccessNamePastTense(MemoryAccess(a)) << " " + << mem_counts.accessed[a] << "\n"; + } + + } + } + + void print_counters(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + for (Size_type ot = 0; ot < Size_type(OpType::NumOpTypes); ++ot) { + + std::string opTypeName = getOpTypeName(OpType(ot)); + + for (Size_type op = 0; op < Size_type(Operation::NumOperations); ++op) { + + std::string opName = getOperationName(Operation(op)); + + Size_type num_ops = operation_counters[ot][op]; + + if (num_ops > 0) { + str << spacing << opTypeName << " " << opName << " " << num_ops << "\n"; + } + } + } + + print_allocation_counts(str, depth, "bytes", aloc_total_bytes); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + std::string name = std::format("by {} bytes", + getCountingPointName(CountingPoint(p))); + + print_allocation_counts(str, depth, name, aloc_totals_bytes[p]); + + } + + for (Size_type i = 0; i < aloc_counts.size(); ++i) { + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + std::string name = std::format("by {} allocation_{} elements", + getCountingPointName(CountingPoint(p)), i); + + print_allocation_counts(str, depth, + name, aloc_counts[i].address_counts[p]); + + } + + } + + } + + std::string replace_values(std::string str, + std::vector const& wrapper_formats) const + { + for (const char* wrapper_format : wrapper_formats) { + std::regex re(std::vformat(wrapper_format, std::make_format_args("(.*?)"))); + str = std::regex_replace(str, re, "$1"); + } + + return str; + } + + void print_text(std::ostream& str, Size_type depth, + std::vector const& wrapper_formats) const + { + if (text == nullptr) return; + + std::string spacing = get_spacing(depth); + + std::string new_text = replace_values(text, 
wrapper_formats); + + std::string_view tv = new_text; + + if (!tv.empty()) { + + Size_type pos = 0; + while (pos < tv.size()) { + + // skip spacing between lines and extra semicolons + if (std::isspace(tv[pos]) || + tv[pos] == ';') { + ++pos; + continue; + } + + Size_type end = tv.find(';', pos); + if (end < tv.size()) { + end += 1; + } else { + end = tv.size(); + } + + str << spacing << tv.substr(pos, end-pos) << "\n"; + + pos = end; + } + } + } + + void print(std::ostream& str, Size_type depth, std::string_view tv) const + { + std::string spacing = get_spacing(depth); + + str << spacing << tv << "\n"; + } +}; + +struct CountingData; + +struct ScopedContext +{ + Context* context; + CountingData* countingData; + + ScopedContext(CountingData* countingData_, Context* context_) + : context(context_) + , countingData(countingData_) + { + } + + ScopedContext() = delete; + ScopedContext(ScopedContext const&) = delete; + ScopedContext(ScopedContext &&) = delete; + ScopedContext& operator=(ScopedContext const&) = delete; + ScopedContext& operator=(ScopedContext &&) = delete; + + ~ScopedContext() + { + pop_context(); + } + + void release() + { + countingData = nullptr; + context = nullptr; + } + + inline void pop_context(); +}; + +struct CountingData +{ + static inline Context* current_context = nullptr; + static inline CountingData* current_data = nullptr; + + Size_type par_it_per_rep_counter = 0; + Size_type all_it_per_rep_counter = 0; + + Size_type max_par_loop_depth = 0; + Size_type max_all_loop_depth = 0; + + Size_type kernel_per_rep_counter = 0; + Size_type par_sync_per_rep_counter = 0; + Size_type team_sync_per_rep_counter = 0; + + + Size_type memory_allocations[Size_type(AllocationGroup::NumAllocationGroups)] = {0}; + Size_type memory_bytes[Size_type(AllocationGroup::NumAllocationGroups)] = {0}; + + MemoryCounts memory_total_bytes[Size_type(AllocationGroup::NumAllocationGroups)]; + MemoryCounts 
memory_totals_bytes[Size_type(CountingPoint::NumCountingPoints)][Size_type(AllocationGroup::NumAllocationGroups)]; + + std::vector> allocations; + + + Size_type operation_counters[Size_type(OpType::NumOpTypes)][Size_type(Operation::NumOperations)] = {{0}}; + + + std::unique_ptr counter_context; + + + std::vector wrapper_formats; + + + void set_formats(std::initializer_list wrapper_formats) + { + for (const char* wrapper_format : wrapper_formats) { + this->wrapper_formats.emplace_back(wrapper_format); + } + } + + + AllocationMetadata* get_allocation(const void* ptr) + { + if (!ptr) { + return nullptr; + } + auto iter = std::ranges::find_if(allocations, + [&](std::unique_ptr const& item) { + if (!item->ptr) { return false; } + const char* allocation_begin = static_cast(item->ptr); + const char* allocation_end = allocation_begin + item->size*item->element_size; + return (allocation_begin <= static_cast(ptr) && + allocation_end > static_cast(ptr)); + }); + if (iter == allocations.end()) { + return nullptr; + } + return iter->get(); + } + /// + AllocationMetadata* get_allocation(const void* ptr_ptr, std::source_location location) + { + auto iter = std::ranges::find(allocations, + std::make_tuple(ptr_ptr, location.line(), location.column()), + [](std::unique_ptr const& item) { + return std::make_tuple(item->ptr_ptr, + item->allocate_location.line(), + item->allocate_location.column()); + }); + if (iter == allocations.end()) { + return nullptr; + } + return iter->get(); + } + + void add_allocation_impl(std::string pointed_to_type_name, AllocationGroup group, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + auto item = get_allocation(ptr); + if (item) { + throw std::runtime_error("Allocation with this pointer already registered"); + } + item = get_allocation(ptr_ptr, location); + if (item) { + if (pointed_to_type_name != item->pointed_to_type_name || + size != item->size || + element_size != 
item->element_size) { + throw std::runtime_error("Allocation at this location changed type, size, or element_size"); + } + item->allocate(ptr); + } else { + item = allocations.emplace_back( + std::make_unique( + allocations.size(), ptr_ptr, location, group, + std::move(pointed_to_type_name), ptr, size, element_size)).get(); + counter_context->update_allocations(allocations); + current_context->add_allocation(*item); + } + } + + void add_allocation(std::string pointed_to_type_name, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + add_allocation_impl(std::move(pointed_to_type_name), AllocationGroup::global, + ptr, size, element_size, + ptr_ptr, location); + } + + void add_team_allocation(std::string pointed_to_type_name, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + add_allocation_impl(std::move(pointed_to_type_name), AllocationGroup::team, + ptr, size, element_size, + ptr_ptr, location); + } + + void remove_allocation(void* ptr, + [[maybe_unused]] const void* ptr_ptr, + [[maybe_unused]] std::source_location location = std::source_location::current()) + { + auto item = get_allocation(ptr); + if (!item) { + throw std::runtime_error("Allocation with this pointer not registered"); + } + item->deallocate(); + current_context->remove_allocation(*item); + } + + + ScopedContext create_context(const char* text, + std::source_location location = std::source_location::current()) + { + if (counter_context) { + throw std::runtime_error("Already created exterior context"); + } + + counter_context = std::make_unique( + location.line(), nullptr, ContextType::exterior, text, allocations); + + current_data = this; + current_context = counter_context.get(); + + current_context->hit_count += 1; + + return {this, current_context}; + } + + void push_context(ContextType type, const char* text, + std::source_location location = std::source_location::current()) 
+ { + if (!current_data) { + throw std::runtime_error("Current data not set"); + } + if (!current_context) { + throw std::runtime_error("Current context not set"); + } + current_context = current_context->get_or_emplace_child( + location.line(), type, text, allocations); + current_context->hit_count += 1; + } + + ScopedContext push_outer_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::outer, text, location); + return {this, current_context}; + } + + ScopedContext push_rep_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::repetition, text, location); + return {this, current_context}; + } + + ScopedContext push_cond_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::cond, text, location); + return {this, current_context}; + } + + ScopedContext push_outer_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::outer_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_seq_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::seq_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_par_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::par_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_body_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::body, text, location); + return {this, current_context}; + } + + ScopedContext push_team_context(const char* text, + std::source_location location = std::source_location::current()) + { + 
push_context(ContextType::team, text, location); + return {this, current_context}; + } + + void add_par_sync(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::par_sync, text, location); + pop_context(); + } + + void add_team_sync(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::team_sync, text, location); + pop_context(); + } + + void pop_context() + { + if (!current_context) { + throw std::runtime_error("No context to pop"); + } + if (current_context->point_depth == 1) { + CountingPoint src_point = current_context->point; + CountingPoint dst_point = current_context->parent + ? current_context->parent->point + : src_point; + count_touches(current_context, src_point, dst_point, 0); + } + + current_context = current_context->parent; + } + + + + void finalize_context([[maybe_unused]] std::source_location location) + { + if (!counter_context) throw std::runtime_error("Exterior context not created"); + if (!current_context) throw std::runtime_error("No current context"); + if (current_context != counter_context.get()) throw std::runtime_error("Not at outer context"); + current_context = nullptr; + current_data = nullptr; + + count_totals(counter_context.get(), 0); + + // count stats for allocations + for (auto& item : allocations) { + + Size_type g = Size_type(item->group); + + memory_allocations[g] += 1; + memory_bytes[g] += item->size * item->element_size; + + memory_total_bytes[g].add(item->counts.total_counts, item->element_size); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + memory_totals_bytes[p][g].add(item->counts.address_counts[p], item->element_size); + + item->counts.clear_accesses(CountingPoint(p)); + } + } + + count_kernels_and_iterations(counter_context.get()); + + count_operations(counter_context.get()); + } + + void count_totals(Context* context, Size_type depth) + { + for (auto& 
  // Count the addresses touched in `context` and everything below it for the
  // counting point src_point, then reset the src_point touch flags.
  //
  // context   - root of the subtree to process
  // src_point - counting point whose flags are counted and reset
  // dst_point - counting point that also receives the flags when it is a
  //             valid point different from src_point
  // depth     - recursion depth; 0 for the context the call started on
  void count_touches(Context* context, CountingPoint src_point,
                     CountingPoint dst_point, Size_type depth)
  {
    // Children are handled before this context (post-order).
    for (auto& child_ptr : context->children) {
      count_touches(child_ptr.get(), src_point, dst_point, depth+1);
    }

    for (auto& item : allocations) {

      // This context's flags and counts for the allocation.
      auto& src_counts = context->aloc_counts[item->idx];

      // Merge this context's src_point flags into the allocation-wide record.
      item->counts.combine_accesses(src_point, src_counts, src_point);

      if (dst_point < CountingPoint::NumCountingPoints &&
          dst_point != src_point) {

        // Carry the src_point flags over to the dst_point slot, both within
        // this context's own record (src_counts refers to the same object)...
        context->aloc_counts[item->idx].combine_accesses(
            dst_point, src_counts, src_point);

        // ...and in the allocation-wide record.
        item->counts.combine_accesses(dst_point, src_counts, src_point);

      }

      // Turn the flags into counts, then reset them. The merges above must
      // happen first because they read the same flags.
      src_counts.count(src_point);

      src_counts.set_all_accesses(src_point, false);

      // Only the context the call started on counts and resets the
      // allocation-wide record; by then every descendant has merged into it.
      if (depth == 0) {

        item->counts.count(src_point);

        item->counts.set_all_accesses(src_point, false);
      }
    }

  }
(auto& child_ptr : context->children) { + + auto [par_iter, all_iter] = + count_kernels_and_iterations(child_ptr.get(), + par_loop_stack_depth, + all_loop_stack_depth); + + max_child_par_iterations = std::max(par_iter, max_child_par_iterations); + all_child_par_iterations += par_iter; + max_child_iterations = std::max(child_ptr->hit_count, max_child_iterations); + all_loop_iterations += all_iter; + + } + + Size_type child_par_iterations = all_child_par_iterations; + Size_type child_all_iterations = all_loop_iterations; + if (context->type == ContextType::seq_loop) { + child_all_iterations = std::max(all_loop_iterations, max_child_iterations); + } + + if (context->type == ContextType::team_sync) { + team_sync_per_rep_counter += context->hit_count; + } else if (context->type == ContextType::par_sync) { + par_sync_per_rep_counter += context->hit_count; + } + + if (Size_type(context->point) <= Size_type(CountingPoint::loop)) { + + if (context->point == CountingPoint::loop && context->point_depth == 1) { + kernel_per_rep_counter += context->hit_count; + } + + child_par_iterations = max_child_par_iterations; + if (context->type == ContextType::par_loop) { + child_par_iterations = std::max(max_child_par_iterations, max_child_iterations); + child_all_iterations = std::max(all_loop_iterations, max_child_iterations); + } + + } + + if (context->point == CountingPoint::rep && context->point_depth == 1) { + par_it_per_rep_counter = all_child_par_iterations; + all_it_per_rep_counter = all_loop_iterations; + } + + return {{child_par_iterations, child_all_iterations}}; + + } + + void count_operations(Context* context) + { + for (auto& child_ptr : context->children) { + count_operations(child_ptr.get()); + } + + if (Size_type(context->point) > Size_type(CountingPoint::rep)) { + return; // don't count operations outside of the repetition + } + + for (Size_type ot = 0; ot < Size_type(OpType::NumOpTypes); ++ot) { + for (Size_type op = 0; op < Size_type(Operation::NumOperations); ++op) 
{ + operation_counters[ot][op] += context->operation_counters[ot][op]; + } + } + } + + void print_context(std::ostream& str, Context const& context, Size_type depth) const + { + context.print_header(str, depth+1); + + context.print_allocations(str, depth+1, allocations); + + context.print_counters(str, depth+1); + + context.print_text(str, depth+1, wrapper_formats); + + if (!context.children.empty()) { + + context.print(str, depth+1, "{"); + + for (auto const& child_ptr : context.children) { + print_context(str, *child_ptr.get(), depth+1); + } + + context.print(str, depth+1, "}"); + } + } + + void print(std::ostream& str) const + { + Context const& context = *counter_context.get(); + Size_type depth = 0; + context.print(str, depth, "{"); + print_context(str, context, depth); + context.print(str, depth, "}"); + } +}; + +inline void ScopedContext::pop_context() +{ + if (context) { + if (CountingData::current_context != context) { + throw std::runtime_error("ScopedContext popped in wrong context"); + } + if (CountingData::current_data != countingData) { + throw std::runtime_error("ScopedContext popped in wrong context"); + } + CountingData::current_data->pop_context(); + release(); + } +} + +} // closing brace for counting namespace + +} // closing brace for rajaperf namespace + +#endif // closing endif for header file include guard diff --git a/src/common/CountingMacros.hpp b/src/common/CountingMacros.hpp new file mode 100644 index 000000000..9a9ed2e6e --- /dev/null +++ b/src/common/CountingMacros.hpp @@ -0,0 +1,142 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
// Declare `_exterior_context` from this->initializeCounters(...), passing the
// stringified spelling of every wrapper macro with `{0}` standing in for the
// wrapped type. RAJAPERF_COUNTERS_FINALIZE() later hands `_exterior_context`
// to this->finalizeCounters().
// All four array wrapper spellings are listed so that the 4-d array wrapper
// (used by Real_array4 and Real_array4_ref) is registered like the others.
// NOTE(review): presumably these strings are the formats used to strip the
// wrapper spelling from printed kernel text - confirm in initializeCounters.
// The expansion ends with its own semicolon.
#define RAJAPERF_COUNTERS_INITIALIZE() \
  auto _exterior_context = this->initializeCounters({ \
      RAJAPERF_STRINGIFY(RAJAPERF_WRAPPER({0})), \
      RAJAPERF_STRINGIFY(RAJAPERF_ARRAY1_WRAPPER({0})), \
      RAJAPERF_STRINGIFY(RAJAPERF_ARRAY2_WRAPPER({0})), \
      RAJAPERF_STRINGIFY(RAJAPERF_ARRAY3_WRAPPER({0})), \
      RAJAPERF_STRINGIFY(RAJAPERF_ARRAY4_WRAPPER({0}))});
// Run the given statements inside a body context.
// Expands to three statements: push a body context whose text is the
// stringified arguments, execute the arguments, then pop the context
// explicitly. Because the pop is explicit, the guard variable's destructor
// has nothing left to pop.
// The guard variable is named per source line via RAJAPERF_NAME_PER_LINE.
// Notes:
//  - the expansion dereferences CountingData::current_data, so an exterior
//    context must already be active;
//  - it is several statements, not one, so do not use it as the unbraced
//    body of an if/for/while;
//  - the final statement has no trailing semicolon; the caller supplies it.
#define RAJAPERF_COUNTERS_LOOP_BODY(...) \
  auto RAJAPERF_NAME_PER_LINE(_body_context_) = \
      counting::CountingData::current_data->push_body_context( \
          RAJAPERF_STRINGIFY(__VA_ARGS__)); \
  __VA_ARGS__; \
  RAJAPERF_NAME_PER_LINE(_body_context_).pop_context()
// Record a parallel synchronization at this source line.
// Calls add_par_sync() on CountingData::current_data, which pushes a par_sync
// context with the text "synchronize();" and pops it again immediately.
// Creating that context throws std::runtime_error unless the enclosing
// counting point is the repetition.
// The expansion dereferences current_data and has no trailing semicolon;
// the caller supplies it.
#define RAJAPERF_COUNTERS_PAR_SYNC() \
  counting::CountingData::current_data->add_par_sync("synchronize();")
b/src/common/CountingWrapper.hpp @@ -0,0 +1,1017 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingWrapper_HPP +#define RAJAPerf_CountingWrapper_HPP + +#include "common/RAJAPerfSuite.hpp" +#include "common/RPTypes.hpp" +#include "common/CountingData.hpp" + +#include +#include +#include +#include +#include + +namespace rajaperf +{ + +namespace counting +{ + +// Wrapper types that count operations +template < typename T > +struct Wrapper; + + +template < typename T > +struct is_wrapper +{ + static inline constexpr bool value = false; +}; + +template < typename T > +struct is_wrapper> +{ + static inline constexpr bool value = true; +}; + +template < typename T > +inline constexpr bool is_wrapper_v = is_wrapper::value; + + + +template < typename T > +concept Wrapped = is_wrapper_v>; + +template < typename T > +concept NonWrapped = !Wrapped; + +template < typename T > +concept WrappedVal = Wrapped && T::is_val; + +template < typename T > +concept WrappedArray = Wrapped && T::is_array; + +template < typename T > +concept WrappedPtr = Wrapped && T::is_ptr; + +template < typename T > +concept WrappedNonPtr = Wrapped && !T::is_ptr; + +template < typename T > +concept WrappedRef = Wrapped && T::is_ref; + + +template < typename T > +struct PointedToType +{ + using type = std::remove_reference_t())>; +}; + +template < WrappedPtr T > +struct PointedToType +{ + using type = typename std::remove_cvref_t::pointed_to_type; +}; + +template < typename T > +using pointed_to_type_t = typename PointedToType::type; + + +template < typename T > +struct WrappedType +{ + using type = T; +}; + +template < Wrapped T > +struct WrappedType 
+{ + using direct_type = typename std::remove_cvref_t::wrapped_type; + using const_type = std::conditional_t, std::add_const_t, direct_type>; + using lref_type = std::conditional_t, std::add_lvalue_reference_t, const_type>; + using rref_type = std::conditional_t, std::add_rvalue_reference_t, lref_type>; + using type = rref_type; +}; + +template < typename T > +using wrapped_type_t = typename WrappedType::type; + + +template < typename T > +concept raw_pointer = std::is_pointer_v; + +template < typename T > +concept pointer = raw_pointer || WrappedPtr; + +template < typename T > +concept convertible_to_pointer = std::convertible_to, pointed_to_type_t*>; + +template < typename T, typename U > +concept convertible_to = std::convertible_to, wrapped_type_t>; + +template < typename T > +concept integral = std::integral || + (Wrapped && std::integral); + + +template < typename T > +constexpr decltype(auto) get_value(T&& val, Size_type num_ops=0) +{ + if constexpr (Wrapped) { + return std::forward(val).get_native(num_ops); + } else { + return std::forward(val); + } +} + +template +struct add_all_extents_of_to +{ + using type = V; +}; + +template +struct add_all_extents_of_to +{ + using type = typename add_all_extents_of_to::type[]; +}; + +template +struct add_all_extents_of_to +{ + using type = typename add_all_extents_of_to::type[N]; +}; + +template +using add_all_extents_of_to_t = typename add_all_extents_of_to::type; + +template < typename T > +struct Wrapper +{ + static inline constexpr bool is_ref = std::is_reference_v; + static inline constexpr bool is_val = !is_ref; + static inline constexpr bool is_array = std::is_array_v>; + static inline constexpr bool is_ptr = std::is_pointer_v; + + template < typename U > + friend struct Wrapper; + + using wrapped_type = T; + + using value_type = std::conditional_t, T>; + using const_value_type = std::conditional_t>, + const value_type>; + + using member_type = std::conditional_t>, value_type>; + + using pointed_to_type = + 
std::conditional_t, + std::conditional_t, + value_type>>; + using const_pointed_to_type = + std::conditional_t, + std::conditional_t, + const_value_type>>; + + template < size_t... Is > + static constexpr size_t get_array_size(std::index_sequence) + { + return (... * std::extent_v); + } + /// + static constexpr size_t get_array_size() + { + if constexpr (is_array) { + using dims = std::make_index_sequence>; + return get_array_size(dims{}); + } + return size_t(0); + } + + explicit Wrapper(AllocationMetadata* allocation, member_type value) + : m_value(value) + , m_allocation(allocation) + { + } + + // allow default construction of non-ref values + Wrapper() + requires(is_val && !is_array) + : m_value() + { + } + /// + Wrapper(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + : m_value() + { + registerArray(location); + m_allocation = CountingData::current_data->get_allocation( + static_cast(&m_value)); + } + + // allow implicit construction from non-wrapped values + template < convertible_to rhs_T > + Wrapper(rhs_T&& rhs) + requires(is_val && !is_array && !is_ptr) + : m_value(get_value(std::forward(rhs), 1)) + { + this->count(Operation::copy, 1); + } + /// + Wrapper(std::nullptr_t) + requires(is_val && !is_array && is_ptr) + : Wrapper() + { + } + /// + template < convertible_to rhs_T > + Wrapper(rhs_T&& rhs) + requires(is_val && !is_array && is_ptr) + : m_value(get_value(std::forward(rhs), 1)) + { + if constexpr (WrappedPtr) { + m_allocation = rhs.m_allocation; + } else { + m_allocation = CountingData::current_data->get_allocation( + static_cast(m_value)); + } + if (!m_allocation) { + std::ostringstream str; + str << "Couldn't find allocation "; + str << static_cast(get_value(std::forward(rhs))); + throw std::runtime_error(str.str()); + } + this->count(Operation::copy, 1); + } + /// + template < NonWrapped rhs_T > + Wrapper(rhs_T& rhs) + requires(is_ref) + : m_value(&rhs) + { + m_allocation = 
CountingData::current_data->get_allocation( + static_cast(m_value)); + if (!m_allocation) { + throw std::runtime_error("Couldn't find allocation"); + } + } + + // copy and move constructors + Wrapper(Wrapper const& rhs) + requires(!(is_val && is_array)) + : m_value(rhs.get_native()) + , m_allocation(rhs.m_allocation) + { + if constexpr (is_val) { + this->count(Operation::copy, 1); + } + } + /// + Wrapper(Wrapper && rhs) + requires(!(is_val && is_array)) + : m_value(std::move(rhs).get_native()) + , m_allocation(rhs.m_allocation) + { + if constexpr (is_val) { + this->count(Operation::copy, 1); + } + } + + // count assignments from non-wrapped values + template < NonWrapped rhs_T > + Wrapper& operator=(rhs_T&& rhs) + requires(!is_array) + { + this->set(std::forward(rhs)); + if constexpr (is_ptr) { + this->m_allocation = CountingData::current_data->get_allocation( + (void*)(m_value)); + if (!m_allocation) { + throw std::runtime_error("Couldn't find allocation"); + } + } + this->count(Operation::assign, 1); + return *this; + } + /// + Wrapper& operator=(std::nullptr_t) + requires(is_val && is_ptr) + { + return (*this) = Wrapper(); + } + + // count assignments from wrappers + Wrapper& operator=(Wrapper const& rhs) + requires(!is_array) + { + this->set(rhs.get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + /// + Wrapper& operator=(Wrapper&& rhs) + requires(!is_array) + { + this->set(std::move(rhs).get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + /// + template < Wrapped rhs_T > + Wrapper& operator=(rhs_T&& rhs) + requires(!is_array) + { + this->set(std::forward(rhs).get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + + ~Wrapper() + { + if constexpr (is_val && is_array) { + deregisterArray(); + 
} + } + + +#define RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(op_name, op, op_enum) \ + auto& op_name() \ + requires(!is_array) \ + { \ + this->set(this->get_native() op 1); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(operator++, +, Operation::preinc) + RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(operator--, -, Operation::predec) + + +#define RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(op_name, op, op_enum) \ + auto op_name(int) \ + requires(!is_array) \ + { \ + auto value = this->get_value_wrapper(); \ + this->set(value.get_native() op 1); \ + this->count(op_enum, 1); \ + return value; \ + } + + RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(operator++, +, Operation::postinc) + RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(operator--, -, Operation::postdec) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(!is_array && !is_ptr) \ + { \ + this->set(this->get_native() op rhs.get_native()); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(!is_array && !is_ptr) \ + { \ + this->set(this->get_native() op rhs); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator+=, +, Operation::add) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator-=, -, Operation::sub) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator*=, *, Operation::mult) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator/=, /, Operation::div) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator%=, %, Operation::rem) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator&=, &, Operation::bit_and) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator|=, |, Operation::bit_or) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator^=, ^, Operation::bit_xor) + 
RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator<<=, <<, Operation::bit_lsh) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator>>=, >>, Operation::bit_rsh) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ptr) \ + { \ + this->m_value op##= rhs.get_native(); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ptr) \ + { \ + this->m_value op##= rhs; \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(operator+=, +, Operation::add) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(operator-=, -, Operation::sub) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_ATOMIC_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ref) \ + { \ + this->set(this->get_native(0) op rhs.get_native(), 0); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ref) \ + { \ + this->set(this->get_native(0) op rhs, 0); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_ATOMIC_OPERATOR(atomic_add, +, Operation::atomic_add) + + + auto operator&() + requires(is_ref) + { + return Wrapper(m_allocation, m_value); + } + /// + auto operator&() const + requires(is_ref) + { + return Wrapper(m_allocation, m_value); + } + + + auto operator*() + requires(is_array || is_ptr) + { + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &*m_value); + } else { + return Wrapper(m_allocation, m_value); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &**m_value); + } else { + return Wrapper(nullptr, *m_value); + } + } + } + /// + auto operator*() const + 
requires(is_array || is_ptr) + { + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &*m_value); + } else { + return Wrapper(m_allocation, m_value); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &*(*m_value)); + } else { + return Wrapper(nullptr, (*m_value)); + } + } + } + + auto operator->() const + requires(is_ptr) + { + return m_value; + } + + + template < convertible_to I > + auto operator[](I&& i) + requires(is_array || is_ptr) + { + this->count(Operation::add, 1); + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value[get_value(std::forward(i), 1)]); + } else { + return Wrapper(m_allocation, m_value+get_value(std::forward(i), 1)); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &(*m_value)[get_value(std::forward(i), 1)]); + } else { + return Wrapper(nullptr, (*m_value)+get_value(std::forward(i), 1)); + } + } + } + /// + template < convertible_to I > + auto operator[](I&& i) const + requires(is_array || is_ptr) + { + this->count(Operation::add, 1); + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value[get_value(std::forward(i), 1)]); + } else { + return Wrapper(m_allocation, m_value+get_value(std::forward(i), 1)); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &(*m_value)[get_value(std::forward(i), 1)]); + } else { + return Wrapper(nullptr, (*m_value)+get_value(std::forward(i), 1)); + } + } + } + + operator auto() const + { + if constexpr (!is_array) { + this->count(Operation::copy, 1); + return this->get_native(); + } + } + /// + explicit operator Wrapper() + requires(is_val) + { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value); + } else { + return Wrapper(nullptr, &m_value); + } + } + /// + explicit operator Wrapper() const + requires(is_val) + { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value); + } else { + return 
Wrapper(nullptr, &m_value); + } + } + + void swap(Wrapper& rhs) + requires(!is_array) // consider implementing array version later + { + using std::swap; + value_type rhs_tmp(std::move(rhs).get_native()); + rhs.set(std::move(*this).get_native()); + this->set(std::move(rhs_tmp)); + swap(this->m_allocation, rhs.m_allocation); + } + + void swap(Wrapper&& rhs) && + requires(is_ref && ! is_array) + { + using std::swap; + value_type rhs_tmp(std::move(rhs).get_native()); + rhs.set(std::move(*this).get_native()); + this->set(std::move(rhs_tmp)); + swap(this->m_allocation, rhs.m_allocation); + } + + + // internal interface methods, should only be used in this file + template < typename rhs_T > + void set(rhs_T&& rhs, Size_type num_ops = 1) + requires(!is_array) + { + if constexpr (is_val) { + m_value = std::forward(rhs); + } else { + this->count(Operation::store, num_ops); + *m_value = std::forward(rhs); + } + } + + // gets a copy of the value represented by this object + auto get_value_wrapper(Size_type num_ops = 1) const + requires(!is_array) + { + if constexpr (is_val) { + return Wrapper(m_allocation, m_value); + } else { + this->count(Operation::load, num_ops); + return Wrapper(nullptr, *m_value); + } + } + + // gets a reference to the underlying value + auto&& get_native(Size_type num_ops = 1) & + { + if constexpr (is_val) { + return m_value; + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) && + { + if constexpr (is_val) { + return std::move(m_value); + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) const& + { + if constexpr (is_val) { + return m_value; + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) const&& + { + if constexpr (is_val) { + return std::move(m_value); + } else { + this->count(Operation::load, num_ops); + return *m_value; 
+ } + } + + + template < typename U = T > + void count(Operation op, Size_type num_ops) const + { + using V = std::decay_t; // decay arrays to pointers + + if (!CountingData::current_context) { + throw std::runtime_error("Can't count if there is no current context"); + } + + CountingData::current_context->operation_counters[ + Size_type(getOpType())][Size_type(op)] += num_ops; + + if constexpr (std::is_pointer_v && sizeof(std::remove_pointer_t) > 1) { + + if (op == Operation::add || op == Operation::sub) { + // Note that this fails to differentiate between + // adding/subtracting a pointer and an integer which entails a mult or bit_lsh + // and subtracting two pointers which entails a div or bit_rsh + auto is_pow_2 = [](size_t n) { return (n & (n-1)) == size_t(0); }; + Operation extra_op = is_pow_2(sizeof(std::remove_pointer_t)) + ? Operation::bit_lsh : Operation::mult ; + CountingData::current_context->operation_counters[ + Size_type(getOpType())][Size_type(extra_op)] += num_ops; + } + } + + if constexpr (std::is_reference_v) { + if (op == Operation::load || op == Operation::store || + op == Operation::atomic_add) { + + if (!m_allocation) { + throw std::runtime_error("Memory access to unknown allocation"); + } + + auto base_ptr = static_cast(m_allocation->ptr); + check_bounds(base_ptr); + + if (num_ops > Size_type(0)) { + CountingPoint point = CountingData::current_context->point; + MemoryAccess access = MemoryAccess::NumMemoryAccesses; + if (op == Operation::load) { + access = MemoryAccess::read; + } else if (op == Operation::store) { + access = MemoryAccess::write; + } else if (op == Operation::atomic_add) { + access = MemoryAccess::atomicModifyWrite; + } + Size_type offset = m_value - base_ptr; + CountingData::current_context->aloc_counts[m_allocation->idx]. 
+ touch(point, access, offset, num_ops); + } + } + } + } + + void check_bounds(member_type base_ptr) const + requires(is_ref) + { + if (!base_ptr) { + throw std::runtime_error("Memory access to deallocated pointer"); + } + if (m_value < base_ptr) { + throw std::runtime_error("Memory access is out of bounds low"); + } + if (m_value >= (base_ptr + m_allocation->size)) { + throw std::runtime_error("Memory access is out of bounds high"); + } + } + + void registerArray(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + { + CountingData::current_data->add_team_allocation( + get_type_name>(), + static_cast(&m_value), + get_array_size(), sizeof(std::remove_all_extents_t), + static_cast(&m_value), location); + } + + void deregisterArray(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + { + CountingData::current_data->remove_allocation( + static_cast(&m_value), + static_cast(&m_value), location); + } + +private: + member_type m_value; + AllocationMetadata* m_allocation = nullptr; +}; + +template < typename U > +auto make_ValueWrapper(U&& value) +{ + return Wrapper>(value); +} + +// Operations with Wrapper types +// Some of these will be found before functions of the same name in the +// global namespace + +#define RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(op_name, op, op_enum) \ + template < typename T > \ + auto op_name(Wrapper const& obj) \ + { \ + using ::op; \ + auto value = make_ValueWrapper(op(obj.get_native())); \ + value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(exp, exp, Operation::exp) +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(sqrt, sqrt, Operation::sqrt) +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(fabs, fabs, Operation::abs) + + +#define RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr T > \ + auto op_name(T const& obj) \ + { \ + auto value = make_ValueWrapper(op(obj.get_native())); \ + 
value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(operator+, +, Operation::uplus) +RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(operator-, -, Operation::uminus) + +template < WrappedPtr T > +auto operator+(T const& obj) +{ + Wrapper> value( + obj.m_allocation, +(obj.get_native())); + value.count(Operation::uplus, 1); + return value; +} + + +#define RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(op_name, op, op_enum) \ + template < NonWrapped lhs_T, Wrapped rhs_T > \ + auto op_name(lhs_T & lhs, \ + rhs_T const& rhs) \ + { \ + rhs.template count(op_enum, 1); \ + return lhs op rhs.get_native(); \ + } + +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator+=, +=, Operation::add) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator-=, -=, Operation::sub) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator*=, *=, Operation::mult) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator/=, /=, Operation::div) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator%=, %=, Operation::rem) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator&=, &=, Operation::bit_and) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator|=, |=, Operation::bit_or) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator^=, ^=, Operation::bit_xor) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator<<=, <<=, Operation::bit_lsh) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator>>=, >>=, Operation::bit_rsh) + + +#define RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr lhs_T, WrappedNonPtr rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = make_ValueWrapper(lhs.get_native() op rhs.get_native()); \ + value.count(op_enum, 1); \ + return value; \ + } \ + template < WrappedNonPtr lhs_T, NonWrapped rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = 
make_ValueWrapper(lhs.get_native() op rhs); \ + value.count(op_enum, 1); \ + return value; \ + } \ + template < NonWrapped lhs_T, WrappedNonPtr rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = make_ValueWrapper(lhs op rhs.get_native()); \ + value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator+, +, Operation::add) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator-, -, Operation::sub) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator*, *, Operation::mult) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator/, /, Operation::div) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator%, %, Operation::rem) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator&, &, Operation::bit_and) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator|, |, Operation::bit_or) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator^, ^, Operation::bit_xor) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator<<, <<, Operation::bit_lsh) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator>>, >>, Operation::bit_rsh) + + +template < typename lhs_T, typename rhs_T > +auto operator+(Wrapper const& lhs, + Wrapper const& rhs) +requires((Wrapper::is_ptr || Wrapper::is_ptr) && + !(Wrapper::is_ptr && Wrapper::is_ptr)) +{ + if constexpr (Wrapper::is_ptr) { + auto value = lhs.get_value_wrapper(); + value += rhs; + return value; + } else { + auto value = rhs.get_value_wrapper(); + value += lhs; + return value; + } +} +template < WrappedPtr lhs_T, NonWrapped rhs_T > +auto operator+(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value += rhs; + return value; +} +template < NonWrapped lhs_T, WrappedPtr rhs_T > +auto operator+(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = rhs.get_value_wrapper(); + value += lhs; + return value; +} + +template < WrappedPtr lhs_T, WrappedNonPtr rhs_T > +auto operator-(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value -= rhs; + 
return value; +} +template < WrappedPtr lhs_T, NonWrapped rhs_T > +auto operator-(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value -= rhs; + return value; +} + + +#define RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(op_name, op, op_enum) \ + template < typename lhs_T, typename rhs_T > \ + auto op_name(Wrapper const& lhs, \ + Wrapper const& rhs) \ + { \ + lhs.template count>(op_enum, 1); \ + return lhs.get_native() op rhs.get_native(); \ + } \ + template < typename lhs_T, NonWrapped rhs_T > \ + auto op_name(Wrapper const& lhs, \ + rhs_T const& rhs) \ + { \ + lhs.template count>(op_enum, 1); \ + return lhs.get_native() op rhs; \ + } \ + template < NonWrapped lhs_T, typename rhs_T > \ + auto op_name(lhs_T const& lhs, \ + Wrapper const& rhs) \ + { \ + rhs.template count>(op_enum, 1); \ + return lhs op rhs.get_native(); \ + } + +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator==, ==, Operation::eq) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator!=, !=, Operation::ne) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator< , < , Operation::lt) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator<=, <=, Operation::le) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator> , > , Operation::gt) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator>=, >=, Operation::ge) + +template < Wrapped T > +void swap(T& lhs, T& rhs) +{ + lhs.swap(rhs); +} + +template < WrappedRef T > +void swap(T&& lhs, T&& rhs) +{ + std::move(lhs).swap(std::move(rhs)); +} + +// helper for getting right type +template < template typename T > +struct Array1WrapperHelper +{ + template < size_t N > + using type = Wrapper>; +}; +/// +template < template typename T > +struct Array2WrapperHelper +{ + template < size_t N0, size_t N1 > + using type = Wrapper>; +}; +/// +template < template typename T > +struct Array3WrapperHelper +{ + template < size_t N0, size_t N1, size_t N2 > + using type = Wrapper>; +}; +/// +template < template typename T > +struct 
Array4WrapperHelper +{ + template < size_t N0, size_t N1, size_t N2, size_t N3 > + using type = Wrapper>; +}; + +} // closing brace for counting namespace + +} // closing brace for rajaperf namespace + +namespace std +{ + +template < typename T > +struct iterator_traits<::rajaperf::counting::Wrapper> +{ + using difference_type = ::rajaperf::counting::Wrapper; + using value_type = ::rajaperf::counting::Wrapper>; + using pointer = ::rajaperf::counting::Wrapper; + using reference = ::rajaperf::counting::Wrapper; + using iterator_category = std::random_access_iterator_tag; +}; + +} // closing brace for std namespace + + +// Use this wrapper type in variable declarations in a kernel +// ex. +// RAJAPERF_WRAPPER(my_struct*) val; +// Note wrapping is done for most types in CountingMacros.hpp, but some types +// like structs specific to a kernel need to be wrapped manually +// Note do not use it if declaring variables with constant values +#define RAJAPERF_WRAPPER(type) type + +#endif // closing endif for header file include guard diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index bfa34efa9..de5f86f18 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -257,59 +257,26 @@ inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, } -template +template struct AutoDataMover { - AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, Size_type len, Size_type align) - : m_ptr(&ptr) - , m_new_dataSpace(new_dataSpace) - , m_old_dataSpace(old_dataSpace) - , m_len(len) - , m_align(align) + AutoDataMover(Func func) + : m_func(func) { } AutoDataMover(AutoDataMover const&) = delete; AutoDataMover& operator=(AutoDataMover const&) = delete; - AutoDataMover(AutoDataMover&& rhs) - : m_ptr(std::exchange(rhs.m_ptr, nullptr)) - , m_new_dataSpace(rhs.m_new_dataSpace) - , m_old_dataSpace(rhs.m_old_dataSpace) - , m_len(rhs.m_len) - , m_align(rhs.m_align) - { } - AutoDataMover& operator=(AutoDataMover&& rhs) - { - finalize(); 
- m_ptr = std::exchange(rhs.m_ptr, nullptr); - m_new_dataSpace = rhs.m_new_dataSpace; - m_old_dataSpace = rhs.m_old_dataSpace; - m_len = rhs.m_len; - m_align = rhs.m_align; - return *this; - } - - void finalize() - { - if (m_ptr) { - moveData(m_new_dataSpace, m_old_dataSpace, - *m_ptr, m_len, m_align); - m_ptr = nullptr; - } - } + AutoDataMover(AutoDataMover&& rhs) = delete; + AutoDataMover& operator=(AutoDataMover&& rhs) = delete; ~AutoDataMover() { - finalize(); + m_func(); } private: - T** m_ptr; - DataSpace m_new_dataSpace; - DataSpace m_old_dataSpace; - Size_type m_len; - Size_type m_align; + Func m_func; }; /*! diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 7805376bb..551eea808 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -39,7 +39,10 @@ #include #include #include + +#include #include +#include #include #if defined(_WIN32) @@ -263,6 +266,7 @@ void Executor::setupSuite() const std::set& run_kern = run_params.getKernelIDsToRun(); for (auto kid = run_kern.begin(); kid != run_kern.end(); ++kid) { kernels.push_back( getKernelObject(*kid, run_params) ); + kernels.back()->setCountedAttributes(); } const std::set& run_var = run_params.getVariantIDsToRun(); @@ -518,131 +522,334 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const #endif } + const bool skip_if_nonpositive = !to_file; + // // Set up column headers and column widths for kernel summary output. 
// + string attr_category_head(""); + string kern_head("Kernels"); - size_t kercol_width = kern_head.size(); - - Index_type psize_width = 0; - Index_type reps_width = 0; - Index_type itsrep_width = 0; - Index_type bytesrep_width = 0; - Index_type flopsrep_width = 0; - Index_type bytesReadrep_width = 0; - Index_type bytesWrittenrep_width = 0; - Index_type bytesAtomicModifyWrittenrep_width = 0; - Index_type dash_width = 0; + Index_type kercol_width = static_cast(kern_head.size()); for (size_t ik = 0; ik < kernels.size(); ++ik) { - kercol_width = max(kercol_width, kernels[ik]->getName().size()); - psize_width = max(psize_width, kernels[ik]->getActualProblemSize()); - reps_width = max(reps_width, kernels[ik]->getRunReps()); - itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep()); - bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep()); - flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep()); - bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep()); - bytesWrittenrep_width = max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep()); - bytesAtomicModifyWrittenrep_width = max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep()); + kercol_width = max(kercol_width, static_cast(kernels[ik]->getName().size())); } + kercol_width += 2; +// +// Set up separators and width parameters. 
+// const string sepchr(" , "); - kercol_width += 2; - dash_width += kercol_width; - - double psize = log10( static_cast(psize_width) ); - string psize_head("Problem size"); - psize_width = max( static_cast(psize_head.size()), - static_cast(psize) ) + 3; - dash_width += psize_width + static_cast(sepchr.size()); - - double rsize = log10( static_cast(reps_width) ); - string rsize_head("Reps"); - reps_width = max( static_cast(rsize_head.size()), - static_cast(rsize) ) + 3; - dash_width += reps_width + static_cast(sepchr.size()); - - double irsize = log10( static_cast(itsrep_width) ); - string itsrep_head("Iterations/rep"); - itsrep_width = max( static_cast(itsrep_head.size()), - static_cast(irsize) ) + 3; - dash_width += itsrep_width + static_cast(sepchr.size()); - - string kernsrep_head("Kernels/rep"); - Index_type kernsrep_width = - max( static_cast(kernsrep_head.size()), - static_cast(4) ); - dash_width += kernsrep_width + static_cast(sepchr.size()); - - double brsize = log10( static_cast(bytesrep_width) ); - string bytesrep_head("Bytes/rep"); - bytesrep_width = max( static_cast(bytesrep_head.size()), - static_cast(brsize) ) + 3; - dash_width += bytesrep_width + static_cast(sepchr.size()); - - double frsize = log10( static_cast(flopsrep_width) ); - string flopsrep_head("FLOPS/rep"); - flopsrep_width = max( static_cast(flopsrep_head.size()), - static_cast(frsize) ) + 3; - dash_width += flopsrep_width + static_cast(sepchr.size()); - - double brrsize = log10( static_cast(bytesReadrep_width) ); - string bytesReadrep_head("BytesRead/rep"); - bytesReadrep_width = max( static_cast(bytesReadrep_head.size()), - static_cast(brrsize) ) + 3; - dash_width += bytesReadrep_width + static_cast(sepchr.size()); - - double bwrsize = log10( static_cast(bytesWrittenrep_width) ); - string bytesWrittenrep_head("BytesWritten/rep"); - bytesWrittenrep_width = max( static_cast(bytesWrittenrep_head.size()), - static_cast(bwrsize) ) + 3; - dash_width += bytesWrittenrep_width + 
static_cast(sepchr.size()); - - double bamrrsize = log10( static_cast(bytesAtomicModifyWrittenrep_width) ); - string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep"); - bytesAtomicModifyWrittenrep_width = max( static_cast(bytesAtomicModifyWrittenrep_head.size()), - static_cast(bamrrsize) ) + 3; - dash_width += bytesAtomicModifyWrittenrep_width + static_cast(sepchr.size()); - - str <::max() + : max(screen_width-kercol_width, screen_width/2); + +// +// Set up storage for attributes which will become the columns. +// + struct Attribute + { + std::string category_name; + std::string name; + Index_type width; + std::function getter; + }; + + std::vector attrs; + +// +// function used to print the table, includes the kernel column and attr columns. +// Clears attr columns after printing to make using more than once easier. +// + auto print_attr_table = [&]() { + + // print row of categories + str <getName(); + for (Attribute const& attr : attrs) { + str << sepchr <::min(); + for (size_t ik = 0; ik < kernels.size(); ++ik) { + max_value = max(max_value, getter(kernels[ik])); + } + if (skip_if_nonpositive && max_value <= static_cast(0)) return; + max_value = max(max_value, static_cast(1)); + double value_width = log10(static_cast(max_value)) + 1.0; + Index_type width = max( static_cast(category_name.size()), + static_cast(name.size()) ); + width = max( width, static_cast(value_width) ); + width += 2; + Index_type width_with_sep = static_cast(sepchr.size()) + width; + + if (current_width + width_with_sep > max_width) { + print_attr_table(); + } + + current_width += width_with_sep; + attrs.emplace_back(Attribute{category_name, name, width, getter}); + }; + +// +// user settable attributes +// + add_attr("Input", "Problem size", [](KernelBase const* kernel){ + return static_cast(kernel->getActualProblemSize()); + }); + + add_attr("Input", "Reps", [](KernelBase const* kernel){ + return static_cast(kernel->getRunReps()); + }); + + if ( !to_file && current_width > 
0 ) { + print_attr_table(); } - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kern = kernels[ik]; - str <getName() - << sepchr <getActualProblemSize() - << sepchr <getRunReps() - << sepchr <getItsPerRep() - << sepchr <getKernelsPerRep() - << sepchr <getBytesPerRep() - << sepchr <getFLOPsPerRep() - << sepchr <getBytesReadPerRep() - << sepchr <getBytesWrittenPerRep() - << sepchr <getBytesAtomicModifyWrittenPerRep() - << endl; +// +// manually counted attributes +// + add_attr("Estimate", "Iterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getItsPerRep()); + }); + + add_attr("Estimate", "Kernels/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getKernelsPerRep()); + }); + + add_attr("Estimate", "Bytes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesPerRep()); + }); + + add_attr("Estimate", "FLOPS/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getFLOPsPerRep()); + }); + + add_attr("Estimate", "BytesRead/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesReadPerRep()); + }); + + add_attr("Estimate", "BytesWritten/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesWrittenPerRep()); + }); + + add_attr("Estimate", "BytesAtomicModifyWritten/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesAtomicModifyWrittenPerRep()); + }); + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted high level attributes +// + add_attr("Counted", "Iterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedItsPerRep()); + }); + + add_attr("Counted", "ParallelIterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedParItsPerRep()); + }); + + add_attr("Counted", "MaxLoopNestDepth", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedMaxLoopNestDepth()); + }); + + add_attr("Counted", "MaxParallelLoopNestDepth", 
[](KernelBase const* kernel){ + return static_cast(kernel->getCountedMaxParLoopNestDepth()); + }); + + add_attr("Counted", "Kernels/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedKernelsPerRep()); + }); + + add_attr("Counted", "Synchronizes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedSyncsPerRep()); + }); + + add_attr("Counted", "TeamSynchronizes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedTeamSyncsPerRep()); + }); + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted memory attributes, at the per loop usage granularity +// + for (Size_type g = 0; g < Size_type(counting::AllocationGroup::NumAllocationGroups); ++g) { + auto gg = counting::AllocationGroup(g); + + std::string num_name = std::format("{}NumAllocations", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", num_name, [gg](KernelBase const* kernel){ + return static_cast(kernel->getCountedNumAllocations(gg)); + }); + + std::string bytes_name = std::format("{}AllocatedBytes", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", bytes_name, [gg](KernelBase const* kernel){ + return static_cast(kernel->getCountedAllocatedBytes(gg)); + }); + + std::string bytes_total_name = std::format("{}BytesTotal/rep", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", bytes_total_name, [gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedTotalBytes(gg)); + }); + + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + auto aa = counting::MemoryAccess(a); + + std::string bytes_total_accessed_name = std::format("{}BytesTotal{}/rep", + counting::getAllocationGroupName(gg), + counting::getMemoryAccessNamePastTenseTitle(aa)); + add_attr("Counted", bytes_total_accessed_name, [gg, aa](KernelBase const* kernel){ + return static_cast( + kernel->getCountedTotalBytesPerAccess(gg, aa)); + }); + + } + + for 
(Size_type p = 0; p < Size_type(counting::CountingPoint::NumCountingPoints); ++p) { + auto pp = counting::CountingPoint(p); + + std::string bytes_touched_name = std::format("{}BytesTouched/{}", + counting::getAllocationGroupName(gg), + counting::getCountingPointName(pp)); + + add_attr("Counted", bytes_touched_name, [pp, gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytesTouched(pp, gg)); + }); + + std::string bytes_name = std::format("{}Bytes/{}", + counting::getAllocationGroupName(gg), + counting::getCountingPointName(pp)); + + add_attr("Counted", bytes_name, [pp, gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytes(pp, gg)); + }); + + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + auto aa = counting::MemoryAccess(a); + + std::string bytes_accessed_name = std::format("{}Bytes{}/{}", + counting::getAllocationGroupName(gg), + counting::getMemoryAccessNamePastTenseTitle(aa), + counting::getCountingPointName(pp)); + add_attr("Counted", bytes_accessed_name, [pp, gg, aa](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytesPerAccess(pp, gg, aa)); + }); + + } + + } + } + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted operations attributes +// + for (Size_type ot = 0; ot < Size_type(counting::OpType::NumOpTypes); ++ot) { + + std::string opTypeName = counting::getOpTypeName(counting::OpType(ot)); + + add_attr("Counted", opTypeName+"_ops/rep", [ot](KernelBase const* kernel){ + return static_cast(kernel->getCountedArithmeticOpsPerRep(counting::OpType(ot))); + }, skip_if_nonpositive); + + for (Size_type op = 0; op < Size_type(counting::Operation::NumOperations); ++op) { + + std::string opName = counting::getOperationName(counting::Operation(op)); + + add_attr("Counted", opTypeName+"_"+opName+"/rep", [ot, op](KernelBase const* kernel){ + return static_cast(kernel->getCountedOpsPerRep(counting::OpType(ot), 
counting::Operation(op))); + }, skip_if_nonpositive); + + } + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + + } + + if (current_width > 0) { + print_attr_table(); } str.flush(); } +void Executor::writeKernelCounterSummary(ostream& str) const +{ + for (size_t ik = 0; ik < kernels.size(); ++ik) { + str << "\n/******** Kernel " << kernels[ik]->getName() << " ********/\n"; + kernels[ik]->printCounters(str); + } + str.flush(); +} + + + void Executor::runSuite() { RunParams::InputOpt in_state = run_params.getInputState(); @@ -904,6 +1111,11 @@ void Executor::outputRunData() writeKernelInfoSummary(*file, to_file); } + file = openOutputFile(out_fprefix + "-counters.txt"); + if ( *file ) { + writeKernelCounterSummary(*file); + } + #if defined(RAJA_PERFSUITE_USE_CALIPER) KernelBase::setCaliperMgrFlush(); #endif diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 5d4a41d16..33cd4e35d 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -76,6 +76,7 @@ class Executor std::unique_ptr openOutputFile(const std::string& filename) const; void writeKernelInfoSummary(std::ostream& str, bool to_file) const; + void writeKernelCounterSummary(std::ostream& str) const; void writeCSVReport(std::ostream& file, CSVRepMode mode, RunParams::CombinerOpt combiner, size_t prec); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 9dc824f9c..b819cad41 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -14,6 +14,8 @@ #include "common/DataUtils.hpp" #include "common/RunParams.hpp" #include "common/GPUUtils.hpp" +#include "common/CountingData.hpp" +#include "common/CountingWrapper.hpp" #include "RAJA/util/Timer.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) @@ -37,6 +39,10 @@ #include #include #include +#include +#include +#include +#include #if defined(RAJA_PERFSUITE_USE_CALIPER) @@ -156,18 +162,88 @@ class KernelBase Index_type getDefaultProblemSize() const { return default_prob_size; } Index_type 
getActualProblemSize() const { return actual_prob_size; } Index_type getDefaultReps() const { return default_reps; } + Index_type getTargetProblemSize() const; + Index_type getRunReps() const; + Index_type getItsPerRep() const { return its_per_rep; }; Index_type getKernelsPerRep() const { return kernels_per_rep; }; - Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting + + // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } Index_type getBytesReadPerRep() const { return bytes_read_per_rep; } Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; } Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; } + Index_type getFLOPsPerRep() const { return FLOPs_per_rep; } + + + Index_type getCountedItsPerRep() const { return countingData ? (countingData->all_it_per_rep_counter) : -1; } + Index_type getCountedParItsPerRep() const { return countingData ? (countingData->par_it_per_rep_counter) : -1; } + Index_type getCountedMaxLoopNestDepth() const { return countingData ? (countingData->max_all_loop_depth) : -1; } + Index_type getCountedMaxParLoopNestDepth() const { return countingData ? (countingData->max_par_loop_depth) : -1; } + Index_type getCountedKernelsPerRep() const { return countingData ? countingData->kernel_per_rep_counter : -1; } + Index_type getCountedSyncsPerRep() const { return countingData ? countingData->par_sync_per_rep_counter : -1; } + Index_type getCountedTeamSyncsPerRep() const { return countingData ? countingData->team_sync_per_rep_counter : -1; } + Index_type getCountedNumAllocations(counting::AllocationGroup g) const { return countingData ? 
countingData->memory_allocations[Size_type(g)] : -1; } + Index_type getCountedAllocatedBytes(counting::AllocationGroup g) const { return countingData ? countingData->memory_bytes[Size_type(g)] : -1; } + + Index_type getCountedTotalBytes(counting::AllocationGroup g) const + { + Index_type count = -1; + if (countingData) { + count = 0; + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + count += countingData->memory_total_bytes[Size_type(g)].accessed[a]; + if (counting::MemoryAccess(a) == counting::MemoryAccess::atomicModifyWrite) { + count += countingData->memory_total_bytes[Size_type(g)].accessed[a]; // count twice, as both a read and a write + } + } + }; + return count; + } + Index_type getCountedTotalBytesPerAccess(counting::AllocationGroup g, counting::MemoryAccess ma) const + { return countingData ? countingData->memory_total_bytes[Size_type(g)].accessed[Size_type(ma)] : -1; } + + // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getCountedBytesTouched(counting::CountingPoint p, counting::AllocationGroup g) const + { return countingData ? countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].touched : -1; } + Index_type getCountedBytes(counting::CountingPoint p, counting::AllocationGroup g) const + { + Index_type count = -1; + if (countingData) { + count = 0; + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + count += countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[a]; + if (counting::MemoryAccess(a) == counting::MemoryAccess::atomicModifyWrite) { + count += countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[a]; // count twice, as both a read and a write + } + } + }; + return count; + } + Index_type getCountedBytesPerAccess(counting::CountingPoint p, counting::AllocationGroup g, counting::MemoryAccess ma) const + { return countingData ? 
countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[Size_type(ma)] : -1; } + + Index_type getCountedOpsPerRep(counting::OpType ot, counting::Operation op) const { return countingData ? countingData->operation_counters[Size_type(ot)][Size_type(op)] : -1; } + + Index_type getCountedArithmeticOpsPerRep(counting::OpType ot) const + { + Index_type count = -1; + if (countingData) { + // count a subset of operations including things like add, sub, mult, div, abs, sqrt, but not assign, eq, ne, lt, le, gt, or ge + count = 0; + for (Size_type op = Size_type(counting::Operation::FLOP_begin); + op < Size_type(counting::Operation::FLOP_end); ++op) { + count += countingData->operation_counters[Size_type(ot)][op]; + } + } + return count; + } + + double getBlockSize() const { return kernel_block_size; } - Complexity getComplexity() const { return complexity; }; - Index_type getTargetProblemSize() const; - Index_type getRunReps() const; + Complexity getComplexity() const { return complexity; }; bool usesFeature(FeatureID fid) const { return uses_feature[fid]; }; @@ -320,151 +396,244 @@ class KernelBase DataSpace getReductionDataSpace(VariantID vid) const; DataSpace getMPIDataSpace(VariantID vid) const; - template - void allocData(DataSpace dataSpace, T& ptr, Size_type len) + + + virtual void setCountedAttributes() {}; // + + + counting::ScopedContext initializeCounters( + std::initializer_list wrapper_formats, + std::source_location location = std::source_location::current()) { - rajaperf::allocData(dataSpace, - ptr, len, getDataAlignment()); + countingData = std::make_unique(); + countingData->set_formats(wrapper_formats); + enable_data_registration = true; + return countingData->create_context("", location); } - template - void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len) + void finalizeCounters(counting::ScopedContext& context, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitData(dataSpace, - ptr, 
len, getDataAlignment()); + context.release(); + enable_data_registration = false; + countingData->finalize_context(location); } - template - void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, V val) + void printCounters(std::ostream& str) const { - rajaperf::allocAndInitDataConst(dataSpace, - ptr, len, getDataAlignment(), val); + if (countingData) { + countingData->print(str); + } } - template - void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len) + + void registerData(counting::pointer auto& ptr, + counting::integral auto const& len, + counting::raw_pointer auto ptr_ptr, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandSign(dataSpace, - ptr, len, getDataAlignment()); + using pointed_to_type = counting::pointed_to_type_t; + if (!enable_data_registration) return; + countingData->add_allocation( + counting::get_type_name(), + static_cast(counting::get_value(ptr)), + counting::get_value(len), sizeof(pointed_to_type), + static_cast(ptr_ptr), location); } - template - void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len) + void deRegisterData(counting::pointer auto& ptr, + counting::raw_pointer auto ptr_ptr, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandValue(dataSpace, - ptr, len, getDataAlignment()); + if (!enable_data_registration) return; + countingData->remove_allocation( + static_cast(counting::get_value(ptr)), + static_cast(ptr_ptr), location); } - template - rajaperf::AutoDataMover scopedMoveData(DataSpace dataSpace, T*& ptr, Size_type len) + void allocData(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocData(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, 
len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitData(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitData(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataConst(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, auto const& val, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataConst(dataSpace, ptr, len, getDataAlignment(), counting::get_value(val)); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataRandSign(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataRandSign(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataRandValue(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataRandValue(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + auto scopedMoveDataForInit(DataSpace dataSpace, DataSpace hds, counting::raw_pointer auto& 
ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + Size_type len = counting::get_value(len_in); + Size_type align = getDataAlignment(); + KernelBase& self = *this; + return rajaperf::AutoDataMover([=, &self, &ptr](){ + rajaperf::moveData(dataSpace, hds, ptr, len, align); + self.registerData(ptr, len, &ptr, location); + }); + } + + auto allocDataForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) { DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace); - rajaperf::moveData(hds, dataSpace, ptr, len, getDataAlignment()); - return {dataSpace, hds, ptr, len, getDataAlignment()}; + Size_type len = counting::get_value(len_in); + rajaperf::allocData(hds, ptr, len, getDataAlignment()); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void copyData(DataSpace dst_dataSpace, T* dst_ptr, - DataSpace src_dataSpace, const T* src_ptr, - Size_type len) + auto allocAndInitDataForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) { - rajaperf::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, len); + DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitData(hds, ptr, len, getDataAlignment()); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void deallocData(DataSpace dataSpace, T& ptr) + auto allocAndInitDataConstForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, auto const& val, + std::source_location location = std::source_location::current()) { - rajaperf::deallocData(dataSpace, ptr); + DataSpace hds = 
rajaperf::hostCopyDataSpace(dataSpace); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataConst(hds, ptr, len, getDataAlignment(), counting::get_value(val)); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void allocData(T*& ptr, Size_type len, VariantID vid) + void copyData(DataSpace dst_dataSpace, counting::convertible_to_pointer auto const& dst, + DataSpace src_dataSpace, counting::convertible_to_pointer auto const& src, + counting::integral auto const& len) { - rajaperf::allocData(getDataSpace(vid), - ptr, len, getDataAlignment()); + rajaperf::copyData(dst_dataSpace, counting::get_value(dst), + src_dataSpace, counting::get_value(src), len); } - template - void allocAndCopyHostData(T*& dst_ptr, - const T* src_ptr, - Size_type len, - VariantID vid) + void deallocData(DataSpace dataSpace, counting::pointer auto& ptr_in, + std::source_location location = std::source_location::current()) { - rajaperf::allocData(getDataSpace(vid), - dst_ptr, len, getDataAlignment()); + auto ptr = counting::get_value(ptr_in); + deRegisterData(ptr, &counting::get_value(ptr_in), location); + rajaperf::deallocData(dataSpace, ptr); + ptr_in = nullptr; + } - rajaperf::copyData(getDataSpace(vid), - dst_ptr, DataSpace::Host, src_ptr, len); + + void allocData(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) + { + allocData(getDataSpace(vid), ptr, len, location); } - template - void allocAndInitData(T*& ptr, Size_type len, VariantID vid) + void allocAndCopyHostData(counting::pointer auto& dst_ptr, + counting::convertible_to_pointer auto const& src, + counting::integral auto const& len, + VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitData(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocData(getDataSpace(vid), dst_ptr, len, location); 
+ copyData(getDataSpace(vid), dst_ptr, DataSpace::Host, src, len); } - template - void allocAndInitDataConst(T*& ptr, Size_type len, V val, VariantID vid) + void allocAndInitData(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataConst(getDataSpace(vid), - ptr, len, getDataAlignment(), val); + allocAndInitData(getDataSpace(vid), ptr, len, location); } - template - void allocAndInitDataRandSign(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataConst(counting::pointer auto& ptr, counting::integral auto const& len, + auto const& val, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandSign(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocAndInitDataConst(getDataSpace(vid), ptr, len, val, location); } - template - void allocAndInitDataRandValue(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataRandSign(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandValue(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocAndInitDataRandSign(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocDataForInit(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataRandValue(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocData(hds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + allocAndInitDataRandValue(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocAndInitDataForInit(T*& ptr, Size_type len, VariantID vid) + auto 
allocDataForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocAndInitData(hds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocDataForInit(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocAndInitDataConstForInit(T*& ptr, Size_type len, T val, VariantID vid) + auto allocAndInitDataForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocAndInitDataConst(hds, ptr, len, getDataAlignment(), val); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocAndInitDataForInit(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover scopedMoveData(T*& ptr, Size_type len, VariantID vid) + auto allocAndInitDataConstForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, + auto const& val, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::moveData(hds, ds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocAndInitDataConstForInit(getDataSpace(vid), ptr, len, val, location); } - template - void deallocData(T*& ptr, VariantID vid) + void deallocData(counting::pointer auto& ptr, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::deallocData(getDataSpace(vid), ptr); + deallocData(getDataSpace(vid), ptr, location); } template @@ -639,6 +808,10 @@ class KernelBase Index_type bytes_written_per_rep; Index_type 
bytes_atomic_modify_written_per_rep; Index_type FLOPs_per_rep; + + bool enable_data_registration = false; + std::unique_ptr countingData; + double kernel_block_size = nan(""); // Set default value for non GPU kernels VariantID running_variant; diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index df059e276..339e3aca6 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -142,6 +142,30 @@ using Real_type = float; #endif +template < size_t N > +using Real_array = Real_type[N]; + +template < size_t N0, size_t N1 > +using Real_array2 = Real_type[N0][N1]; + +template < size_t N0, size_t N1, size_t N2 > +using Real_array3 = Real_type[N0][N1][N2]; + +template < size_t N0, size_t N1, size_t N2, size_t N3 > +using Real_array4 = Real_type[N0][N1][N2][N3]; + +template < size_t N > +using Real_array_ref = Real_type(&)[N]; + +template < size_t N0, size_t N1 > +using Real_array2_ref = Real_type(&)[N0][N1]; + +template < size_t N0, size_t N1, size_t N2 > +using Real_array3_ref = Real_type(&)[N0][N1][N2]; + +template < size_t N0, size_t N1, size_t N2, size_t N3 > +using Real_array4_ref = Real_type(&)[N0][N1][N2][N3]; + using Real_ptr = Real_type*; using Real_const_ptr = Real_type const *; /// diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index d12ee75ae..ffbb2881e 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -85,5 +85,47 @@ void DIFF_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_cx, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DIFF_PREDICT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const 
Index_type iend = getActualProblemSize(); + + DIFF_PREDICT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFF_PREDICT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 60abcb31b..980eef80e 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -87,6 +87,7 @@ class DIFF_PREDICT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 5c4340142..fddeed891 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -98,5 +98,47 @@ void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_u, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void EOS::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + EOS_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(EOS_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end 
namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index b0bb91050..b4b433588 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -56,6 +56,7 @@ class EOS : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 650a3ecff..192150c92 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -84,5 +84,47 @@ void FIRST_DIFF::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_DIFF::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIRST_DIFF_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_DIFF_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index d85348be5..4292a5bd3 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -46,6 +46,7 @@ class FIRST_DIFF : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void 
tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 4dcdfdaba..ed0668b45 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -95,5 +95,57 @@ void FIRST_MIN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_MIN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIRST_MIN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + FIRST_MIN_MINLOC_INIT; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_MIN_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_minloc = mymin.loc; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index fa4804859..684387483 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -73,6 +73,7 @@ class FIRST_MIN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t 
tune_idx); diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index c8f920406..90793110e 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -84,5 +84,47 @@ void FIRST_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_SUM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize(); + + FIRST_SUM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_SUM_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index e47362a69..3bd92301f 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -49,6 +49,7 @@ class FIRST_SUM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index aeec387d5..89dd53da0 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -97,5 +97,49 @@ void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_sb, vid); } + +// Only define 
setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void GEN_LIN_RECUR::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + GEN_LIN_RECUR_DATA_SETUP; + const Index_type iend = N+1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 0; k < N; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(GEN_LIN_RECUR_OPT_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(GEN_LIN_RECUR_OPT_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 6996a59b6..056d56dd3 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ b/src/lcals/GEN_LIN_RECUR.hpp @@ -50,6 +50,20 @@ stb5[k] = b5[k+kb5i] - stb5[k]; +#define GEN_LIN_RECUR_OPT_BODY1 \ + Real_type tmp; \ + Real_type stb = stb5[k]; \ + b5[k+kb5i] = tmp = sa[k] + stb*sb[k]; \ + stb5[k] = tmp - stb; + +#define GEN_LIN_RECUR_OPT_BODY2 \ + Index_type k = N - i ; \ + Real_type tmp; \ + Real_type stb = stb5[k]; \ + b5[k+kb5i] = tmp = sa[k] + stb*sb[k]; \ + stb5[k] = tmp - stb; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -70,6 +84,7 @@ class GEN_LIN_RECUR : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index fb05917a5..efc8e73a2 100644 
--- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -95,5 +95,47 @@ void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HYDRO_1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HYDRO_1D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_1D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index 28fcb3f93..57d56823f 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -51,6 +51,7 @@ class HYDRO_1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 307470a16..f615007ca 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -129,5 +129,63 @@ void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_zz, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include 
"common/CountingMacros.hpp" + +void HYDRO_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type kbeg = 1; + const Index_type kend = m_kn - 1; + const Index_type jbeg = 1; + const Index_type jend = m_jn - 1; + + HYDRO_2D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY2); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY3); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index 098c46cad..6631fe637 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -145,6 +145,7 @@ class HYDRO_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index feb9dda68..c2a2030d2 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -111,5 +111,47 @@ void INT_PREDICT::tearDown(VariantID vid, size_t 
RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_px, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INT_PREDICT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INT_PREDICT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INT_PREDICT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 28d7ebe9e..639f674dc 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -66,6 +66,7 @@ class INT_PREDICT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index b33b7ea8b..95543f3d9 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -88,5 +88,47 @@ void PLANCKIAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_w, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PLANCKIAN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; 
+ size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PLANCKIAN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PLANCKIAN_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 08412db4d..dc6bb3ee6 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -31,6 +31,12 @@ w[i] = x[i] / ( exp( y[i] ) - 1.0 ); +#define PLANCKIAN_OPT_BODY \ + Real_type tmp; \ + y[i] = tmp = u[i] / v[i]; \ + w[i] = x[i] / ( exp( tmp ) - 1.0 ); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -51,6 +57,7 @@ class PLANCKIAN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index d1210efaa..cfd9fe14a 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -88,5 +88,47 @@ void TRIDIAG_ELIM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void TRIDIAG_ELIM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = m_N; + + TRIDIAG_ELIM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRIDIAG_ELIM_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index b940abac8..7860762d7 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -51,6 +51,7 @@ class TRIDIAG_ELIM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index e7455414d..917f8b700 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -116,5 +116,60 @@ void POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_D, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_2MM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_2MM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < 
nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; k++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY3); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY6); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 077c91716..df15b971b 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -121,6 +121,7 @@ class POLYBENCH_2MM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 4c4c8546a..0edeaf58a 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -129,5 +129,70 @@ void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_G, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_3MM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_3MM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; k++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY3); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type m = 0; m < nm; m++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY6); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY7); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY8); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY9); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 936714cfa..f2fa3830b 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -147,6 +147,7 @@ class POLYBENCH_3MM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); 
diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 562984feb..11c340f23 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -105,5 +105,62 @@ void POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_Q, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_ADI::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_ADI_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < n-1; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 1; j < n-1; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = n-2; k >= 1; --k)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY5); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < n-1; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY6); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 1; j < n-1; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY7); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY8); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = n-2; k >= 1; --k)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY9); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index ad51e7429..8e71fb18a 
100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -122,6 +122,49 @@ U[i * n + k] = P[i * n + k] * U[i * n + k +1] + Q[i * n + k]; +#define POLYBENCH_ADI_OPT_BODY2 \ + V[0 * n + i] = 1.0; \ + Real_type last_P = 0.0; \ + Real_type last_Q = 1.0; \ + P[i * n + 0] = last_P; \ + Q[i * n + 0] = last_Q; + +#define POLYBENCH_ADI_OPT_BODY3 \ + Real_type tmp_div = a * last_P + b; \ + P[i * n + j] = last_P = -c / tmp_div; \ + Q[i * n + j] = last_Q = (-d * U[j * n + i-1] + (1.0 + 2.0*d) * U[j * n + i] - \ + f * U[j * n + i + 1] - a * last_Q) / \ + tmp_div; + +#define POLYBENCH_ADI_OPT_BODY4 \ + Real_type last_V = 1.0; \ + V[(n-1) * n + i] = last_V; + +#define POLYBENCH_ADI_OPT_BODY5 \ + V[k * n + i] = last_V = P[i * n + k] * last_V + Q[i * n + k]; + +#define POLYBENCH_ADI_OPT_BODY6 \ + U[i * n + 0] = 1.0; \ + Real_type last_P = 0.0; \ + Real_type last_Q = 1.0; \ + P[i * n + 0] = last_P; \ + Q[i * n + 0] = last_Q; + +#define POLYBENCH_ADI_OPT_BODY7 \ + Real_type tmp_div = d * last_P + e; \ + P[i * n + j] = last_P = -f / tmp_div; \ + Q[i * n + j] = last_Q = (-a * V[(i-1) * n + j] + (1.0 + 2.0*a) * V[i * n + j] - \ + c * V[(i + 1) * n + j] - d * last_Q) / \ + tmp_div; + +#define POLYBENCH_ADI_OPT_BODY8 \ + Real_type last_U = 1.0; \ + U[i * n + n-1] = last_U; + +#define POLYBENCH_ADI_OPT_BODY9 \ + U[i * n + k] = last_U = P[i * n + k] * last_U + Q[i * n + k]; + + #define POLYBENCH_ADI_BODY2_RAJA \ Vview(0, i) = 1.0; \ Pview(i, 0) = 0.0; \ @@ -188,6 +231,7 @@ class POLYBENCH_ADI : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index a18fab318..493176659 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ 
b/src/polybench/POLYBENCH_ATAX.cpp @@ -102,5 +102,56 @@ void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_A, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_ATAX::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_ATAX_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index bd896bf08..a4e4c0505 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -109,6 +109,7 @@ class POLYBENCH_ATAX : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff 
--git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index b86d29622..5d1114a1b 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -124,5 +124,61 @@ void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_hz, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_FDTD_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_FDTD_2D_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY1); + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < nx; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY2); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < nx; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY3); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < nx - 1; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny - 1; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY4); + } + } + + t = (t+1) % m_tsteps; + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index 75e5c2a17..b17882930 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -106,6 
+106,7 @@ class POLYBENCH_FDTD_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index cbb4542eb..542f18974 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -92,5 +92,48 @@ void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_AR deallocData(m_pout, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_FLOYD_WARSHALL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_FLOYD_WARSHALL_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < N; ++k)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < N; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FLOYD_WARSHALL_BODY); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index a6c1e9133..08818c57c 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -70,6 +70,7 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void 
setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 34d093fb9..ad7799b67 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -104,5 +104,51 @@ void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_C, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GEMM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GEMM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY1); + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY4); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index ff4817e37..e208babe5 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -98,6 +98,7 @@ class POLYBENCH_GEMM : public KernelBase void setUp(VariantID vid, 
size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 1b542f9a3..6b3ef7baf 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -132,5 +132,66 @@ void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GEMVER::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GEMVER_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY4); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY5); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY6); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY7); + } + 
RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY8); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 6bfb141b3..743b35749 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -149,6 +149,7 @@ class POLYBENCH_GEMVER : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 7ba2fd8fc..bfb86ebd4 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -95,5 +95,48 @@ void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_B, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GESUMMV::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GESUMMV_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY3); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 
tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 816869351..9559e31d9 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -92,6 +92,7 @@ class POLYBENCH_GESUMMV : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index cec6a1383..1ef5a00fb 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -101,5 +101,58 @@ void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_Binit, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_HEAT_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_HEAT_3D_DATA_SETUP; + + const Index_type ijkend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijkend; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijkend; ++j )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 1; k < ijkend; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_HEAT_3D_BODY1); + } + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijkend; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijkend; ++j )) { + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 1; k < ijkend; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_HEAT_3D_BODY2); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 7a8c6c635..1c14b5f57 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -110,6 +110,7 @@ class POLYBENCH_HEAT_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index ebef81138..5530acab6 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -98,5 +98,50 @@ void POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun deallocData(m_Binit, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_JACOBI_1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_JACOBI_1D_DATA_SETUP; + + const Index_type iend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_1D_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i)) { + 
RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_1D_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index fe113ffcc..6206c016d 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -58,6 +58,7 @@ class POLYBENCH_JACOBI_1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index e7da05444..60e821046 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -100,5 +100,54 @@ void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun deallocData(m_Binit, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_JACOBI_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_JACOBI_2D_DATA_SETUP; + + const Index_type ijend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijend; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_2D_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijend; ++i)) { + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijend; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_2D_BODY2); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index d6083cbac..5e5866bea 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -77,6 +77,7 @@ class POLYBENCH_JACOBI_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 3e4a55fec..46ce45712 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -104,5 +104,56 @@ void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_A, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_MVT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_MVT_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY3); + } + + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 514870844..83a4612f2 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -113,6 +113,7 @@ class POLYBENCH_MVT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 9078659f1..b40b397bd 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -86,5 +86,47 @@ void ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ADD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ADD_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ADD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( 
+ tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 761ce64d7..99a6d94a1 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -46,6 +46,7 @@ class ADD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index cfee91918..6364af0d4 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -84,5 +84,47 @@ void COPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void COPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + COPY_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(COPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index e24757da9..6bf533720 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -45,6 +45,7 @@ class COPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t 
tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 4af896a88..287fda7c3 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -87,5 +87,57 @@ void DOT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_b, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DOT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DOT_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type dot = m_dot_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DOT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_dot += dot; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 1229d861e..b4efdc3d6 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -45,6 +45,7 @@ class DOT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp 
index 2c5e2a594..1de04eb01 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -85,5 +85,47 @@ void MUL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MUL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MUL_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MUL_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 1e04b2c53..9bad4e557 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -46,6 +46,7 @@ class MUL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 395dce001..9814345ec 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -91,5 +91,47 @@ void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void 
TRIAD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + TRIAD_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRIAD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index f901314a4..4ed11ce9a 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -47,6 +47,7 @@ class TRIAD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx);