From 5bcf20a56e8b3db5015b77890891f068472682a2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 4 Nov 2025 11:28:20 -0800 Subject: [PATCH 1/3] Use c++20 --- .github/workflows/build.yml | 2 +- CMakeLists.txt | 19 ++++++++++--------- Dockerfile | 2 +- azure-pipelines.yml | 2 +- scripts/alcf-builds/sycl.sh | 2 +- scripts/lc-builds/corona_sycl.sh | 2 +- scripts/lc-builds/toss4_amdclang.sh | 2 +- scripts/lc-builds/toss4_amdclang_asan.sh | 2 +- scripts/lc-builds/toss4_cce_hip.sh | 2 +- scripts/lc-builds/toss4_clang-mpi_caliper.sh | 2 +- scripts/lc-builds/toss4_clang.sh | 2 +- scripts/lc-builds/toss4_clang_caliper.sh | 2 +- .../lc-builds/toss4_cray-mpich_amdclang.sh | 2 +- .../toss4_cray-mpich_amdclang_asan.sh | 2 +- .../toss4_cray-mpich_amdclang_caliper.sh | 2 +- scripts/lc-builds/toss4_gcc-mpi_caliper.sh | 2 +- scripts/lc-builds/toss4_gcc.sh | 2 +- scripts/lc-builds/toss4_gcc_caliper.sh | 2 +- scripts/lc-builds/toss4_hipcc.sh | 2 +- scripts/lc-builds/toss4_icpc-classic.sh | 2 +- scripts/lc-builds/toss4_icpc.sh | 2 +- scripts/lc-builds/toss4_icpx.sh | 2 +- scripts/lc-builds/toss4_mvapich2_icpx.sh | 2 +- scripts/ubuntu-builds/ubuntu_amdclang.sh | 2 +- 24 files changed, 33 insertions(+), 32 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 274497683..932fc15bd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -64,7 +64,7 @@ jobs: build-dir: build options: ENABLE_WARNINGS_AS_ERRORS=Off - BLT_CXX_STD=c++17 + BLT_CXX_STD=c++20 CMAKE_BUILD_TYPE=Release PERFSUITE_RUN_SHORT_TEST=On ${{ matrix.shared.args }} diff --git a/CMakeLists.txt b/CMakeLists.txt index aa09cfc50..0cdbf877b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,17 +20,17 @@ set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) include(CheckCXXCompilerFlag) if(NOT DEFINED BLT_CXX_STD) - if("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") + if("cxx_std_23" IN_LIST 
CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++23 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") - elseif("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) - set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") + elseif("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") elseif("${CMAKE_CXX_COMPILER_ID}" IN_LIST COMPILERS_KNOWN_TO_CMAKE33) - set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") else() #cmake has no idea what to do, do it ourselves... - set(flag_var "c++17") + set(flag_var "c++20") CHECK_CXX_COMPILER_FLAG("-std=${flag_var}" COMPILER_SUPPORTS_${flag_var}) if(COMPILER_SUPPORTS_${flag_var}) set(BLT_CXX_STD ${flag_var} CACHE STRING "Version of C++ standard") @@ -41,8 +41,9 @@ if(NOT DEFINED BLT_CXX_STD) else() #check BLT_CXX_STD is high enough by disallowing the only invalid option if(("${BLT_CXX_STD}" STREQUAL "c++98") OR ("${BLT_CXX_STD}" STREQUAL "c++11") OR - ("${BLT_CXX_STD}" STREQUAL "c++14")) - message(FATAL_ERROR "RAJA requires minimum C++ standard of c++17") + ("${BLT_CXX_STD}" STREQUAL "c++14") OR + ("${BLT_CXX_STD}" STREQUAL "c++17")) + message(FATAL_ERROR "RAJA requires minimum C++ standard of c++20") endif() endif(NOT DEFINED BLT_CXX_STD) @@ -247,7 +248,7 @@ set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE}) set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME}) if (ENABLE_CUDA) - set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD 20) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict --extended-lambda --expt-relaxed-constexpr") set(RAJAPERF_COMPILER "${CUDA_NVCC_EXECUTABLE}") diff --git a/Dockerfile b/Dockerfile index 86353abf0..e88a2a221 100644 --- a/Dockerfile +++ b/Dockerfile @@ -137,5 +137,5 @@ ENV GTEST_COLOR=1 COPY . 
/home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/intel/oneapi/setvars.sh 2>&1 > /dev/null && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=Off -DRAJA_ENABLE_SYCL=On -DBLT_CXX_STD=c++17 -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=Off -DRAJA_ENABLE_SYCL=On -DBLT_CXX_STD=c++20 -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \ make -j 16" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0f9a16593..35b102ec4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -9,7 +9,7 @@ jobs: # pool: # vmImage: 'windows-2019' # variables: -# CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' +# CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=20' # steps: # - checkout: self # clean: boolean diff --git a/scripts/alcf-builds/sycl.sh b/scripts/alcf-builds/sycl.sh index 2482444ea..63e4c8813 100755 --- a/scripts/alcf-builds/sycl.sh +++ b/scripts/alcf-builds/sycl.sh @@ -24,7 +24,7 @@ cmake \ -DENABLE_TARGET_OPENMP=Off \ -DENABLE_ALL_WARNINGS=Off \ -DENABLE_SYCL=On \ - -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_CXX_STANDARD=20 \ -DCMAKE_LINKER=icpx \ "$@" \ .. 
diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh index 7470ffb5c..b00f25b2e 100755 --- a/scripts/lc-builds/corona_sycl.sh +++ b/scripts/lc-builds/corona_sycl.sh @@ -52,7 +52,7 @@ cmake \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_LINKER=clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -DENABLE_TESTS=On \ -DENABLE_EXAMPLES=On \ "$@" \ diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index 73caddb1c..7d6dc6d7f 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -79,7 +79,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh index 409125ab7..038648a8c 100755 --- a/scripts/lc-builds/toss4_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_amdclang_asan.sh @@ -86,7 +86,7 @@ cmake \ -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=undefined -shared-libsan" \ -DCMAKE_HIP_FLAGS="-fsanitize=address -fsanitize=undefined -shared-libsan -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ -DCMAKE_EXE_LINKER_FLAGS="-L/opt/rocm-${COMP_HIP_VER}/lib/asan/ -L/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan -Wl,-rpath,/opt/rocm-${COMP_HIP_VER}/lib/asan/:/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan:/opt/rocm-${COMP_HIP_VER}/lib/llvm/lib/clang/${COMP_CLANG_MAJOR_VER}/lib/linux -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh index e3f746723..c506be0c5 100755 --- a/scripts/lc-builds/toss4_cce_hip.sh +++ b/scripts/lc-builds/toss4_cce_hip.sh @@ -53,7 +53,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES=${HIP_ARCH} \ 
-DGPU_TARGETS=${HIP_ARCH} \ -DAMDGPU_TARGETS=${HIP_ARCH} \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_clang-mpi_caliper.sh b/scripts/lc-builds/toss4_clang-mpi_caliper.sh index cc6522a37..5f840fa04 100755 --- a/scripts/lc-builds/toss4_clang-mpi_caliper.sh +++ b/scripts/lc-builds/toss4_clang-mpi_caliper.sh @@ -41,7 +41,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_clang.sh b/scripts/lc-builds/toss4_clang.sh index d97228fdb..a21c01980 100755 --- a/scripts/lc-builds/toss4_clang.sh +++ b/scripts/lc-builds/toss4_clang.sh @@ -34,7 +34,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_clang_caliper.sh b/scripts/lc-builds/toss4_clang_caliper.sh index 47773adfd..73ae770b9 100755 --- a/scripts/lc-builds/toss4_clang_caliper.sh +++ b/scripts/lc-builds/toss4_clang_caliper.sh @@ -41,7 +41,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh index 197afac9b..5909722fe 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang.sh @@ -104,7 +104,7 @@ cmake \ 
-DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh index a2442b4a2..f27a02f5a 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang_asan.sh @@ -111,7 +111,7 @@ cmake \ -DCMAKE_CXX_FLAGS="-fsanitize=address -shared-libsan" \ -DCMAKE_HIP_FLAGS="-fsanitize=address -shared-libsan -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ -DCMAKE_EXE_LINKER_FLAGS="-L/opt/rocm-${COMP_HIP_VER}/lib/asan/ -L/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan -Wl,-rpath,/opt/rocm-${COMP_HIP_VER}/lib/asan/:/opt/rocm-${COMP_HIP_VER}/llvm/lib/asan:/opt/rocm-${COMP_HIP_VER}/lib/llvm/lib/clang/${COMP_CLANG_MAJOR_VER}/lib/linux -fgpu-rdc --hip-version=${COMP_HIP_VER}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh b/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh index 95fa3bb64..15b104dfc 100755 --- a/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh +++ b/scripts/lc-builds/toss4_cray-mpich_amdclang_caliper.sh @@ -108,7 +108,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_HIP=ON \ diff --git a/scripts/lc-builds/toss4_gcc-mpi_caliper.sh b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh index bf1f077ff..2b2ba87b8 100755 --- a/scripts/lc-builds/toss4_gcc-mpi_caliper.sh +++ b/scripts/lc-builds/toss4_gcc-mpi_caliper.sh @@ -42,7 +42,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/gcc \ 
-DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh index d20f8fe6d..568e425ce 100755 --- a/scripts/lc-builds/toss4_gcc.sh +++ b/scripts/lc-builds/toss4_gcc.sh @@ -34,7 +34,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_gcc_caliper.sh b/scripts/lc-builds/toss4_gcc_caliper.sh index 31c33f325..0484031d8 100755 --- a/scripts/lc-builds/toss4_gcc_caliper.sh +++ b/scripts/lc-builds/toss4_gcc_caliper.sh @@ -41,7 +41,7 @@ module load cmake/3.23.1 cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_OPENMP=On \ -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \ diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index 19b427627..327f6c08e 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -69,7 +69,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh index a0a90ce98..fcbd276e1 100755 --- a/scripts/lc-builds/toss4_icpc-classic.sh +++ b/scripts/lc-builds/toss4_icpc-classic.sh @@ -40,7 +40,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \ 
-DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh index 07628ff2c..c5572e28e 100755 --- a/scripts/lc-builds/toss4_icpc.sh +++ b/scripts/lc-builds/toss4_icpc.sh @@ -40,7 +40,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icpc \ -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh index cd78adaa4..8b3c69fa9 100755 --- a/scripts/lc-builds/toss4_icpx.sh +++ b/scripts/lc-builds/toss4_icpx.sh @@ -42,7 +42,7 @@ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=icpx \ -DCMAKE_C_COMPILER=icx \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ -DENABLE_OPENMP=On \ diff --git a/scripts/lc-builds/toss4_mvapich2_icpx.sh b/scripts/lc-builds/toss4_mvapich2_icpx.sh index 3505c502f..4dd9ec443 100755 --- a/scripts/lc-builds/toss4_mvapich2_icpx.sh +++ b/scripts/lc-builds/toss4_mvapich2_icpx.sh @@ -50,7 +50,7 @@ cmake \ -DMPI_CXX_COMPILER="/usr/tce/packages/mvapich2/mvapich2-${MPI_VER}-intel-${COMP_VER}/bin/mpicxx" \ -DCMAKE_CXX_COMPILER=icpx \ -DCMAKE_C_COMPILER=icx \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_MPI=ON \ -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \ diff --git a/scripts/ubuntu-builds/ubuntu_amdclang.sh b/scripts/ubuntu-builds/ubuntu_amdclang.sh index c82a45a39..4da4785b9 100755 --- a/scripts/ubuntu-builds/ubuntu_amdclang.sh +++ b/scripts/ubuntu-builds/ubuntu_amdclang.sh @@ -60,7 +60,7 @@ cmake \ 
-DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ -DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++17 \ + -DBLT_CXX_STD=c++20 \ -C ${RAJA_HOSTCONFIG} \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ From 7569583c58e80bc5c5d8c09da061dd5e71c2839c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Sep 2025 17:42:33 -0700 Subject: [PATCH 2/3] Add automatic counting to get new attributes This adds a check that our manual counts are correct and adds the potential for more accurate counts in the future. Note that accurate counts after compiler optimization are difficult to obtain, but some manual optimization can avoid double counting in many of the cases that the compiler can optimize. The accuracy of some counts also depends on how you define the counts. This leads to differences between the exact counts of bytes and iterations for some kernels. Normally the LoopBytes*/Rep counters should be the same as the estimated bytes. Similarly, the ParallelIterations/Rep should match the estimate. The fp64Ops/rep should be the same as the estimated flops. 
--- src/common/CountingData.hpp | 1294 ++++++++++++++++++++++++++++++++ src/common/CountingMacros.hpp | 142 ++++ src/common/CountingWrapper.hpp | 1017 +++++++++++++++++++++++++ src/common/DataUtils.hpp | 47 +- src/common/Executor.cpp | 424 ++++++++--- src/common/Executor.hpp | 1 + src/common/KernelBase.hpp | 357 ++++++--- src/common/RPTypes.hpp | 24 + 8 files changed, 3068 insertions(+), 238 deletions(-) create mode 100644 src/common/CountingData.hpp create mode 100644 src/common/CountingMacros.hpp create mode 100644 src/common/CountingWrapper.hpp diff --git a/src/common/CountingData.hpp b/src/common/CountingData.hpp new file mode 100644 index 000000000..27fff3f7b --- /dev/null +++ b/src/common/CountingData.hpp @@ -0,0 +1,1294 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingData_HPP +#define RAJAPerf_CountingData_HPP + +#include "common/RAJAPerfSuite.hpp" +#include "common/RPTypes.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace rajaperf +{ + +namespace counting +{ + +enum struct OpType : int +{ + fp64, + int32, + int64, + ptr, + other, + NumOpTypes // must be at the end of the valid values +}; + +template < typename T > +constexpr OpType getOpType() +{ + using decayed_T = std::decay_t; + if constexpr (std::is_floating_point_v && sizeof(decayed_T) == sizeof(double)) { + return OpType::fp64; + } else if constexpr (std::is_integral_v && sizeof(decayed_T) == sizeof(std::int32_t)) { + return OpType::int32; + } else if constexpr (std::is_integral_v && sizeof(decayed_T) == 
sizeof(std::int64_t)) { + return OpType::int64; + } else if constexpr (std::is_pointer_v) { + return OpType::ptr; + } else { + return OpType::other; + } +} + +constexpr const char* getOpTypeName(OpType ot) +{ + switch (ot) { + case OpType::int32: return "int32"; + case OpType::int64: return "int64"; + case OpType::ptr: return "ptr"; + case OpType::fp64: return "fp64"; + case OpType::other: return "other"; + default: throw std::invalid_argument("ot is not in OpType"); + } +} + +template < typename T > +const char* get_type_name() +{ + OpType ot = getOpType(); + if (ot != OpType::other) { + return getOpTypeName(ot); + } else { + return typeid(T).name(); + } +} + +enum struct Operation : int +{ + copy, + assign, + load, + store, + uplus, + uminus, + abs, + add, + sub, + mult, + div, + rem, + preinc, + predec, + postinc, + postdec, + atomic_add, + sqrt, + exp, + bit_not, + bit_and, + bit_or, + bit_xor, + bit_lsh, + bit_rsh, + eq, + ne, + lt, + le, + gt, + ge, + NumOperations, // must be at the end of the valid values + FLOP_begin = add, // used when counting what counts as a flop + FLOP_end = eq // used when counting what counts as a flop +}; + +constexpr const char* getOperationName(Operation op) +{ + switch (op) { + case Operation::copy: return "copy"; + case Operation::assign: return "assign"; + case Operation::load: return "load"; + case Operation::store: return "store"; + case Operation::uplus: return "uplus"; + case Operation::uminus: return "uminus"; + case Operation::abs: return "abs"; + case Operation::add: return "add"; + case Operation::sub: return "sub"; + case Operation::mult: return "mult"; + case Operation::div: return "div"; + case Operation::rem: return "rem"; + case Operation::preinc: return "preinc"; + case Operation::predec: return "predec"; + case Operation::postinc: return "postinc"; + case Operation::postdec: return "postdec"; + case Operation::atomic_add: return "atomic_add"; + case Operation::sqrt: return "sqrt"; + case Operation::exp: return 
"exp"; + case Operation::bit_not: return "bit_not"; + case Operation::bit_and: return "bit_and"; + case Operation::bit_or: return "bit_or"; + case Operation::bit_xor: return "bit_xor"; + case Operation::bit_lsh: return "bit_lsh"; + case Operation::bit_rsh: return "bit_rsh"; + case Operation::eq: return "eq"; + case Operation::ne: return "ne"; + case Operation::lt: return "lt"; + case Operation::le: return "le"; + case Operation::gt: return "gt"; + case Operation::ge: return "ge"; + default: throw std::invalid_argument("op is not in Operation"); + } +} + +enum struct ContextType : int +{ + exterior, + outer, + repetition, + cond, + outer_loop, + seq_loop, + par_loop, + team, + body, + par_sync, + team_sync, + NumContextTypes // must be at the end of the valid values +}; + +constexpr const char* getContextTypeName(ContextType ct) +{ + switch (ct) { + case ContextType::exterior: return "exterior"; + case ContextType::outer: return "outer"; + case ContextType::repetition: return "repetition"; + case ContextType::cond: return "cond"; + case ContextType::outer_loop: return "outer_loop"; + case ContextType::seq_loop: return "seq_loop"; + case ContextType::par_loop: return "par_loop"; + case ContextType::team: return "team"; + case ContextType::body: return "body"; + case ContextType::par_sync: return "par_sync"; + case ContextType::team_sync: return "team_sync"; + default: throw std::invalid_argument("Unknown ContextType"); + } +} + +enum struct MemoryAccess : int +{ + read, + write, + atomicModifyWrite, + NumMemoryAccesses // must be at the end of the valid values +}; + +constexpr const char* getMemoryAccessName(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "read"; + case MemoryAccess::write: return "write"; + case MemoryAccess::atomicModifyWrite: return "atomicModifyWrite"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +constexpr const char* getMemoryAccessNamePastTense(MemoryAccess ma) +{ + switch (ma) { + case 
MemoryAccess::read: return "read"; + case MemoryAccess::write: return "written"; + case MemoryAccess::atomicModifyWrite: return "atomicModifyWritten"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +constexpr const char* getMemoryAccessNamePastTenseTitle(MemoryAccess ma) +{ + switch (ma) { + case MemoryAccess::read: return "Read"; + case MemoryAccess::write: return "Written"; + case MemoryAccess::atomicModifyWrite: return "AtomicModifyWritten"; + default: throw std::invalid_argument("Unknown MemoryAccess"); + } +} + +enum struct AllocationGroup : int +{ + global, + team, + NumAllocationGroups // must be at the end of the valid values +}; + +constexpr const char* getAllocationGroupName(AllocationGroup ma) +{ + switch (ma) { + case AllocationGroup::global: return "global"; + case AllocationGroup::team: return "team"; + default: throw std::invalid_argument("Unknown AllocationGroup"); + } +} + +// Must be in order innermost to outermost, so loop must be before rep, etc. 
+enum struct CountingPoint : int +{ + team, + loop, + rep, + NumCountingPoints // must be at the end of the valid values +}; + +constexpr const char* getCountingPointName(CountingPoint ma) +{ + switch (ma) { + case CountingPoint::team: return "team"; + case CountingPoint::loop: return "loop"; + case CountingPoint::rep: return "rep"; + default: throw std::invalid_argument("Unknown CountingPoint"); + } +} + + +constexpr std::string get_spacing(Size_type depth) +{ + return std::string(depth*2, ' '); +} + +struct MemoryCounts +{ + Size_type touched = 0; + Size_type accessed[Size_type(MemoryAccess::NumMemoryAccesses)] = {0}; + + void add(MemoryCounts const& other_counts, Size_type multiplier = 1) + { + touched += other_counts.touched * multiplier; + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + accessed[a] += other_counts.accessed[a] * multiplier; + } + } +}; + +struct AddressTouches +{ + std::vector address_accessed[Size_type(MemoryAccess::NumMemoryAccesses)]; + + AddressTouches() = default; + + explicit AddressTouches(Size_type size, bool value = false) + { + resize(size, value); + } + + void resize(Size_type size, bool value = false) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + address_accessed[a].resize(size, value); + } + } + + Size_type size() const + { + return address_accessed[0].size(); + } + + void set_all(Size_type size, bool value) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + for (Size_type i = 0; i < size; ++i) { + address_accessed[a][i] = value; + } + } + } + + void count(Size_type size, + MemoryCounts& address_counts) const + { + for (Size_type i = 0; i < size; ++i) { + bool addr_touched = false; + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + bool addr_accessed = address_accessed[a][i]; + addr_touched = addr_touched || addr_accessed; + address_counts.accessed[a] += addr_accessed ? 
1 : 0; + } + address_counts.touched += addr_touched ? 1 : 0; + } + } + + void combine(Size_type size, + AddressTouches const& other_touches) + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + for (Size_type i = 0; i < size; ++i) { + address_accessed[a][i] = other_touches.address_accessed[a][i] || address_accessed[a][i]; + } + } + } + + void clear() + { + for (Size_type a = 0; a < Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + address_accessed[a].clear(); + address_accessed[a].shrink_to_fit(); + } + } +}; + +struct TouchCounts +{ + Size_type m_size = 0; + MemoryCounts total_counts; + MemoryCounts address_counts[Size_type(CountingPoint::NumCountingPoints)]; + AddressTouches address_touches[Size_type(CountingPoint::NumCountingPoints)]; + + TouchCounts() = default; + + TouchCounts(CountingPoint point, Size_type size) + { + resize(point, size); + } + + void resize(CountingPoint point, Size_type size) + { + for (Size_type p = Size_type(point); + p < Size_type(CountingPoint::NumCountingPoints); ++p) { + address_touches[p].resize(size); + } + m_size = size; + } + + Size_type size() const + { + return m_size; + } + + void set_all_accesses(CountingPoint point, bool value) + { + address_touches[Size_type(point)].set_all(m_size, value); + } + + void touch(CountingPoint point, MemoryAccess access, Size_type offset, + Size_type num_ops) + { + if (point < CountingPoint::NumCountingPoints) { + total_counts.touched += num_ops; + total_counts.accessed[Size_type(access)] += num_ops; + address_touches[Size_type(point)].address_accessed[Size_type(access)].at(offset) = true; + } + } + + void count(CountingPoint point) + { + address_touches[Size_type(point)].count(m_size, address_counts[Size_type(point)]); + } + + void combine_accesses(CountingPoint point, + TouchCounts const& other_touches, + CountingPoint other_point) + { + address_touches[Size_type(point)].combine( + m_size, other_touches.address_touches[Size_type(other_point)]); + } + + void 
clear_accesses(CountingPoint point) + { + address_touches[Size_type(point)].clear(); + } +}; + + +struct AllocationMetadata +{ + Index_type idx = std::numeric_limits::min(); + const void* ptr_ptr = nullptr; + std::source_location allocate_location; + AllocationGroup group; + + void* ptr = nullptr; + + std::string pointed_to_type_name; + Size_type element_size = 0; + Size_type size = 0; + + TouchCounts counts; + + AllocationMetadata(Index_type idx_, const void* ptr_ptr_, + std::source_location location, AllocationGroup group_, + std::string pointed_to_type_name_, void* ptr_, + Size_type size_, Size_type element_size_) + : idx(idx_) + , ptr_ptr(ptr_ptr_) + , allocate_location(location) + , group(group_) + , ptr(ptr_) + , pointed_to_type_name(std::move(pointed_to_type_name_)) + , element_size(element_size_) + , size(size_) + , counts(CountingPoint(0), size_) + { + } + + void allocate(void* ptr_) + { + ptr = ptr_; + } + + void deallocate() + { + ptr = nullptr; + } + + void print_allocation(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + str << spacing << pointed_to_type_name << "* allocation_" << idx + << " = " << getAllocationGroupName(group) << "_malloc(" + << size << " * " << element_size << ");\n"; + } + + void print_deallocation(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + str << spacing << getAllocationGroupName(group) << "_free(" + << "allocation_" << idx << ");\n"; + } +}; + +struct Context +{ + Index_type idx = -1; + Size_type hit_count = 0; + ContextType type = ContextType::NumContextTypes; + const char* text = nullptr; + CountingPoint point = CountingPoint::NumCountingPoints; + Index_type point_depth = 0; + + Context* parent = nullptr; + // children are stored in order of increasing idx + std::vector> children; + std::vector child_idcs; + + Size_type 
operation_counters[Size_type(OpType::NumOpTypes)][Size_type(Operation::NumOperations)] = {{0}}; + + std::vector aloc_counts; + + MemoryCounts aloc_total_bytes; + MemoryCounts aloc_totals_bytes[Size_type(CountingPoint::NumCountingPoints)]; + + std::vector allocation_indices; + std::vector deallocation_indices; + + static constexpr CountingPoint get_point(Context* parent, ContextType type) + { + CountingPoint point = CountingPoint::NumCountingPoints; + if (type == ContextType::repetition) { + point = CountingPoint::rep; + } else if (type == ContextType::par_loop) { + point = CountingPoint::loop; + } else if (type == ContextType::team) { + point = CountingPoint::team; + } + if (parent) { + point = std::min(parent->point, point); + } + return point; + } + + // depth of 0 indicates this does not have a valid point + // depth of 1 indicates this is the first context with this point + // depths greater than 1 are children of of a context of this point + static constexpr Index_type get_depth(Context* parent, CountingPoint point) + { + Index_type depth = 0; + if (parent) { + if (point != parent->point) { + depth = 1; + } else if (parent->point_depth > 0) { + depth = parent->point_depth + 1; + } + } + return depth; + } + + Context(Index_type idx_, Context* parent_, ContextType type_, const char* text_, + std::vector> const& allocations) + : idx(idx_) + , type(type_) + , text(text_) + , point(get_point(parent_, type_)) + , point_depth(get_depth(parent_, get_point(parent_, type_))) + , parent(parent_) + , aloc_counts(allocations.size()) + { + if (type == ContextType::par_sync) { + if (point != CountingPoint::rep) { + throw std::runtime_error("par_sync must be in a repetition context"); + } + } else if (type == ContextType::team_sync) { + if (point != CountingPoint::team) { + throw std::runtime_error("team_sync must be in a team context"); + } + } + for (Size_type i = 0; i < allocations.size(); ++i) { + auto const& item = allocations[i]; + aloc_counts[i].resize(point, 
item->size); + } + } + + void update_allocations(std::vector> const& allocations) + { + for (Size_type i = 0; i < allocations.size(); ++i) { + auto const& item = allocations[i]; + if (i < aloc_counts.size()) { + if (item->size != aloc_counts[i].size()) { + throw std::runtime_error("Allocation record changed since last update"); + } + } else { + aloc_counts.resize(i+1); + aloc_counts[i].resize(point, item->size); + } + } + + for (auto& child_ptr : children) { + child_ptr->update_allocations(allocations); + } + } + + void add_allocation(AllocationMetadata const& item) + { + auto iter = std::ranges::find(allocation_indices, item.idx); + if (iter == allocation_indices.end()) { + allocation_indices.emplace_back(item.idx); + } + } + + void remove_allocation(AllocationMetadata const& item) + { + auto iter = std::ranges::find(deallocation_indices, item.idx); + if (iter == deallocation_indices.end()) { + deallocation_indices.emplace_back(item.idx); + } + } + + template < typename... Args > + Context* get_or_emplace_child(Index_type idx, Args&&... 
args) + { + using std::distance; + auto idx_iter = std::ranges::lower_bound(child_idcs, idx, std::ranges::less{}); + Size_type offset = distance(child_idcs.begin(), idx_iter); + auto iter = children.begin() + offset; + if (idx_iter == child_idcs.end() || *idx_iter != idx) { + idx_iter = child_idcs.emplace(idx_iter, idx); + iter = children.emplace(iter, std::make_unique(idx, this, std::forward(args)...)); + } + return iter->get(); + } + + void count_totals(AllocationMetadata& item) + { + aloc_total_bytes.add(aloc_counts[item.idx].total_counts, item.element_size); + item.counts.total_counts.add(aloc_counts[item.idx].total_counts, item.element_size); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + aloc_totals_bytes[p].add(aloc_counts[item.idx].address_counts[p], item.element_size); + } + } + + void clear() + { + for (Size_type i = 0; i < aloc_counts.size(); ++i) { + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + aloc_counts[i].clear_accesses(CountingPoint(p)); + } + + } + } + + + void print_header(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + str << spacing << "Line " << idx << " hit " << hit_count << " times\n"; + } + + void print_allocations(std::ostream& str, Size_type depth, + std::vector> const& allocations) const + { + for (Index_type const& allocation_idx : allocation_indices) { + allocations[allocation_idx]->print_allocation(str, depth); + } + for (Index_type const& allocation_idx : deallocation_indices) { + allocations[allocation_idx]->print_deallocation(str, depth); + } + } + + void print_allocation_counts(std::ostream& str, Size_type depth, + std::string_view name, + MemoryCounts const& mem_counts) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + if (mem_counts.touched) { + str << spacing + << name + << " touched " + << mem_counts.touched << "\n"; + } + + for (Size_type a = 0; a < 
Size_type(MemoryAccess::NumMemoryAccesses); ++a) { + + if (mem_counts.accessed[a]) { + str << spacing + << name + << " " << getMemoryAccessNamePastTense(MemoryAccess(a)) << " " + << mem_counts.accessed[a] << "\n"; + } + + } + } + + void print_counters(std::ostream& str, Size_type depth) const + { + std::string spacing = get_spacing(depth); + spacing += "// "; + + for (Size_type ot = 0; ot < Size_type(OpType::NumOpTypes); ++ot) { + + std::string opTypeName = getOpTypeName(OpType(ot)); + + for (Size_type op = 0; op < Size_type(Operation::NumOperations); ++op) { + + std::string opName = getOperationName(Operation(op)); + + Size_type num_ops = operation_counters[ot][op]; + + if (num_ops > 0) { + str << spacing << opTypeName << " " << opName << " " << num_ops << "\n"; + } + } + } + + print_allocation_counts(str, depth, "bytes", aloc_total_bytes); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + std::string name = std::format("by {} bytes", + getCountingPointName(CountingPoint(p))); + + print_allocation_counts(str, depth, name, aloc_totals_bytes[p]); + + } + + for (Size_type i = 0; i < aloc_counts.size(); ++i) { + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + std::string name = std::format("by {} allocation_{} elements", + getCountingPointName(CountingPoint(p)), i); + + print_allocation_counts(str, depth, + name, aloc_counts[i].address_counts[p]); + + } + + } + + } + + std::string replace_values(std::string str, + std::vector const& wrapper_formats) const + { + for (const char* wrapper_format : wrapper_formats) { + std::regex re(std::vformat(wrapper_format, std::make_format_args("(.*?)"))); + str = std::regex_replace(str, re, "$1"); + } + + return str; + } + + void print_text(std::ostream& str, Size_type depth, + std::vector const& wrapper_formats) const + { + if (text == nullptr) return; + + std::string spacing = get_spacing(depth); + + std::string new_text = replace_values(text, 
wrapper_formats); + + std::string_view tv = new_text; + + if (!tv.empty()) { + + Size_type pos = 0; + while (pos < tv.size()) { + + // skip spacing between lines and extra semicolons + if (std::isspace(tv[pos]) || + tv[pos] == ';') { + ++pos; + continue; + } + + Size_type end = tv.find(';', pos); + if (end < tv.size()) { + end += 1; + } else { + end = tv.size(); + } + + str << spacing << tv.substr(pos, end-pos) << "\n"; + + pos = end; + } + } + } + + void print(std::ostream& str, Size_type depth, std::string_view tv) const + { + std::string spacing = get_spacing(depth); + + str << spacing << tv << "\n"; + } +}; + +struct CountingData; + +struct ScopedContext +{ + Context* context; + CountingData* countingData; + + ScopedContext(CountingData* countingData_, Context* context_) + : context(context_) + , countingData(countingData_) + { + } + + ScopedContext() = delete; + ScopedContext(ScopedContext const&) = delete; + ScopedContext(ScopedContext &&) = delete; + ScopedContext& operator=(ScopedContext const&) = delete; + ScopedContext& operator=(ScopedContext &&) = delete; + + ~ScopedContext() + { + pop_context(); + } + + void release() + { + countingData = nullptr; + context = nullptr; + } + + inline void pop_context(); +}; + +struct CountingData +{ + static inline Context* current_context = nullptr; + static inline CountingData* current_data = nullptr; + + Size_type par_it_per_rep_counter = 0; + Size_type all_it_per_rep_counter = 0; + + Size_type max_par_loop_depth = 0; + Size_type max_all_loop_depth = 0; + + Size_type kernel_per_rep_counter = 0; + Size_type par_sync_per_rep_counter = 0; + Size_type team_sync_per_rep_counter = 0; + + + Size_type memory_allocations[Size_type(AllocationGroup::NumAllocationGroups)] = {0}; + Size_type memory_bytes[Size_type(AllocationGroup::NumAllocationGroups)] = {0}; + + MemoryCounts memory_total_bytes[Size_type(AllocationGroup::NumAllocationGroups)]; + MemoryCounts 
memory_totals_bytes[Size_type(CountingPoint::NumCountingPoints)][Size_type(AllocationGroup::NumAllocationGroups)]; + + std::vector> allocations; + + + Size_type operation_counters[Size_type(OpType::NumOpTypes)][Size_type(Operation::NumOperations)] = {{0}}; + + + std::unique_ptr counter_context; + + + std::vector wrapper_formats; + + + void set_formats(std::initializer_list wrapper_formats) + { + for (const char* wrapper_format : wrapper_formats) { + this->wrapper_formats.emplace_back(wrapper_format); + } + } + + + AllocationMetadata* get_allocation(const void* ptr) + { + if (!ptr) { + return nullptr; + } + auto iter = std::ranges::find_if(allocations, + [&](std::unique_ptr const& item) { + if (!item->ptr) { return false; } + const char* allocation_begin = static_cast(item->ptr); + const char* allocation_end = allocation_begin + item->size*item->element_size; + return (allocation_begin <= static_cast(ptr) && + allocation_end > static_cast(ptr)); + }); + if (iter == allocations.end()) { + return nullptr; + } + return iter->get(); + } + /// + AllocationMetadata* get_allocation(const void* ptr_ptr, std::source_location location) + { + auto iter = std::ranges::find(allocations, + std::make_tuple(ptr_ptr, location.line(), location.column()), + [](std::unique_ptr const& item) { + return std::make_tuple(item->ptr_ptr, + item->allocate_location.line(), + item->allocate_location.column()); + }); + if (iter == allocations.end()) { + return nullptr; + } + return iter->get(); + } + + void add_allocation_impl(std::string pointed_to_type_name, AllocationGroup group, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + auto item = get_allocation(ptr); + if (item) { + throw std::runtime_error("Allocation with this pointer already registered"); + } + item = get_allocation(ptr_ptr, location); + if (item) { + if (pointed_to_type_name != item->pointed_to_type_name || + size != item->size || + element_size != 
item->element_size) { + throw std::runtime_error("Allocation at this location changed type, size, or element_size"); + } + item->allocate(ptr); + } else { + item = allocations.emplace_back( + std::make_unique( + allocations.size(), ptr_ptr, location, group, + std::move(pointed_to_type_name), ptr, size, element_size)).get(); + counter_context->update_allocations(allocations); + current_context->add_allocation(*item); + } + } + + void add_allocation(std::string pointed_to_type_name, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + add_allocation_impl(std::move(pointed_to_type_name), AllocationGroup::global, + ptr, size, element_size, + ptr_ptr, location); + } + + void add_team_allocation(std::string pointed_to_type_name, void* ptr, + Size_type size, Size_type element_size, + const void* ptr_ptr, std::source_location location) + { + add_allocation_impl(std::move(pointed_to_type_name), AllocationGroup::team, + ptr, size, element_size, + ptr_ptr, location); + } + + void remove_allocation(void* ptr, + [[maybe_unused]] const void* ptr_ptr, + [[maybe_unused]] std::source_location location = std::source_location::current()) + { + auto item = get_allocation(ptr); + if (!item) { + throw std::runtime_error("Allocation with this pointer not registered"); + } + item->deallocate(); + current_context->remove_allocation(*item); + } + + + ScopedContext create_context(const char* text, + std::source_location location = std::source_location::current()) + { + if (counter_context) { + throw std::runtime_error("Already created exterior context"); + } + + counter_context = std::make_unique( + location.line(), nullptr, ContextType::exterior, text, allocations); + + current_data = this; + current_context = counter_context.get(); + + current_context->hit_count += 1; + + return {this, current_context}; + } + + void push_context(ContextType type, const char* text, + std::source_location location = std::source_location::current()) 
+ { + if (!current_data) { + throw std::runtime_error("Current data not set"); + } + if (!current_context) { + throw std::runtime_error("Current context not set"); + } + current_context = current_context->get_or_emplace_child( + location.line(), type, text, allocations); + current_context->hit_count += 1; + } + + ScopedContext push_outer_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::outer, text, location); + return {this, current_context}; + } + + ScopedContext push_rep_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::repetition, text, location); + return {this, current_context}; + } + + ScopedContext push_cond_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::cond, text, location); + return {this, current_context}; + } + + ScopedContext push_outer_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::outer_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_seq_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::seq_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_par_loop_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::par_loop, text, location); + return {this, current_context}; + } + + ScopedContext push_body_context(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::body, text, location); + return {this, current_context}; + } + + ScopedContext push_team_context(const char* text, + std::source_location location = std::source_location::current()) + { + 
push_context(ContextType::team, text, location); + return {this, current_context}; + } + + void add_par_sync(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::par_sync, text, location); + pop_context(); + } + + void add_team_sync(const char* text, + std::source_location location = std::source_location::current()) + { + push_context(ContextType::team_sync, text, location); + pop_context(); + } + + void pop_context() + { + if (!current_context) { + throw std::runtime_error("No context to pop"); + } + if (current_context->point_depth == 1) { + CountingPoint src_point = current_context->point; + CountingPoint dst_point = current_context->parent + ? current_context->parent->point + : src_point; + count_touches(current_context, src_point, dst_point, 0); + } + + current_context = current_context->parent; + } + + + + void finalize_context([[maybe_unused]] std::source_location location) + { + if (!counter_context) throw std::runtime_error("Exterior context not created"); + if (!current_context) throw std::runtime_error("No current context"); + if (current_context != counter_context.get()) throw std::runtime_error("Not at outer context"); + current_context = nullptr; + current_data = nullptr; + + count_totals(counter_context.get(), 0); + + // count stats for allocations + for (auto& item : allocations) { + + Size_type g = Size_type(item->group); + + memory_allocations[g] += 1; + memory_bytes[g] += item->size * item->element_size; + + memory_total_bytes[g].add(item->counts.total_counts, item->element_size); + + for (Size_type p = 0; p < Size_type(CountingPoint::NumCountingPoints); ++p) { + + memory_totals_bytes[p][g].add(item->counts.address_counts[p], item->element_size); + + item->counts.clear_accesses(CountingPoint(p)); + } + } + + count_kernels_and_iterations(counter_context.get()); + + count_operations(counter_context.get()); + } + + void count_totals(Context* context, Size_type depth) + { + for (auto& 
child_ptr : context->children) { + count_totals(child_ptr.get(), depth+1); + } + + for (auto& item : allocations) { + context->count_totals(*item); + } + context->clear(); + } + + void count_touches(Context* context, CountingPoint src_point, + CountingPoint dst_point, Size_type depth) + { + for (auto& child_ptr : context->children) { + count_touches(child_ptr.get(), src_point, dst_point, depth+1); + } + + for (auto& item : allocations) { + + auto& src_counts = context->aloc_counts[item->idx]; + + item->counts.combine_accesses(src_point, src_counts, src_point); + + if (dst_point < CountingPoint::NumCountingPoints && + dst_point != src_point) { + + context->aloc_counts[item->idx].combine_accesses( + dst_point, src_counts, src_point); + + item->counts.combine_accesses(dst_point, src_counts, src_point); + + } + + src_counts.count(src_point); + + src_counts.set_all_accesses(src_point, false); + + if (depth == 0) { + + item->counts.count(src_point); + + item->counts.set_all_accesses(src_point, false); + } + } + + } + + std::array count_kernels_and_iterations( + Context* context, + Size_type par_loop_stack_depth = 0, + Size_type all_loop_stack_depth = 0) + { + if (!context->parent) { + par_it_per_rep_counter = 0; + all_it_per_rep_counter = 0; + max_par_loop_depth = 0; + max_all_loop_depth = 0; + kernel_per_rep_counter = 0; + par_sync_per_rep_counter = 0; + team_sync_per_rep_counter = 0; + } + + if (context->type == ContextType::par_loop) { + par_loop_stack_depth += 1; + all_loop_stack_depth += 1; + max_par_loop_depth = std::max(par_loop_stack_depth, max_par_loop_depth); + max_all_loop_depth = std::max(all_loop_stack_depth, max_all_loop_depth); + } else if (context->type == ContextType::seq_loop) { + all_loop_stack_depth += 1; + max_all_loop_depth = std::max(all_loop_stack_depth, max_all_loop_depth); + } + + Size_type max_child_par_iterations = 0; + Size_type all_child_par_iterations = 0; + Size_type max_child_iterations = 0; + Size_type all_loop_iterations = 0; + + for 
(auto& child_ptr : context->children) { + + auto [par_iter, all_iter] = + count_kernels_and_iterations(child_ptr.get(), + par_loop_stack_depth, + all_loop_stack_depth); + + max_child_par_iterations = std::max(par_iter, max_child_par_iterations); + all_child_par_iterations += par_iter; + max_child_iterations = std::max(child_ptr->hit_count, max_child_iterations); + all_loop_iterations += all_iter; + + } + + Size_type child_par_iterations = all_child_par_iterations; + Size_type child_all_iterations = all_loop_iterations; + if (context->type == ContextType::seq_loop) { + child_all_iterations = std::max(all_loop_iterations, max_child_iterations); + } + + if (context->type == ContextType::team_sync) { + team_sync_per_rep_counter += context->hit_count; + } else if (context->type == ContextType::par_sync) { + par_sync_per_rep_counter += context->hit_count; + } + + if (Size_type(context->point) <= Size_type(CountingPoint::loop)) { + + if (context->point == CountingPoint::loop && context->point_depth == 1) { + kernel_per_rep_counter += context->hit_count; + } + + child_par_iterations = max_child_par_iterations; + if (context->type == ContextType::par_loop) { + child_par_iterations = std::max(max_child_par_iterations, max_child_iterations); + child_all_iterations = std::max(all_loop_iterations, max_child_iterations); + } + + } + + if (context->point == CountingPoint::rep && context->point_depth == 1) { + par_it_per_rep_counter = all_child_par_iterations; + all_it_per_rep_counter = all_loop_iterations; + } + + return {{child_par_iterations, child_all_iterations}}; + + } + + void count_operations(Context* context) + { + for (auto& child_ptr : context->children) { + count_operations(child_ptr.get()); + } + + if (Size_type(context->point) > Size_type(CountingPoint::rep)) { + return; // don't count operations outside of the repetition + } + + for (Size_type ot = 0; ot < Size_type(OpType::NumOpTypes); ++ot) { + for (Size_type op = 0; op < Size_type(Operation::NumOperations); ++op) 
{ + operation_counters[ot][op] += context->operation_counters[ot][op]; + } + } + } + + void print_context(std::ostream& str, Context const& context, Size_type depth) const + { + context.print_header(str, depth+1); + + context.print_allocations(str, depth+1, allocations); + + context.print_counters(str, depth+1); + + context.print_text(str, depth+1, wrapper_formats); + + if (!context.children.empty()) { + + context.print(str, depth+1, "{"); + + for (auto const& child_ptr : context.children) { + print_context(str, *child_ptr.get(), depth+1); + } + + context.print(str, depth+1, "}"); + } + } + + void print(std::ostream& str) const + { + Context const& context = *counter_context.get(); + Size_type depth = 0; + context.print(str, depth, "{"); + print_context(str, context, depth); + context.print(str, depth, "}"); + } +}; + +inline void ScopedContext::pop_context() +{ + if (context) { + if (CountingData::current_context != context) { + throw std::runtime_error("ScopedContext popped in wrong context"); + } + if (CountingData::current_data != countingData) { + throw std::runtime_error("ScopedContext popped in wrong context"); + } + CountingData::current_data->pop_context(); + release(); + } +} + +} // closing brace for counting namespace + +} // closing brace for rajaperf namespace + +#endif // closing endif for header file include guard diff --git a/src/common/CountingMacros.hpp b/src/common/CountingMacros.hpp new file mode 100644 index 000000000..9a9ed2e6e --- /dev/null +++ b/src/common/CountingMacros.hpp @@ -0,0 +1,142 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingMacros_HPP +#define RAJAPerf_CountingMacros_HPP + +// Note that using this should change the signature of functions but +// can cause ODR violations if it does not + +// Use this wrapper type in variable declarations in a kernel +// ex. +// RAJAPERF_WRAPPER(Real_type) val = ptr[i]; +// Note do not use it if declaring variables with constant values +#ifdef RAJAPERF_WRAPPER +#undef RAJAPERF_WRAPPER +#endif +#define RAJAPERF_WRAPPER(type) counting::Wrapper +#define RAJAPERF_ARRAY1_WRAPPER(type_name) typename counting::Array1WrapperHelper::template type +#define RAJAPERF_ARRAY2_WRAPPER(type_name) typename counting::Array2WrapperHelper::template type +#define RAJAPERF_ARRAY3_WRAPPER(type_name) typename counting::Array3WrapperHelper::template type +#define RAJAPERF_ARRAY4_WRAPPER(type_name) typename counting::Array4WrapperHelper::template type + +#define RAJAPERF_ATOMIC_ADD_COUNTING(lhs, rhs) \ + (lhs).atomic_add(rhs); + + +#define RAJAPERF_COUNTERS_INITIALIZE() \ + auto _exterior_context = this->initializeCounters({ \ + RAJAPERF_STRINGIFY(RAJAPERF_WRAPPER({0})), \ + RAJAPERF_STRINGIFY(RAJAPERF_ARRAY1_WRAPPER({0})), \ + RAJAPERF_STRINGIFY(RAJAPERF_ARRAY2_WRAPPER({0})), \ + RAJAPERF_STRINGIFY(RAJAPERF_ARRAY3_WRAPPER({0}))}); + +#define RAJAPERF_COUNTERS_CODE_WRAPPER(...) \ + auto RAJAPERF_NAME_PER_LINE(_code_context_) = \ + counting::CountingData::current_data-> \ + push_outer_context(RAJAPERF_STRINGIFY(__VA_ARGS__)); \ + __VA_ARGS__; \ + RAJAPERF_NAME_PER_LINE(_code_context_).pop_context() + +#define RAJAPERF_COUNTERS_REP_SCOPE() \ + if constexpr (auto _rep_context = \ + counting::CountingData::current_data->push_rep_context( \ + "for (RepIndex_type irep = 0; irep < run_reps; irep = irep + 1)"); \ + false) {} else + +#define RAJAPERF_COUNTERS_IF(...) 
\ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_cond_context_) = \ + counting::CountingData::current_data->push_cond_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_ELSE_IF(...) \ + else if constexpr (auto RAJAPERF_NAME_PER_LINE(_cond_context_) = \ + counting::CountingData::current_data->push_cond_context( \ + "else " RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_ELSE() \ + else if constexpr (auto RAJAPERF_NAME_PER_LINE(_cond_context_) = \ + counting::CountingData::current_data->push_cond_context( \ + "else"); false) {} else + +// Note the main practical difference between this and SEQ_LOOP +// is that only SEQ_LOOP counts iterations +#define RAJAPERF_COUNTERS_OUTER_LOOP(...) \ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_loop_context_) = \ + counting::CountingData::current_data->push_outer_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_SEQ_LOOP(...) \ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_loop_context_) = \ + counting::CountingData::current_data->push_seq_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_PAR_LOOP(...) \ + if constexpr (auto RAJAPERF_NAME_PER_LINE(_loop_context_) = \ + counting::CountingData::current_data->push_par_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); false) {} else \ + __VA_ARGS__ + +#define RAJAPERF_COUNTERS_LOOP_BODY(...) \ + auto RAJAPERF_NAME_PER_LINE(_body_context_) = \ + counting::CountingData::current_data->push_body_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); \ + __VA_ARGS__; \ + RAJAPERF_NAME_PER_LINE(_body_context_).pop_context() + +#define RAJAPERF_COUNTERS_TEAM_CONTEXT() \ + auto RAJAPERF_NAME_PER_LINE(_team_context_) = \ + counting::CountingData::current_data->push_team_context(""); + +#define RAJAPERF_COUNTERS_PAR_ALG(...) 
\ + auto RAJAPERF_NAME_PER_LINE(_alg_context_) = \ + counting::CountingData::current_data->push_par_loop_context( \ + RAJAPERF_STRINGIFY(__VA_ARGS__)); \ + __VA_ARGS__; \ + RAJAPERF_NAME_PER_LINE(_alg_context_).pop_context() + + +#define RAJAPERF_COUNTERS_PAR_SYNC() \ + counting::CountingData::current_data->add_par_sync("synchronize();") + +#define RAJAPERF_COUNTERS_TEAM_SYNC() \ + counting::CountingData::current_data->add_team_sync("synchronize();") + +#define RAJAPERF_COUNTERS_FINALIZE() \ + this->finalizeCounters(_exterior_context) + + +// Wrap rajaperf data types after implementing everything +#define Index_type RAJAPERF_WRAPPER(Index_type) +#define Index_ptr RAJAPERF_WRAPPER(Index_ptr) +#define Index_ptr_ptr RAJAPERF_WRAPPER(Index_ptr_ptr) +#define Size_type RAJAPERF_WRAPPER(Size_type) +#define Int_type RAJAPERF_WRAPPER(Int_type) +#define Int_ptr RAJAPERF_WRAPPER(Int_ptr) +#define Int_ptr_ptr RAJAPERF_WRAPPER(Int_ptr_ptr) +#define Real_type RAJAPERF_WRAPPER(Real_type) +#define Real_array RAJAPERF_ARRAY1_WRAPPER(Real_array) +#define Real_array2 RAJAPERF_ARRAY2_WRAPPER(Real_array2) +#define Real_array3 RAJAPERF_ARRAY3_WRAPPER(Real_array3) +#define Real_array4 RAJAPERF_ARRAY4_WRAPPER(Real_array4) +#define Real_array_ref RAJAPERF_ARRAY1_WRAPPER(Real_array_ref) +#define Real_array2_ref RAJAPERF_ARRAY2_WRAPPER(Real_array2_ref) +#define Real_array3_ref RAJAPERF_ARRAY3_WRAPPER(Real_array3_ref) +#define Real_array4_ref RAJAPERF_ARRAY4_WRAPPER(Real_array4_ref) +#define Real_ptr RAJAPERF_WRAPPER(Real_ptr) +#define Real_ptr_ptr RAJAPERF_WRAPPER(Real_ptr_ptr) +#define Complex_type RAJAPERF_WRAPPER(Complex_type) +#define Complex_ptr RAJAPERF_WRAPPER(Complex_ptr) +#define Data_type RAJAPERF_WRAPPER(Data_type) +#define Data_ptr RAJAPERF_WRAPPER(Data_ptr) + +#endif // closing endif for header file include guard diff --git a/src/common/CountingWrapper.hpp b/src/common/CountingWrapper.hpp new file mode 100644 index 000000000..ce53e2a41 --- /dev/null +++ 
b/src/common/CountingWrapper.hpp @@ -0,0 +1,1017 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-25, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJAPerf_CountingWrapper_HPP +#define RAJAPerf_CountingWrapper_HPP + +#include "common/RAJAPerfSuite.hpp" +#include "common/RPTypes.hpp" +#include "common/CountingData.hpp" + +#include +#include +#include +#include +#include + +namespace rajaperf +{ + +namespace counting +{ + +// Wrapper types that count operations +template < typename T > +struct Wrapper; + + +template < typename T > +struct is_wrapper +{ + static inline constexpr bool value = false; +}; + +template < typename T > +struct is_wrapper> +{ + static inline constexpr bool value = true; +}; + +template < typename T > +inline constexpr bool is_wrapper_v = is_wrapper::value; + + + +template < typename T > +concept Wrapped = is_wrapper_v>; + +template < typename T > +concept NonWrapped = !Wrapped; + +template < typename T > +concept WrappedVal = Wrapped && T::is_val; + +template < typename T > +concept WrappedArray = Wrapped && T::is_array; + +template < typename T > +concept WrappedPtr = Wrapped && T::is_ptr; + +template < typename T > +concept WrappedNonPtr = Wrapped && !T::is_ptr; + +template < typename T > +concept WrappedRef = Wrapped && T::is_ref; + + +template < typename T > +struct PointedToType +{ + using type = std::remove_reference_t())>; +}; + +template < WrappedPtr T > +struct PointedToType +{ + using type = typename std::remove_cvref_t::pointed_to_type; +}; + +template < typename T > +using pointed_to_type_t = typename PointedToType::type; + + +template < typename T > +struct WrappedType +{ + using type = T; +}; + +template < Wrapped T > +struct WrappedType 
+{ + using direct_type = typename std::remove_cvref_t::wrapped_type; + using const_type = std::conditional_t, std::add_const_t, direct_type>; + using lref_type = std::conditional_t, std::add_lvalue_reference_t, const_type>; + using rref_type = std::conditional_t, std::add_rvalue_reference_t, lref_type>; + using type = rref_type; +}; + +template < typename T > +using wrapped_type_t = typename WrappedType::type; + + +template < typename T > +concept raw_pointer = std::is_pointer_v; + +template < typename T > +concept pointer = raw_pointer || WrappedPtr; + +template < typename T > +concept convertible_to_pointer = std::convertible_to, pointed_to_type_t*>; + +template < typename T, typename U > +concept convertible_to = std::convertible_to, wrapped_type_t>; + +template < typename T > +concept integral = std::integral || + (Wrapped && std::integral); + + +template < typename T > +constexpr decltype(auto) get_value(T&& val, Size_type num_ops=0) +{ + if constexpr (Wrapped) { + return std::forward(val).get_native(num_ops); + } else { + return std::forward(val); + } +} + +template +struct add_all_extents_of_to +{ + using type = V; +}; + +template +struct add_all_extents_of_to +{ + using type = typename add_all_extents_of_to::type[]; +}; + +template +struct add_all_extents_of_to +{ + using type = typename add_all_extents_of_to::type[N]; +}; + +template +using add_all_extents_of_to_t = typename add_all_extents_of_to::type; + +template < typename T > +struct Wrapper +{ + static inline constexpr bool is_ref = std::is_reference_v; + static inline constexpr bool is_val = !is_ref; + static inline constexpr bool is_array = std::is_array_v>; + static inline constexpr bool is_ptr = std::is_pointer_v; + + template < typename U > + friend struct Wrapper; + + using wrapped_type = T; + + using value_type = std::conditional_t, T>; + using const_value_type = std::conditional_t>, + const value_type>; + + using member_type = std::conditional_t>, value_type>; + + using pointed_to_type = + 
std::conditional_t, + std::conditional_t, + value_type>>; + using const_pointed_to_type = + std::conditional_t, + std::conditional_t, + const_value_type>>; + + template < size_t... Is > + static constexpr size_t get_array_size(std::index_sequence) + { + return (... * std::extent_v); + } + /// + static constexpr size_t get_array_size() + { + if constexpr (is_array) { + using dims = std::make_index_sequence>; + return get_array_size(dims{}); + } + return size_t(0); + } + + explicit Wrapper(AllocationMetadata* allocation, member_type value) + : m_value(value) + , m_allocation(allocation) + { + } + + // allow default construction of non-ref values + Wrapper() + requires(is_val && !is_array) + : m_value() + { + } + /// + Wrapper(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + : m_value() + { + registerArray(location); + m_allocation = CountingData::current_data->get_allocation( + static_cast(&m_value)); + } + + // allow implicit construction from non-wrapped values + template < convertible_to rhs_T > + Wrapper(rhs_T&& rhs) + requires(is_val && !is_array && !is_ptr) + : m_value(get_value(std::forward(rhs), 1)) + { + this->count(Operation::copy, 1); + } + /// + Wrapper(std::nullptr_t) + requires(is_val && !is_array && is_ptr) + : Wrapper() + { + } + /// + template < convertible_to rhs_T > + Wrapper(rhs_T&& rhs) + requires(is_val && !is_array && is_ptr) + : m_value(get_value(std::forward(rhs), 1)) + { + if constexpr (WrappedPtr) { + m_allocation = rhs.m_allocation; + } else { + m_allocation = CountingData::current_data->get_allocation( + static_cast(m_value)); + } + if (!m_allocation) { + std::ostringstream str; + str << "Couldn't find allocation "; + str << static_cast(get_value(std::forward(rhs))); + throw std::runtime_error(str.str()); + } + this->count(Operation::copy, 1); + } + /// + template < NonWrapped rhs_T > + Wrapper(rhs_T& rhs) + requires(is_ref) + : m_value(&rhs) + { + m_allocation = 
CountingData::current_data->get_allocation( + static_cast(m_value)); + if (!m_allocation) { + throw std::runtime_error("Couldn't find allocation"); + } + } + + // copy and move constructors + Wrapper(Wrapper const& rhs) + requires(!(is_val && is_array)) + : m_value(rhs.get_native()) + , m_allocation(rhs.m_allocation) + { + if constexpr (is_val) { + this->count(Operation::copy, 1); + } + } + /// + Wrapper(Wrapper && rhs) + requires(!(is_val && is_array)) + : m_value(std::move(rhs).get_native()) + , m_allocation(rhs.m_allocation) + { + if constexpr (is_val) { + this->count(Operation::copy, 1); + } + } + + // count assignments from non-wrapped values + template < NonWrapped rhs_T > + Wrapper& operator=(rhs_T&& rhs) + requires(!is_array) + { + this->set(std::forward(rhs)); + if constexpr (is_ptr) { + this->m_allocation = CountingData::current_data->get_allocation( + (void*)(m_value)); + if (!m_allocation) { + throw std::runtime_error("Couldn't find allocation"); + } + } + this->count(Operation::assign, 1); + return *this; + } + /// + Wrapper& operator=(std::nullptr_t) + requires(is_val && is_ptr) + { + return (*this) = Wrapper(); + } + + // count assignments from wrappers + Wrapper& operator=(Wrapper const& rhs) + requires(!is_array) + { + this->set(rhs.get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + /// + Wrapper& operator=(Wrapper&& rhs) + requires(!is_array) + { + this->set(std::move(rhs).get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + /// + template < Wrapped rhs_T > + Wrapper& operator=(rhs_T&& rhs) + requires(!is_array) + { + this->set(std::forward(rhs).get_native()); + if constexpr (is_ptr) { + this->m_allocation = rhs.m_allocation; + } + this->count(Operation::assign, 1); + return *this; + } + + ~Wrapper() + { + if constexpr (is_val && is_array) { + deregisterArray(); + 
} + } + + +#define RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(op_name, op, op_enum) \ + auto& op_name() \ + requires(!is_array) \ + { \ + this->set(this->get_native() op 1); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(operator++, +, Operation::preinc) + RAJAPERF_DEFINE_WRAPPER_PRE_OPERATOR(operator--, -, Operation::predec) + + +#define RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(op_name, op, op_enum) \ + auto op_name(int) \ + requires(!is_array) \ + { \ + auto value = this->get_value_wrapper(); \ + this->set(value.get_native() op 1); \ + this->count(op_enum, 1); \ + return value; \ + } + + RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(operator++, +, Operation::postinc) + RAJAPERF_DEFINE_WRAPPER_POST_OPERATOR(operator--, -, Operation::postdec) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(!is_array && !is_ptr) \ + { \ + this->set(this->get_native() op rhs.get_native()); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(!is_array && !is_ptr) \ + { \ + this->set(this->get_native() op rhs); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator+=, +, Operation::add) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator-=, -, Operation::sub) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator*=, *, Operation::mult) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator/=, /, Operation::div) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator%=, %, Operation::rem) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator&=, &, Operation::bit_and) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator|=, |, Operation::bit_or) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator^=, ^, Operation::bit_xor) + 
RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator<<=, <<, Operation::bit_lsh) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator>>=, >>, Operation::bit_rsh) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ptr) \ + { \ + this->m_value op##= rhs.get_native(); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ptr) \ + { \ + this->m_value op##= rhs; \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(operator+=, +, Operation::add) + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_POINTER_OPERATOR(operator-=, -, Operation::sub) + + +#define RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_ATOMIC_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ref) \ + { \ + this->set(this->get_native(0) op rhs.get_native(), 0); \ + this->count(op_enum, 1); \ + return *this; \ + } \ + template < NonWrapped rhs_T > \ + auto& op_name(rhs_T const& rhs) \ + requires(is_ref) \ + { \ + this->set(this->get_native(0) op rhs, 0); \ + this->count(op_enum, 1); \ + return *this; \ + } + + RAJAPERF_DEFINE_WRAPPER_COMPOUND_ASSIGN_ATOMIC_OPERATOR(atomic_add, +, Operation::atomic_add) + + + auto operator&() + requires(is_ref) + { + return Wrapper(m_allocation, m_value); + } + /// + auto operator&() const + requires(is_ref) + { + return Wrapper(m_allocation, m_value); + } + + + auto operator*() + requires(is_array || is_ptr) + { + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &*m_value); + } else { + return Wrapper(m_allocation, m_value); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &**m_value); + } else { + return Wrapper(nullptr, *m_value); + } + } + } + /// + auto operator*() const + 
requires(is_array || is_ptr) + { + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &*m_value); + } else { + return Wrapper(m_allocation, m_value); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &*(*m_value)); + } else { + return Wrapper(nullptr, (*m_value)); + } + } + } + + auto operator->() const + requires(is_ptr) + { + return m_value; + } + + + template < convertible_to I > + auto operator[](I&& i) + requires(is_array || is_ptr) + { + this->count(Operation::add, 1); + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value[get_value(std::forward(i), 1)]); + } else { + return Wrapper(m_allocation, m_value+get_value(std::forward(i), 1)); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &(*m_value)[get_value(std::forward(i), 1)]); + } else { + return Wrapper(nullptr, (*m_value)+get_value(std::forward(i), 1)); + } + } + } + /// + template < convertible_to I > + auto operator[](I&& i) const + requires(is_array || is_ptr) + { + this->count(Operation::add, 1); + if constexpr (is_val) { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value[get_value(std::forward(i), 1)]); + } else { + return Wrapper(m_allocation, m_value+get_value(std::forward(i), 1)); + } + } else { + if constexpr (is_array) { + return Wrapper(m_allocation, &(*m_value)[get_value(std::forward(i), 1)]); + } else { + return Wrapper(nullptr, (*m_value)+get_value(std::forward(i), 1)); + } + } + } + + operator auto() const + { + if constexpr (!is_array) { + this->count(Operation::copy, 1); + return this->get_native(); + } + } + /// + explicit operator Wrapper() + requires(is_val) + { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value); + } else { + return Wrapper(nullptr, &m_value); + } + } + /// + explicit operator Wrapper() const + requires(is_val) + { + if constexpr (is_array) { + return Wrapper(m_allocation, &m_value); + } else { + return 
Wrapper(nullptr, &m_value); + } + } + + void swap(Wrapper& rhs) + requires(!is_array) // consider implementing array version later + { + using std::swap; + value_type rhs_tmp(std::move(rhs).get_native()); + rhs.set(std::move(*this).get_native()); + this->set(std::move(rhs_tmp)); + swap(this->m_allocation, rhs.m_allocation); + } + + void swap(Wrapper&& rhs) && + requires(is_ref && ! is_array) + { + using std::swap; + value_type rhs_tmp(std::move(rhs).get_native()); + rhs.set(std::move(*this).get_native()); + this->set(std::move(rhs_tmp)); + swap(this->m_allocation, rhs.m_allocation); + } + + + // internal interface methods, should only be used in this file + template < typename rhs_T > + void set(rhs_T&& rhs, Size_type num_ops = 1) + requires(!is_array) + { + if constexpr (is_val) { + m_value = std::forward(rhs); + } else { + this->count(Operation::store, num_ops); + *m_value = std::forward(rhs); + } + } + + // gets a copy of the value represented by this object + auto get_value_wrapper(Size_type num_ops = 1) const + requires(!is_array) + { + if constexpr (is_val) { + return Wrapper(m_allocation, m_value); + } else { + this->count(Operation::load, num_ops); + return Wrapper(nullptr, *m_value); + } + } + + // gets a reference to the underlying value + auto&& get_native(Size_type num_ops = 1) & + { + if constexpr (is_val) { + return m_value; + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) && + { + if constexpr (is_val) { + return std::move(m_value); + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) const& + { + if constexpr (is_val) { + return m_value; + } else { + this->count(Operation::load, num_ops); + return *m_value; + } + } + /// + auto&& get_native(Size_type num_ops = 1) const&& + { + if constexpr (is_val) { + return std::move(m_value); + } else { + this->count(Operation::load, num_ops); + return *m_value; 
+ } + } + + + template < typename U = T > + void count(Operation op, Size_type num_ops) const + { + using V = std::decay_t; // decay arrays to pointers + + if (!CountingData::current_context) { + throw std::runtime_error("Can't count if there is no current context"); + } + + CountingData::current_context->operation_counters[ + Size_type(getOpType())][Size_type(op)] += num_ops; + + if constexpr (std::is_pointer_v && sizeof(std::remove_pointer_t) > 1) { + + if (op == Operation::add || op == Operation::sub) { + // Note that this fails to differentiate between + // adding/subtracting a pointer and an integer which entails a mult or bit_lsh + // and subtracting two pointers which entails a div or bit_rsh + auto is_pow_2 = [](size_t n) { return (n & (n-1)) == size_t(0); }; + Operation extra_op = is_pow_2(sizeof(std::remove_pointer_t)) + ? Operation::bit_lsh : Operation::mult ; + CountingData::current_context->operation_counters[ + Size_type(getOpType())][Size_type(extra_op)] += num_ops; + } + } + + if constexpr (std::is_reference_v) { + if (op == Operation::load || op == Operation::store || + op == Operation::atomic_add) { + + if (!m_allocation) { + throw std::runtime_error("Memory access to unknown allocation"); + } + + auto base_ptr = static_cast(m_allocation->ptr); + check_bounds(base_ptr); + + if (num_ops > Size_type(0)) { + CountingPoint point = CountingData::current_context->point; + MemoryAccess access = MemoryAccess::NumMemoryAccesses; + if (op == Operation::load) { + access = MemoryAccess::read; + } else if (op == Operation::store) { + access = MemoryAccess::write; + } else if (op == Operation::atomic_add) { + access = MemoryAccess::atomicModifyWrite; + } + Size_type offset = m_value - base_ptr; + CountingData::current_context->aloc_counts[m_allocation->idx]. 
+ touch(point, access, offset, num_ops); + } + } + } + } + + void check_bounds(member_type base_ptr) const + requires(is_ref) + { + if (!base_ptr) { + throw std::runtime_error("Memory access to deallocated pointer"); + } + if (m_value < base_ptr) { + throw std::runtime_error("Memory access is out of bounds low"); + } + if (m_value >= (base_ptr + m_allocation->size)) { + throw std::runtime_error("Memory access is out of bounds high"); + } + } + + void registerArray(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + { + CountingData::current_data->add_team_allocation( + get_type_name>(), + static_cast(&m_value), + get_array_size(), sizeof(std::remove_all_extents_t), + static_cast(&m_value), location); + } + + void deregisterArray(std::source_location location = std::source_location::current()) + requires(is_val && is_array) + { + CountingData::current_data->remove_allocation( + static_cast(&m_value), + static_cast(&m_value), location); + } + +private: + member_type m_value; + AllocationMetadata* m_allocation = nullptr; +}; + +template < typename U > +auto make_ValueWrapper(U&& value) +{ + return Wrapper>(value); +} + +// Operations with Wrapper types +// Some of these will be found before functions of the same name in the +// global namespace + +#define RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(op_name, op, op_enum) \ + template < typename T > \ + auto op_name(Wrapper const& obj) \ + { \ + using ::op; \ + auto value = make_ValueWrapper(op(obj.get_native())); \ + value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(exp, exp, Operation::exp) +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(sqrt, sqrt, Operation::sqrt) +RAJAPERF_DEFINE_WRAPPER_UNARY_FUNCTION(fabs, fabs, Operation::abs) + + +#define RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr T > \ + auto op_name(T const& obj) \ + { \ + auto value = make_ValueWrapper(op(obj.get_native())); \ + 
value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(operator+, +, Operation::uplus) +RAJAPERF_DEFINE_WRAPPER_UNARY_OPERATOR(operator-, -, Operation::uminus) + +template < WrappedPtr T > +auto operator+(T const& obj) +{ + Wrapper> value( + obj.m_allocation, +(obj.get_native())); + value.count(Operation::uplus, 1); + return value; +} + + +#define RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(op_name, op, op_enum) \ + template < NonWrapped lhs_T, Wrapped rhs_T > \ + auto op_name(lhs_T & lhs, \ + rhs_T const& rhs) \ + { \ + rhs.template count(op_enum, 1); \ + return lhs op rhs.get_native(); \ + } + +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator+=, +=, Operation::add) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator-=, -=, Operation::sub) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator*=, *=, Operation::mult) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator/=, /=, Operation::div) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator%=, %=, Operation::rem) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator&=, &=, Operation::bit_and) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator|=, |=, Operation::bit_or) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator^=, ^=, Operation::bit_xor) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator<<=, <<=, Operation::bit_lsh) +RAJAPERF_DEFINE_NON_WRAPPER_COMPOUND_ASSIGN_OPERATOR(operator>>=, >>=, Operation::bit_rsh) + + +#define RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(op_name, op, op_enum) \ + template < WrappedNonPtr lhs_T, WrappedNonPtr rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = make_ValueWrapper(lhs.get_native() op rhs.get_native()); \ + value.count(op_enum, 1); \ + return value; \ + } \ + template < WrappedNonPtr lhs_T, NonWrapped rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = 
make_ValueWrapper(lhs.get_native() op rhs); \ + value.count(op_enum, 1); \ + return value; \ + } \ + template < NonWrapped lhs_T, WrappedNonPtr rhs_T > \ + auto op_name(lhs_T const& lhs, \ + rhs_T const& rhs) \ + { \ + auto value = make_ValueWrapper(lhs op rhs.get_native()); \ + value.count(op_enum, 1); \ + return value; \ + } + +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator+, +, Operation::add) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator-, -, Operation::sub) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator*, *, Operation::mult) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator/, /, Operation::div) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator%, %, Operation::rem) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator&, &, Operation::bit_and) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator|, |, Operation::bit_or) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator^, ^, Operation::bit_xor) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator<<, <<, Operation::bit_lsh) +RAJAPERF_DEFINE_WRAPPER_BINARY_OPERATOR(operator>>, >>, Operation::bit_rsh) + + +template < typename lhs_T, typename rhs_T > +auto operator+(Wrapper const& lhs, + Wrapper const& rhs) +requires((Wrapper::is_ptr || Wrapper::is_ptr) && + !(Wrapper::is_ptr && Wrapper::is_ptr)) +{ + if constexpr (Wrapper::is_ptr) { + auto value = lhs.get_value_wrapper(); + value += rhs; + return value; + } else { + auto value = rhs.get_value_wrapper(); + value += lhs; + return value; + } +} +template < WrappedPtr lhs_T, NonWrapped rhs_T > +auto operator+(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value += rhs; + return value; +} +template < NonWrapped lhs_T, WrappedPtr rhs_T > +auto operator+(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = rhs.get_value_wrapper(); + value += lhs; + return value; +} + +template < WrappedPtr lhs_T, WrappedNonPtr rhs_T > +auto operator-(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value -= rhs; + 
return value; +} +template < WrappedPtr lhs_T, NonWrapped rhs_T > +auto operator-(lhs_T const& lhs, + rhs_T const& rhs) +{ + auto value = lhs.get_value_wrapper(); + value -= rhs; + return value; +} + + +#define RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(op_name, op, op_enum) \ + template < typename lhs_T, typename rhs_T > \ + auto op_name(Wrapper const& lhs, \ + Wrapper const& rhs) \ + { \ + lhs.template count>(op_enum, 1); \ + return lhs.get_native() op rhs.get_native(); \ + } \ + template < typename lhs_T, NonWrapped rhs_T > \ + auto op_name(Wrapper const& lhs, \ + rhs_T const& rhs) \ + { \ + lhs.template count>(op_enum, 1); \ + return lhs.get_native() op rhs; \ + } \ + template < NonWrapped lhs_T, typename rhs_T > \ + auto op_name(lhs_T const& lhs, \ + Wrapper const& rhs) \ + { \ + rhs.template count>(op_enum, 1); \ + return lhs op rhs.get_native(); \ + } + +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator==, ==, Operation::eq) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator!=, !=, Operation::ne) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator< , < , Operation::lt) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator<=, <=, Operation::le) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator> , > , Operation::gt) +RAJAPERF_DEFINE_WRAPPER_COMPARISON_OPERATOR(operator>=, >=, Operation::ge) + +template < Wrapped T > +void swap(T& lhs, T& rhs) +{ + lhs.swap(rhs); +} + +template < WrappedRef T > +void swap(T&& lhs, T&& rhs) +{ + std::move(lhs).swap(std::move(rhs)); +} + +// helper for getting right type +template < template typename T > +struct Array1WrapperHelper +{ + template < size_t N > + using type = Wrapper>; +}; +/// +template < template typename T > +struct Array2WrapperHelper +{ + template < size_t N0, size_t N1 > + using type = Wrapper>; +}; +/// +template < template typename T > +struct Array3WrapperHelper +{ + template < size_t N0, size_t N1, size_t N2 > + using type = Wrapper>; +}; +/// +template < template typename T > +struct 
Array4WrapperHelper +{ + template < size_t N0, size_t N1, size_t N2, size_t N3 > + using type = Wrapper>; +}; + +} // closing brace for counting namespace + +} // closing brace for rajaperf namespace + +namespace std +{ + +template < typename T > +struct iterator_traits<::rajaperf::counting::Wrapper> +{ + using difference_type = ::rajaperf::counting::Wrapper; + using value_type = ::rajaperf::counting::Wrapper>; + using pointer = ::rajaperf::counting::Wrapper; + using reference = ::rajaperf::counting::Wrapper; + using iterator_category = std::random_access_iterator_tag; +}; + +} // closing brace for std namespace + + +// Use this wrapper type in variable declarations in a kernel +// ex. +// RAJAPERF_WRAPPER(my_struct*) val; +// Note wrapping is done for most types in CountingMacros.hpp, but some types +// like structs specific to a kernel need to be wrapped manually +// Note do not use it if declaring variables with constant values +#define RAJAPERF_WRAPPER(type) type + +#endif // closing endif for header file include guard diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index bfa34efa9..de5f86f18 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -257,59 +257,26 @@ inline void moveData(DataSpace new_dataSpace, DataSpace old_dataSpace, } -template +template struct AutoDataMover { - AutoDataMover(DataSpace new_dataSpace, DataSpace old_dataSpace, - T*& ptr, Size_type len, Size_type align) - : m_ptr(&ptr) - , m_new_dataSpace(new_dataSpace) - , m_old_dataSpace(old_dataSpace) - , m_len(len) - , m_align(align) + AutoDataMover(Func func) + : m_func(func) { } AutoDataMover(AutoDataMover const&) = delete; AutoDataMover& operator=(AutoDataMover const&) = delete; - AutoDataMover(AutoDataMover&& rhs) - : m_ptr(std::exchange(rhs.m_ptr, nullptr)) - , m_new_dataSpace(rhs.m_new_dataSpace) - , m_old_dataSpace(rhs.m_old_dataSpace) - , m_len(rhs.m_len) - , m_align(rhs.m_align) - { } - AutoDataMover& operator=(AutoDataMover&& rhs) - { - finalize(); 
- m_ptr = std::exchange(rhs.m_ptr, nullptr); - m_new_dataSpace = rhs.m_new_dataSpace; - m_old_dataSpace = rhs.m_old_dataSpace; - m_len = rhs.m_len; - m_align = rhs.m_align; - return *this; - } - - void finalize() - { - if (m_ptr) { - moveData(m_new_dataSpace, m_old_dataSpace, - *m_ptr, m_len, m_align); - m_ptr = nullptr; - } - } + AutoDataMover(AutoDataMover&& rhs) = delete; + AutoDataMover& operator=(AutoDataMover&& rhs) = delete; ~AutoDataMover() { - finalize(); + m_func(); } private: - T** m_ptr; - DataSpace m_new_dataSpace; - DataSpace m_old_dataSpace; - Size_type m_len; - Size_type m_align; + Func m_func; }; /*! diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 7805376bb..551eea808 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -39,7 +39,10 @@ #include #include #include + +#include #include +#include #include #if defined(_WIN32) @@ -263,6 +266,7 @@ void Executor::setupSuite() const std::set& run_kern = run_params.getKernelIDsToRun(); for (auto kid = run_kern.begin(); kid != run_kern.end(); ++kid) { kernels.push_back( getKernelObject(*kid, run_params) ); + kernels.back()->setCountedAttributes(); } const std::set& run_var = run_params.getVariantIDsToRun(); @@ -518,131 +522,334 @@ void Executor::writeKernelInfoSummary(ostream& str, bool to_file) const #endif } + const bool skip_if_nonpositive = !to_file; + // // Set up column headers and column widths for kernel summary output. 
// + string attr_category_head(""); + string kern_head("Kernels"); - size_t kercol_width = kern_head.size(); - - Index_type psize_width = 0; - Index_type reps_width = 0; - Index_type itsrep_width = 0; - Index_type bytesrep_width = 0; - Index_type flopsrep_width = 0; - Index_type bytesReadrep_width = 0; - Index_type bytesWrittenrep_width = 0; - Index_type bytesAtomicModifyWrittenrep_width = 0; - Index_type dash_width = 0; + Index_type kercol_width = static_cast(kern_head.size()); for (size_t ik = 0; ik < kernels.size(); ++ik) { - kercol_width = max(kercol_width, kernels[ik]->getName().size()); - psize_width = max(psize_width, kernels[ik]->getActualProblemSize()); - reps_width = max(reps_width, kernels[ik]->getRunReps()); - itsrep_width = max(itsrep_width, kernels[ik]->getItsPerRep()); - bytesrep_width = max(bytesrep_width, kernels[ik]->getBytesPerRep()); - flopsrep_width = max(flopsrep_width, kernels[ik]->getFLOPsPerRep()); - bytesReadrep_width = max(bytesReadrep_width, kernels[ik]->getBytesReadPerRep()); - bytesWrittenrep_width = max(bytesWrittenrep_width, kernels[ik]->getBytesWrittenPerRep()); - bytesAtomicModifyWrittenrep_width = max(bytesAtomicModifyWrittenrep_width, kernels[ik]->getBytesAtomicModifyWrittenPerRep()); + kercol_width = max(kercol_width, static_cast(kernels[ik]->getName().size())); } + kercol_width += 2; +// +// Set up separators and width parameters. 
+// const string sepchr(" , "); - kercol_width += 2; - dash_width += kercol_width; - - double psize = log10( static_cast(psize_width) ); - string psize_head("Problem size"); - psize_width = max( static_cast(psize_head.size()), - static_cast(psize) ) + 3; - dash_width += psize_width + static_cast(sepchr.size()); - - double rsize = log10( static_cast(reps_width) ); - string rsize_head("Reps"); - reps_width = max( static_cast(rsize_head.size()), - static_cast(rsize) ) + 3; - dash_width += reps_width + static_cast(sepchr.size()); - - double irsize = log10( static_cast(itsrep_width) ); - string itsrep_head("Iterations/rep"); - itsrep_width = max( static_cast(itsrep_head.size()), - static_cast(irsize) ) + 3; - dash_width += itsrep_width + static_cast(sepchr.size()); - - string kernsrep_head("Kernels/rep"); - Index_type kernsrep_width = - max( static_cast(kernsrep_head.size()), - static_cast(4) ); - dash_width += kernsrep_width + static_cast(sepchr.size()); - - double brsize = log10( static_cast(bytesrep_width) ); - string bytesrep_head("Bytes/rep"); - bytesrep_width = max( static_cast(bytesrep_head.size()), - static_cast(brsize) ) + 3; - dash_width += bytesrep_width + static_cast(sepchr.size()); - - double frsize = log10( static_cast(flopsrep_width) ); - string flopsrep_head("FLOPS/rep"); - flopsrep_width = max( static_cast(flopsrep_head.size()), - static_cast(frsize) ) + 3; - dash_width += flopsrep_width + static_cast(sepchr.size()); - - double brrsize = log10( static_cast(bytesReadrep_width) ); - string bytesReadrep_head("BytesRead/rep"); - bytesReadrep_width = max( static_cast(bytesReadrep_head.size()), - static_cast(brrsize) ) + 3; - dash_width += bytesReadrep_width + static_cast(sepchr.size()); - - double bwrsize = log10( static_cast(bytesWrittenrep_width) ); - string bytesWrittenrep_head("BytesWritten/rep"); - bytesWrittenrep_width = max( static_cast(bytesWrittenrep_head.size()), - static_cast(bwrsize) ) + 3; - dash_width += bytesWrittenrep_width + 
static_cast(sepchr.size()); - - double bamrrsize = log10( static_cast(bytesAtomicModifyWrittenrep_width) ); - string bytesAtomicModifyWrittenrep_head("BytesAtomicModifyWritten/rep"); - bytesAtomicModifyWrittenrep_width = max( static_cast(bytesAtomicModifyWrittenrep_head.size()), - static_cast(bamrrsize) ) + 3; - dash_width += bytesAtomicModifyWrittenrep_width + static_cast(sepchr.size()); - - str <::max() + : max(screen_width-kercol_width, screen_width/2); + +// +// Set up storage for attributes which will become the columns. +// + struct Attribute + { + std::string category_name; + std::string name; + Index_type width; + std::function getter; + }; + + std::vector attrs; + +// +// function used to print the table, includes the kernel column and attr columns. +// Clears attr columns after printing to make using more than once easier. +// + auto print_attr_table = [&]() { + + // print row of categories + str <getName(); + for (Attribute const& attr : attrs) { + str << sepchr <::min(); + for (size_t ik = 0; ik < kernels.size(); ++ik) { + max_value = max(max_value, getter(kernels[ik])); + } + if (skip_if_nonpositive && max_value <= static_cast(0)) return; + max_value = max(max_value, static_cast(1)); + double value_width = log10(static_cast(max_value)) + 1.0; + Index_type width = max( static_cast(category_name.size()), + static_cast(name.size()) ); + width = max( width, static_cast(value_width) ); + width += 2; + Index_type width_with_sep = static_cast(sepchr.size()) + width; + + if (current_width + width_with_sep > max_width) { + print_attr_table(); + } + + current_width += width_with_sep; + attrs.emplace_back(Attribute{category_name, name, width, getter}); + }; + +// +// user settable attributes +// + add_attr("Input", "Problem size", [](KernelBase const* kernel){ + return static_cast(kernel->getActualProblemSize()); + }); + + add_attr("Input", "Reps", [](KernelBase const* kernel){ + return static_cast(kernel->getRunReps()); + }); + + if ( !to_file && current_width > 
0 ) { + print_attr_table(); } - for (size_t ik = 0; ik < kernels.size(); ++ik) { - KernelBase* kern = kernels[ik]; - str <getName() - << sepchr <getActualProblemSize() - << sepchr <getRunReps() - << sepchr <getItsPerRep() - << sepchr <getKernelsPerRep() - << sepchr <getBytesPerRep() - << sepchr <getFLOPsPerRep() - << sepchr <getBytesReadPerRep() - << sepchr <getBytesWrittenPerRep() - << sepchr <getBytesAtomicModifyWrittenPerRep() - << endl; +// +// manually counted attributes +// + add_attr("Estimate", "Iterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getItsPerRep()); + }); + + add_attr("Estimate", "Kernels/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getKernelsPerRep()); + }); + + add_attr("Estimate", "Bytes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesPerRep()); + }); + + add_attr("Estimate", "FLOPS/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getFLOPsPerRep()); + }); + + add_attr("Estimate", "BytesRead/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesReadPerRep()); + }); + + add_attr("Estimate", "BytesWritten/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesWrittenPerRep()); + }); + + add_attr("Estimate", "BytesAtomicModifyWritten/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getBytesAtomicModifyWrittenPerRep()); + }); + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted high level attributes +// + add_attr("Counted", "Iterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedItsPerRep()); + }); + + add_attr("Counted", "ParallelIterations/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedParItsPerRep()); + }); + + add_attr("Counted", "MaxLoopNestDepth", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedMaxLoopNestDepth()); + }); + + add_attr("Counted", "MaxParallelLoopNestDepth", 
[](KernelBase const* kernel){ + return static_cast(kernel->getCountedMaxParLoopNestDepth()); + }); + + add_attr("Counted", "Kernels/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedKernelsPerRep()); + }); + + add_attr("Counted", "Synchronizes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedSyncsPerRep()); + }); + + add_attr("Counted", "TeamSynchronizes/rep", [](KernelBase const* kernel){ + return static_cast(kernel->getCountedTeamSyncsPerRep()); + }); + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted memory attributes, at the per loop usage granularity +// + for (Size_type g = 0; g < Size_type(counting::AllocationGroup::NumAllocationGroups); ++g) { + auto gg = counting::AllocationGroup(g); + + std::string num_name = std::format("{}NumAllocations", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", num_name, [gg](KernelBase const* kernel){ + return static_cast(kernel->getCountedNumAllocations(gg)); + }); + + std::string bytes_name = std::format("{}AllocatedBytes", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", bytes_name, [gg](KernelBase const* kernel){ + return static_cast(kernel->getCountedAllocatedBytes(gg)); + }); + + std::string bytes_total_name = std::format("{}BytesTotal/rep", + counting::getAllocationGroupName(gg)); + + add_attr("Counted", bytes_total_name, [gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedTotalBytes(gg)); + }); + + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + auto aa = counting::MemoryAccess(a); + + std::string bytes_total_accessed_name = std::format("{}BytesTotal{}/rep", + counting::getAllocationGroupName(gg), + counting::getMemoryAccessNamePastTenseTitle(aa)); + add_attr("Counted", bytes_total_accessed_name, [gg, aa](KernelBase const* kernel){ + return static_cast( + kernel->getCountedTotalBytesPerAccess(gg, aa)); + }); + + } + + for 
(Size_type p = 0; p < Size_type(counting::CountingPoint::NumCountingPoints); ++p) { + auto pp = counting::CountingPoint(p); + + std::string bytes_touched_name = std::format("{}BytesTouched/{}", + counting::getAllocationGroupName(gg), + counting::getCountingPointName(pp)); + + add_attr("Counted", bytes_touched_name, [pp, gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytesTouched(pp, gg)); + }); + + std::string bytes_name = std::format("{}Bytes/{}", + counting::getAllocationGroupName(gg), + counting::getCountingPointName(pp)); + + add_attr("Counted", bytes_name, [pp, gg](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytes(pp, gg)); + }); + + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + auto aa = counting::MemoryAccess(a); + + std::string bytes_accessed_name = std::format("{}Bytes{}/{}", + counting::getAllocationGroupName(gg), + counting::getMemoryAccessNamePastTenseTitle(aa), + counting::getCountingPointName(pp)); + add_attr("Counted", bytes_accessed_name, [pp, gg, aa](KernelBase const* kernel){ + return static_cast( + kernel->getCountedBytesPerAccess(pp, gg, aa)); + }); + + } + + } + } + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + +// +// automatically counted operations attributes +// + for (Size_type ot = 0; ot < Size_type(counting::OpType::NumOpTypes); ++ot) { + + std::string opTypeName = counting::getOpTypeName(counting::OpType(ot)); + + add_attr("Counted", opTypeName+"_ops/rep", [ot](KernelBase const* kernel){ + return static_cast(kernel->getCountedArithmeticOpsPerRep(counting::OpType(ot))); + }, skip_if_nonpositive); + + for (Size_type op = 0; op < Size_type(counting::Operation::NumOperations); ++op) { + + std::string opName = counting::getOperationName(counting::Operation(op)); + + add_attr("Counted", opTypeName+"_"+opName+"/rep", [ot, op](KernelBase const* kernel){ + return static_cast(kernel->getCountedOpsPerRep(counting::OpType(ot), 
counting::Operation(op))); + }, skip_if_nonpositive); + + } + + if ( !to_file && current_width > 0 ) { + print_attr_table(); + } + + } + + if (current_width > 0) { + print_attr_table(); } str.flush(); } +void Executor::writeKernelCounterSummary(ostream& str) const +{ + for (size_t ik = 0; ik < kernels.size(); ++ik) { + str << "\n/******** Kernel " << kernels[ik]->getName() << " ********/\n"; + kernels[ik]->printCounters(str); + } + str.flush(); +} + + + void Executor::runSuite() { RunParams::InputOpt in_state = run_params.getInputState(); @@ -904,6 +1111,11 @@ void Executor::outputRunData() writeKernelInfoSummary(*file, to_file); } + file = openOutputFile(out_fprefix + "-counters.txt"); + if ( *file ) { + writeKernelCounterSummary(*file); + } + #if defined(RAJA_PERFSUITE_USE_CALIPER) KernelBase::setCaliperMgrFlush(); #endif diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp index 5d4a41d16..33cd4e35d 100644 --- a/src/common/Executor.hpp +++ b/src/common/Executor.hpp @@ -76,6 +76,7 @@ class Executor std::unique_ptr openOutputFile(const std::string& filename) const; void writeKernelInfoSummary(std::ostream& str, bool to_file) const; + void writeKernelCounterSummary(std::ostream& str) const; void writeCSVReport(std::ostream& file, CSVRepMode mode, RunParams::CombinerOpt combiner, size_t prec); diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 9dc824f9c..b819cad41 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -14,6 +14,8 @@ #include "common/DataUtils.hpp" #include "common/RunParams.hpp" #include "common/GPUUtils.hpp" +#include "common/CountingData.hpp" +#include "common/CountingWrapper.hpp" #include "RAJA/util/Timer.hpp" #if defined(RAJA_PERFSUITE_ENABLE_MPI) @@ -37,6 +39,10 @@ #include #include #include +#include +#include +#include +#include #if defined(RAJA_PERFSUITE_USE_CALIPER) @@ -156,18 +162,88 @@ class KernelBase Index_type getDefaultProblemSize() const { return default_prob_size; } Index_type 
getActualProblemSize() const { return actual_prob_size; } Index_type getDefaultReps() const { return default_reps; } + Index_type getTargetProblemSize() const; + Index_type getRunReps() const; + Index_type getItsPerRep() const { return its_per_rep; }; Index_type getKernelsPerRep() const { return kernels_per_rep; }; - Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } // count atomic_modify_write operations as a read and a write to match previous counting + + // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getBytesPerRep() const { return bytes_read_per_rep + bytes_written_per_rep + 2*bytes_atomic_modify_written_per_rep; } Index_type getBytesReadPerRep() const { return bytes_read_per_rep; } Index_type getBytesWrittenPerRep() const { return bytes_written_per_rep; } Index_type getBytesAtomicModifyWrittenPerRep() const { return bytes_atomic_modify_written_per_rep; } + Index_type getFLOPsPerRep() const { return FLOPs_per_rep; } + + + Index_type getCountedItsPerRep() const { return countingData ? (countingData->all_it_per_rep_counter) : -1; } + Index_type getCountedParItsPerRep() const { return countingData ? (countingData->par_it_per_rep_counter) : -1; } + Index_type getCountedMaxLoopNestDepth() const { return countingData ? (countingData->max_all_loop_depth) : -1; } + Index_type getCountedMaxParLoopNestDepth() const { return countingData ? (countingData->max_par_loop_depth) : -1; } + Index_type getCountedKernelsPerRep() const { return countingData ? countingData->kernel_per_rep_counter : -1; } + Index_type getCountedSyncsPerRep() const { return countingData ? countingData->par_sync_per_rep_counter : -1; } + Index_type getCountedTeamSyncsPerRep() const { return countingData ? countingData->team_sync_per_rep_counter : -1; } + Index_type getCountedNumAllocations(counting::AllocationGroup g) const { return countingData ? 
countingData->memory_allocations[Size_type(g)] : -1; } + Index_type getCountedAllocatedBytes(counting::AllocationGroup g) const { return countingData ? countingData->memory_bytes[Size_type(g)] : -1; } + + Index_type getCountedTotalBytes(counting::AllocationGroup g) const + { + Index_type count = -1; + if (countingData) { + count = 0; + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + count += countingData->memory_total_bytes[Size_type(g)].accessed[a]; + if (counting::MemoryAccess(a) == counting::MemoryAccess::atomicModifyWrite) { + count += countingData->memory_total_bytes[Size_type(g)].accessed[a]; // count twice, as both a read and a write + } + } + }; + return count; + } + Index_type getCountedTotalBytesPerAccess(counting::AllocationGroup g, counting::MemoryAccess ma) const + { return countingData ? countingData->memory_total_bytes[Size_type(g)].accessed[Size_type(ma)] : -1; } + + // count atomic_modify_write operations as a read and a write to match previous counting + Index_type getCountedBytesTouched(counting::CountingPoint p, counting::AllocationGroup g) const + { return countingData ? countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].touched : -1; } + Index_type getCountedBytes(counting::CountingPoint p, counting::AllocationGroup g) const + { + Index_type count = -1; + if (countingData) { + count = 0; + for (Size_type a = 0; a < Size_type(counting::MemoryAccess::NumMemoryAccesses); ++a) { + count += countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[a]; + if (counting::MemoryAccess(a) == counting::MemoryAccess::atomicModifyWrite) { + count += countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[a]; // count twice, as both a read and a write + } + } + }; + return count; + } + Index_type getCountedBytesPerAccess(counting::CountingPoint p, counting::AllocationGroup g, counting::MemoryAccess ma) const + { return countingData ? 
countingData->memory_totals_bytes[Size_type(p)][Size_type(g)].accessed[Size_type(ma)] : -1; } + + Index_type getCountedOpsPerRep(counting::OpType ot, counting::Operation op) const { return countingData ? countingData->operation_counters[Size_type(ot)][Size_type(op)] : -1; } + + Index_type getCountedArithmeticOpsPerRep(counting::OpType ot) const + { + Index_type count = -1; + if (countingData) { + // count a subset of operations including things like add, sub, mult, div, abs, sqrt, but not assign, eq, ne, lt, le, gt, or ge + count = 0; + for (Size_type op = Size_type(counting::Operation::FLOP_begin); + op < Size_type(counting::Operation::FLOP_end); ++op) { + count += countingData->operation_counters[Size_type(ot)][op]; + } + } + return count; + } + + double getBlockSize() const { return kernel_block_size; } - Complexity getComplexity() const { return complexity; }; - Index_type getTargetProblemSize() const; - Index_type getRunReps() const; + Complexity getComplexity() const { return complexity; }; bool usesFeature(FeatureID fid) const { return uses_feature[fid]; }; @@ -320,151 +396,244 @@ class KernelBase DataSpace getReductionDataSpace(VariantID vid) const; DataSpace getMPIDataSpace(VariantID vid) const; - template - void allocData(DataSpace dataSpace, T& ptr, Size_type len) + + + virtual void setCountedAttributes() {}; // + + + counting::ScopedContext initializeCounters( + std::initializer_list wrapper_formats, + std::source_location location = std::source_location::current()) { - rajaperf::allocData(dataSpace, - ptr, len, getDataAlignment()); + countingData = std::make_unique(); + countingData->set_formats(wrapper_formats); + enable_data_registration = true; + return countingData->create_context("", location); } - template - void allocAndInitData(DataSpace dataSpace, T*& ptr, Size_type len) + void finalizeCounters(counting::ScopedContext& context, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitData(dataSpace, - ptr, 
len, getDataAlignment()); + context.release(); + enable_data_registration = false; + countingData->finalize_context(location); } - template - void allocAndInitDataConst(DataSpace dataSpace, T*& ptr, Size_type len, V val) + void printCounters(std::ostream& str) const { - rajaperf::allocAndInitDataConst(dataSpace, - ptr, len, getDataAlignment(), val); + if (countingData) { + countingData->print(str); + } } - template - void allocAndInitDataRandSign(DataSpace dataSpace, T*& ptr, Size_type len) + + void registerData(counting::pointer auto& ptr, + counting::integral auto const& len, + counting::raw_pointer auto ptr_ptr, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandSign(dataSpace, - ptr, len, getDataAlignment()); + using pointed_to_type = counting::pointed_to_type_t; + if (!enable_data_registration) return; + countingData->add_allocation( + counting::get_type_name(), + static_cast(counting::get_value(ptr)), + counting::get_value(len), sizeof(pointed_to_type), + static_cast(ptr_ptr), location); } - template - void allocAndInitDataRandValue(DataSpace dataSpace, T*& ptr, Size_type len) + void deRegisterData(counting::pointer auto& ptr, + counting::raw_pointer auto ptr_ptr, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandValue(dataSpace, - ptr, len, getDataAlignment()); + if (!enable_data_registration) return; + countingData->remove_allocation( + static_cast(counting::get_value(ptr)), + static_cast(ptr_ptr), location); } - template - rajaperf::AutoDataMover scopedMoveData(DataSpace dataSpace, T*& ptr, Size_type len) + void allocData(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocData(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, 
len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitData(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitData(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataConst(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, auto const& val, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataConst(dataSpace, ptr, len, getDataAlignment(), counting::get_value(val)); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataRandSign(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataRandSign(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + void allocAndInitDataRandValue(DataSpace dataSpace, counting::pointer auto& ptr_in, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + auto ptr = counting::get_value(ptr_in); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataRandValue(dataSpace, ptr, len, getDataAlignment()); + registerData(ptr, len, &counting::get_value(ptr_in), location); + ptr_in = ptr; + } + + auto scopedMoveDataForInit(DataSpace dataSpace, DataSpace hds, counting::raw_pointer auto& 
ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) + { + Size_type len = counting::get_value(len_in); + Size_type align = getDataAlignment(); + KernelBase& self = *this; + return rajaperf::AutoDataMover([=, &self, &ptr](){ + rajaperf::moveData(dataSpace, hds, ptr, len, align); + self.registerData(ptr, len, &ptr, location); + }); + } + + auto allocDataForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) { DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace); - rajaperf::moveData(hds, dataSpace, ptr, len, getDataAlignment()); - return {dataSpace, hds, ptr, len, getDataAlignment()}; + Size_type len = counting::get_value(len_in); + rajaperf::allocData(hds, ptr, len, getDataAlignment()); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void copyData(DataSpace dst_dataSpace, T* dst_ptr, - DataSpace src_dataSpace, const T* src_ptr, - Size_type len) + auto allocAndInitDataForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, + std::source_location location = std::source_location::current()) { - rajaperf::copyData(dst_dataSpace, dst_ptr, src_dataSpace, src_ptr, len); + DataSpace hds = rajaperf::hostCopyDataSpace(dataSpace); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitData(hds, ptr, len, getDataAlignment()); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void deallocData(DataSpace dataSpace, T& ptr) + auto allocAndInitDataConstForInit(DataSpace dataSpace, counting::raw_pointer auto& ptr, + counting::integral auto const& len_in, auto const& val, + std::source_location location = std::source_location::current()) { - rajaperf::deallocData(dataSpace, ptr); + DataSpace hds = 
rajaperf::hostCopyDataSpace(dataSpace); + Size_type len = counting::get_value(len_in); + rajaperf::allocAndInitDataConst(hds, ptr, len, getDataAlignment(), counting::get_value(val)); + // don't register temporary data + return scopedMoveDataForInit(dataSpace, hds, ptr, len, location); } - template - void allocData(T*& ptr, Size_type len, VariantID vid) + void copyData(DataSpace dst_dataSpace, counting::convertible_to_pointer auto const& dst, + DataSpace src_dataSpace, counting::convertible_to_pointer auto const& src, + counting::integral auto const& len) { - rajaperf::allocData(getDataSpace(vid), - ptr, len, getDataAlignment()); + rajaperf::copyData(dst_dataSpace, counting::get_value(dst), + src_dataSpace, counting::get_value(src), len); } - template - void allocAndCopyHostData(T*& dst_ptr, - const T* src_ptr, - Size_type len, - VariantID vid) + void deallocData(DataSpace dataSpace, counting::pointer auto& ptr_in, + std::source_location location = std::source_location::current()) { - rajaperf::allocData(getDataSpace(vid), - dst_ptr, len, getDataAlignment()); + auto ptr = counting::get_value(ptr_in); + deRegisterData(ptr, &counting::get_value(ptr_in), location); + rajaperf::deallocData(dataSpace, ptr); + ptr_in = nullptr; + } - rajaperf::copyData(getDataSpace(vid), - dst_ptr, DataSpace::Host, src_ptr, len); + + void allocData(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) + { + allocData(getDataSpace(vid), ptr, len, location); } - template - void allocAndInitData(T*& ptr, Size_type len, VariantID vid) + void allocAndCopyHostData(counting::pointer auto& dst_ptr, + counting::convertible_to_pointer auto const& src, + counting::integral auto const& len, + VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitData(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocData(getDataSpace(vid), dst_ptr, len, location); 
+ copyData(getDataSpace(vid), dst_ptr, DataSpace::Host, src, len); } - template - void allocAndInitDataConst(T*& ptr, Size_type len, V val, VariantID vid) + void allocAndInitData(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataConst(getDataSpace(vid), - ptr, len, getDataAlignment(), val); + allocAndInitData(getDataSpace(vid), ptr, len, location); } - template - void allocAndInitDataRandSign(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataConst(counting::pointer auto& ptr, counting::integral auto const& len, + auto const& val, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandSign(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocAndInitDataConst(getDataSpace(vid), ptr, len, val, location); } - template - void allocAndInitDataRandValue(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataRandSign(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::allocAndInitDataRandValue(getDataSpace(vid), - ptr, len, getDataAlignment()); + allocAndInitDataRandSign(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocDataForInit(T*& ptr, Size_type len, VariantID vid) + void allocAndInitDataRandValue(counting::pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocData(hds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + allocAndInitDataRandValue(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocAndInitDataForInit(T*& ptr, Size_type len, VariantID vid) + auto 
allocDataForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocAndInitData(hds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocDataForInit(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover allocAndInitDataConstForInit(T*& ptr, Size_type len, T val, VariantID vid) + auto allocAndInitDataForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::allocAndInitDataConst(hds, ptr, len, getDataAlignment(), val); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocAndInitDataForInit(getDataSpace(vid), ptr, len, location); } - template - rajaperf::AutoDataMover scopedMoveData(T*& ptr, Size_type len, VariantID vid) + auto allocAndInitDataConstForInit(counting::raw_pointer auto& ptr, counting::integral auto const& len, + auto const& val, VariantID vid, + std::source_location location = std::source_location::current()) { - DataSpace ds = getDataSpace(vid); - DataSpace hds = rajaperf::hostCopyDataSpace(ds); - rajaperf::moveData(hds, ds, ptr, len, getDataAlignment()); - return {ds, hds, ptr, len, getDataAlignment()}; + return allocAndInitDataConstForInit(getDataSpace(vid), ptr, len, val, location); } - template - void deallocData(T*& ptr, VariantID vid) + void deallocData(counting::pointer auto& ptr, VariantID vid, + std::source_location location = std::source_location::current()) { - rajaperf::deallocData(getDataSpace(vid), ptr); + deallocData(getDataSpace(vid), ptr, location); } template @@ -639,6 +808,10 @@ class KernelBase Index_type bytes_written_per_rep; Index_type 
bytes_atomic_modify_written_per_rep; Index_type FLOPs_per_rep; + + bool enable_data_registration = false; + std::unique_ptr countingData; + double kernel_block_size = nan(""); // Set default value for non GPU kernels VariantID running_variant; diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index df059e276..339e3aca6 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -142,6 +142,30 @@ using Real_type = float; #endif +template < size_t N > +using Real_array = Real_type[N]; + +template < size_t N0, size_t N1 > +using Real_array2 = Real_type[N0][N1]; + +template < size_t N0, size_t N1, size_t N2 > +using Real_array3 = Real_type[N0][N1][N2]; + +template < size_t N0, size_t N1, size_t N2, size_t N3 > +using Real_array4 = Real_type[N0][N1][N2][N3]; + +template < size_t N > +using Real_array_ref = Real_type(&)[N]; + +template < size_t N0, size_t N1 > +using Real_array2_ref = Real_type(&)[N0][N1]; + +template < size_t N0, size_t N1, size_t N2 > +using Real_array3_ref = Real_type(&)[N0][N1][N2]; + +template < size_t N0, size_t N1, size_t N2, size_t N3 > +using Real_array4_ref = Real_type(&)[N0][N1][N2][N3]; + using Real_ptr = Real_type*; using Real_const_ptr = Real_type const *; /// From e44e5496e87e337805ce4e82cee0da53ce0c095a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 4 Nov 2025 11:54:52 -0800 Subject: [PATCH 3/3] Add counting to most kernels. Some kernels do not yet support counting and will have values of -1 to indicate that they were not counted. In some cases the kernel's helper file may not have worked cleanly with the wrappers. Some kernels use library functions like MPI or std::sort that may not reliably work with the wrappers.
--- src/algorithm/ATOMIC.cpp | 48 +++++++ src/algorithm/ATOMIC.hpp | 1 + src/algorithm/HISTOGRAM.cpp | 58 +++++++++ src/algorithm/HISTOGRAM.hpp | 1 + src/algorithm/MEMCPY.cpp | 42 ++++++ src/algorithm/MEMCPY.hpp | 1 + src/algorithm/MEMSET.cpp | 42 ++++++ src/algorithm/MEMSET.hpp | 1 + src/algorithm/REDUCE_SUM.cpp | 50 +++++++ src/algorithm/REDUCE_SUM.hpp | 1 + src/algorithm/SCAN.cpp | 45 +++++++ src/algorithm/SCAN.hpp | 1 + src/apps/CONVECTION3DPA.cpp | 144 +++++++++++++++++++++ src/apps/CONVECTION3DPA.hpp | 85 ++++++------ src/apps/DEL_DOT_VEC_2D.cpp | 43 ++++++ src/apps/DEL_DOT_VEC_2D.hpp | 1 + src/apps/DIFFUSION3DPA.cpp | 124 ++++++++++++++++++ src/apps/DIFFUSION3DPA.hpp | 101 ++++++++------- src/apps/ENERGY.cpp | 62 +++++++++ src/apps/ENERGY.hpp | 1 + src/apps/FIR.cpp | 47 +++++++ src/apps/FIR.hpp | 1 + src/apps/LTIMES_NOVIEW.cpp | 45 +++++++ src/apps/LTIMES_NOVIEW.hpp | 1 + src/apps/MASS3DEA.cpp | 68 ++++++++++ src/apps/MASS3DEA.hpp | 9 +- src/apps/MASS3DPA.cpp | 107 +++++++++++++++ src/apps/MASS3DPA.hpp | 57 ++++---- src/apps/MATVEC_3D_STENCIL.cpp | 43 ++++++ src/apps/MATVEC_3D_STENCIL.hpp | 1 + src/apps/NODAL_ACCUMULATION_3D.cpp | 43 ++++++ src/apps/NODAL_ACCUMULATION_3D.hpp | 1 + src/apps/PRESSURE.cpp | 46 +++++++ src/apps/PRESSURE.hpp | 8 ++ src/apps/VOL3D.cpp | 42 ++++++ src/apps/VOL3D.hpp | 54 ++++++++ src/apps/ZONAL_ACCUMULATION_3D.cpp | 43 ++++++ src/apps/ZONAL_ACCUMULATION_3D.hpp | 1 + src/basic/ARRAY_OF_PTRS.cpp | 42 ++++++ src/basic/ARRAY_OF_PTRS.hpp | 1 + src/basic/COPY8.cpp | 42 ++++++ src/basic/COPY8.hpp | 1 + src/basic/DAXPY.cpp | 42 ++++++ src/basic/DAXPY.hpp | 1 + src/basic/DAXPY_ATOMIC.cpp | 42 ++++++ src/basic/DAXPY_ATOMIC.hpp | 1 + src/basic/EMPTY.cpp | 42 ++++++ src/basic/EMPTY.hpp | 1 + src/basic/IF_QUAD.cpp | 42 ++++++ src/basic/IF_QUAD.hpp | 1 + src/basic/INDEXLIST.cpp | 52 ++++++++ src/basic/INDEXLIST.hpp | 1 + src/basic/INDEXLIST_3LOOP.cpp | 72 +++++++++++ src/basic/INDEXLIST_3LOOP.hpp | 1 + src/basic/INIT3.cpp | 42 ++++++ 
src/basic/INIT3.hpp | 7 + src/basic/INIT_VIEW1D.cpp | 42 ++++++ src/basic/INIT_VIEW1D.hpp | 1 + src/basic/INIT_VIEW1D_OFFSET.cpp | 42 ++++++ src/basic/INIT_VIEW1D_OFFSET.hpp | 1 + src/basic/MAT_MAT_SHARED-Cuda.cpp | 14 +- src/basic/MAT_MAT_SHARED-Hip.cpp | 14 +- src/basic/MAT_MAT_SHARED-OMP.cpp | 14 +- src/basic/MAT_MAT_SHARED-Seq.cpp | 18 ++- src/basic/MAT_MAT_SHARED-Sycl.cpp | 28 ++-- src/basic/MAT_MAT_SHARED.cpp | 83 ++++++++++++ src/basic/MAT_MAT_SHARED.hpp | 14 +- src/basic/MULADDSUB.cpp | 42 ++++++ src/basic/MULADDSUB.hpp | 1 + src/basic/MULTI_REDUCE.cpp | 60 +++++++++ src/basic/MULTI_REDUCE.hpp | 1 + src/basic/NESTED_INIT.cpp | 43 ++++++ src/basic/NESTED_INIT.hpp | 1 + src/basic/PI_ATOMIC.cpp | 55 ++++++++ src/basic/PI_ATOMIC.hpp | 1 + src/basic/PI_REDUCE.cpp | 52 ++++++++ src/basic/PI_REDUCE.hpp | 1 + src/basic/REDUCE3_INT.cpp | 56 ++++++++ src/basic/REDUCE3_INT.hpp | 1 + src/basic/REDUCE_STRUCT.cpp | 59 +++++++++ src/basic/REDUCE_STRUCT.hpp | 1 + src/basic/TRAP_INT-func.hpp | 14 ++ src/basic/TRAP_INT.cpp | 64 +++++++++ src/basic/TRAP_INT.hpp | 5 + src/comm/HALO_PACKING.cpp | 94 ++++++++++++++ src/comm/HALO_PACKING.hpp | 1 + src/comm/HALO_PACKING_FUSED.cpp | 132 +++++++++++++++++++ src/comm/HALO_PACKING_FUSED.hpp | 5 +- src/comm/HALO_base.hpp | 2 +- src/lcals/DIFF_PREDICT.cpp | 42 ++++++ src/lcals/DIFF_PREDICT.hpp | 1 + src/lcals/EOS.cpp | 42 ++++++ src/lcals/EOS.hpp | 1 + src/lcals/FIRST_DIFF.cpp | 42 ++++++ src/lcals/FIRST_DIFF.hpp | 1 + src/lcals/FIRST_MIN.cpp | 52 ++++++++ src/lcals/FIRST_MIN.hpp | 1 + src/lcals/FIRST_SUM.cpp | 42 ++++++ src/lcals/FIRST_SUM.hpp | 1 + src/lcals/GEN_LIN_RECUR.cpp | 44 +++++++ src/lcals/GEN_LIN_RECUR.hpp | 15 +++ src/lcals/HYDRO_1D.cpp | 42 ++++++ src/lcals/HYDRO_1D.hpp | 1 + src/lcals/HYDRO_2D.cpp | 58 +++++++++ src/lcals/HYDRO_2D.hpp | 1 + src/lcals/INT_PREDICT.cpp | 42 ++++++ src/lcals/INT_PREDICT.hpp | 1 + src/lcals/PLANCKIAN.cpp | 42 ++++++ src/lcals/PLANCKIAN.hpp | 7 + src/lcals/TRIDIAG_ELIM.cpp | 42 ++++++ 
src/lcals/TRIDIAG_ELIM.hpp | 1 + src/polybench/POLYBENCH_2MM.cpp | 55 ++++++++ src/polybench/POLYBENCH_2MM.hpp | 1 + src/polybench/POLYBENCH_3MM.cpp | 65 ++++++++++ src/polybench/POLYBENCH_3MM.hpp | 1 + src/polybench/POLYBENCH_ADI.cpp | 57 ++++++++ src/polybench/POLYBENCH_ADI.hpp | 44 +++++++ src/polybench/POLYBENCH_ATAX.cpp | 51 ++++++++ src/polybench/POLYBENCH_ATAX.hpp | 1 + src/polybench/POLYBENCH_FDTD_2D.cpp | 56 ++++++++ src/polybench/POLYBENCH_FDTD_2D.hpp | 1 + src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp | 43 ++++++ src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp | 1 + src/polybench/POLYBENCH_GEMM.cpp | 46 +++++++ src/polybench/POLYBENCH_GEMM.hpp | 1 + src/polybench/POLYBENCH_GEMVER.cpp | 61 +++++++++ src/polybench/POLYBENCH_GEMVER.hpp | 1 + src/polybench/POLYBENCH_GESUMMV.cpp | 43 ++++++ src/polybench/POLYBENCH_GESUMMV.hpp | 1 + src/polybench/POLYBENCH_HEAT_3D.cpp | 53 ++++++++ src/polybench/POLYBENCH_HEAT_3D.hpp | 1 + src/polybench/POLYBENCH_JACOBI_1D.cpp | 45 +++++++ src/polybench/POLYBENCH_JACOBI_1D.hpp | 1 + src/polybench/POLYBENCH_JACOBI_2D.cpp | 49 +++++++ src/polybench/POLYBENCH_JACOBI_2D.hpp | 1 + src/polybench/POLYBENCH_MVT.cpp | 51 ++++++++ src/polybench/POLYBENCH_MVT.hpp | 1 + src/stream/ADD.cpp | 42 ++++++ src/stream/ADD.hpp | 1 + src/stream/COPY.cpp | 42 ++++++ src/stream/COPY.hpp | 1 + src/stream/DOT.cpp | 52 ++++++++ src/stream/DOT.hpp | 1 + src/stream/MUL.cpp | 42 ++++++ src/stream/MUL.hpp | 1 + src/stream/TRIAD.cpp | 42 ++++++ src/stream/TRIAD.hpp | 1 + 147 files changed, 4182 insertions(+), 159 deletions(-) diff --git a/src/algorithm/ATOMIC.cpp b/src/algorithm/ATOMIC.cpp index 2f163a61a..30dcc8192 100644 --- a/src/algorithm/ATOMIC.cpp +++ b/src/algorithm/ATOMIC.cpp @@ -78,5 +78,53 @@ void ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include 
"common/CountingMacros.hpp" + +void ATOMIC::setCountedAttributes() +{ + const size_t replication = getActualProblemSize(); + + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ATOMIC_DATA_SETUP(replication); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ATOMIC_BODY(RAJAPERF_ATOMIC_ADD_COUNTING, i, ATOMIC_VALUE)); + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + ATOMIC_DATA_TEARDOWN(replication); + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/ATOMIC.hpp b/src/algorithm/ATOMIC.hpp index 92348aea5..1800b5ce1 100644 --- a/src/algorithm/ATOMIC.hpp +++ b/src/algorithm/ATOMIC.hpp @@ -68,6 +68,7 @@ class ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/HISTOGRAM.cpp b/src/algorithm/HISTOGRAM.cpp index 57ae5a6a6..969ae125e 100644 --- a/src/algorithm/HISTOGRAM.cpp +++ b/src/algorithm/HISTOGRAM.cpp @@ -146,5 +146,63 @@ void HISTOGRAM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(DataSpace::Host, m_counts_final); } + +// // Only define setCountedAttributes functions past this point +// // BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HISTOGRAM::setCountedAttributes() +{ + VariantID vid = 
VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HISTOGRAM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_SETUP_COUNTS; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_INIT_COUNTS; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(HISTOGRAM_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_FINALIZE_COUNTS; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HISTOGRAM_TEARDOWN_COUNTS; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/HISTOGRAM.hpp b/src/algorithm/HISTOGRAM.hpp index 852b65fa6..7472e8eb2 100644 --- a/src/algorithm/HISTOGRAM.hpp +++ b/src/algorithm/HISTOGRAM.hpp @@ -90,6 +90,7 @@ class HISTOGRAM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/MEMCPY.cpp b/src/algorithm/MEMCPY.cpp index d5f28a5f3..8923bca6c 100644 --- a/src/algorithm/MEMCPY.cpp +++ b/src/algorithm/MEMCPY.cpp @@ -79,5 +79,47 @@ void MEMCPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MEMCPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + 
size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MEMCPY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MEMCPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/MEMCPY.hpp b/src/algorithm/MEMCPY.hpp index 90f506613..920788869 100644 --- a/src/algorithm/MEMCPY.hpp +++ b/src/algorithm/MEMCPY.hpp @@ -48,6 +48,7 @@ class MEMCPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/MEMSET.cpp b/src/algorithm/MEMSET.cpp index 638abf20a..8ddccff64 100644 --- a/src/algorithm/MEMSET.cpp +++ b/src/algorithm/MEMSET.cpp @@ -78,5 +78,47 @@ void MEMSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MEMSET::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MEMSET_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for 
(Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MEMSET_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/MEMSET.hpp b/src/algorithm/MEMSET.hpp index 2719751bf..6c97ba489 100644 --- a/src/algorithm/MEMSET.hpp +++ b/src/algorithm/MEMSET.hpp @@ -48,6 +48,7 @@ class MEMSET : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/REDUCE_SUM.cpp b/src/algorithm/REDUCE_SUM.cpp index 8adb472a4..958f977b5 100644 --- a/src/algorithm/REDUCE_SUM.cpp +++ b/src/algorithm/REDUCE_SUM.cpp @@ -81,5 +81,55 @@ void REDUCE_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void REDUCE_SUM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_SUM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type sum = m_sum_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(REDUCE_SUM_BODY); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_sum = sum; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + 
RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/REDUCE_SUM.hpp b/src/algorithm/REDUCE_SUM.hpp index b0e504349..00260d734 100644 --- a/src/algorithm/REDUCE_SUM.hpp +++ b/src/algorithm/REDUCE_SUM.hpp @@ -52,6 +52,7 @@ class REDUCE_SUM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/algorithm/SCAN.cpp b/src/algorithm/SCAN.cpp index ba54a8ceb..f7d193614 100644 --- a/src/algorithm/SCAN.cpp +++ b/src/algorithm/SCAN.cpp @@ -83,5 +83,50 @@ void SCAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void SCAN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + SCAN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + SCAN_PROLOGUE; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(SCAN_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace algorithm } // end namespace rajaperf diff --git a/src/algorithm/SCAN.hpp b/src/algorithm/SCAN.hpp index f4ad374ac..2a5574d65 100644 --- a/src/algorithm/SCAN.hpp +++ b/src/algorithm/SCAN.hpp @@ -56,6 +56,7 @@ class SCAN : public 
KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/CONVECTION3DPA.cpp b/src/apps/CONVECTION3DPA.cpp index d476e1289..1af41e83e 100644 --- a/src/apps/CONVECTION3DPA.cpp +++ b/src/apps/CONVECTION3DPA.cpp @@ -104,5 +104,149 @@ void CONVECTION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void CONVECTION3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + CONVECTION3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx,x,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_1); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_2); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) 
+ { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz,z,CPA_Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_4); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz,z,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_5); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy,y,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_6); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx,x,CPA_Q1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_7); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz,z,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy,y,CPA_D1D)) + { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx,x,CPA_D1D)) + { + RAJAPERF_COUNTERS_LOOP_BODY(CONVECTION3DPA_8); + } + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/CONVECTION3DPA.hpp b/src/apps/CONVECTION3DPA.hpp index 688dd1649..4f147cf28 100644 --- a/src/apps/CONVECTION3DPA.hpp +++ b/src/apps/CONVECTION3DPA.hpp @@ -223,51 +223,51 @@ Index_type NE = m_NE; D[qx + CPA_Q1D * qy + CPA_Q1D * CPA_Q1D * qz + CPA_Q1D * CPA_Q1D * CPA_Q1D * d + CPA_VDIM * CPA_Q1D * CPA_Q1D * CPA_Q1D * e] #define 
CONVECTION3DPA_0_GPU \ - constexpr int max_D1D = CPA_D1D; \ - constexpr int max_Q1D = CPA_Q1D; \ - constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; \ - RAJA_TEAM_SHARED double sm0[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm1[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm2[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm3[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm4[max_DQ*max_DQ*max_DQ]; \ - RAJA_TEAM_SHARED double sm5[max_DQ*max_DQ*max_DQ]; \ - double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; \ - double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; \ - double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; \ - double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; \ - double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; \ - double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; \ - double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; \ - double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; + constexpr auto max_D1D = CPA_D1D; \ + constexpr auto max_Q1D = CPA_Q1D; \ + constexpr auto max_DQ = (max_Q1D > max_D1D) ? 
max_Q1D : max_D1D; \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + RAJA_TEAM_SHARED Real_array3 sm2; \ + RAJA_TEAM_SHARED Real_array3 sm3; \ + RAJA_TEAM_SHARED Real_array3 sm4; \ + RAJA_TEAM_SHARED Real_array3 sm5; \ + Real_array3_ref u( sm0); \ + Real_array3_ref Bu(sm1); \ + Real_array3_ref Gu(sm2); \ + Real_array3_ref BBu(sm3); \ + Real_array3_ref GBu(sm4); \ + Real_array3_ref BGu(sm5); \ + Real_array3_ref GBBu(sm0); \ + Real_array3_ref BGBu(sm1); \ + Real_array3_ref BBGu(sm2); \ + Real_array3_ref DGu(sm3); \ + Real_array3_ref BDGu(sm4); \ + Real_array3_ref BBDGu(sm5); #define CONVECTION3DPA_0_CPU \ - constexpr int max_D1D = CPA_D1D; \ - constexpr int max_Q1D = CPA_Q1D; \ - constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D; \ - double sm0[max_DQ*max_DQ*max_DQ]; \ - double sm1[max_DQ*max_DQ*max_DQ]; \ - double sm2[max_DQ*max_DQ*max_DQ]; \ - double sm3[max_DQ*max_DQ*max_DQ]; \ - double sm4[max_DQ*max_DQ*max_DQ]; \ - double sm5[max_DQ*max_DQ*max_DQ]; \ - double (*u)[max_D1D][max_D1D] = (double (*)[max_D1D][max_D1D]) sm0; \ - double (*Bu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm1; \ - double (*Gu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm2; \ - double (*BBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*GBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm5; \ - double (*GBBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm0; \ - double (*BGBu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm1; \ - double (*BBGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm2; \ - double (*DGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm3; \ - double (*BDGu)[max_Q1D][max_Q1D] = (double (*)[max_Q1D][max_Q1D])sm4; \ - double (*BBDGu)[max_D1D][max_Q1D] = (double (*)[max_D1D][max_Q1D])sm5; + constexpr auto max_D1D = CPA_D1D; \ + constexpr auto max_Q1D = CPA_Q1D; \ + constexpr auto max_DQ = (max_Q1D 
> max_D1D) ? max_Q1D : max_D1D; \ + Real_array3 sm0; \ + Real_array3 sm1; \ + Real_array3 sm2; \ + Real_array3 sm3; \ + Real_array3 sm4; \ + Real_array3 sm5; \ + Real_array3_ref u( sm0); \ + Real_array3_ref Bu(sm1); \ + Real_array3_ref Gu(sm2); \ + Real_array3_ref BBu(sm3); \ + Real_array3_ref GBu(sm4); \ + Real_array3_ref BGu(sm5); \ + Real_array3_ref GBBu(sm0); \ + Real_array3_ref BGBu(sm1); \ + Real_array3_ref BBGu(sm2); \ + Real_array3_ref DGu(sm3); \ + Real_array3_ref BDGu(sm4); \ + Real_array3_ref BBDGu(sm5); #define CONVECTION3DPA_1 \ u[dz][dy][dx] = CPA_X(dx,dy,dz,e); @@ -372,6 +372,7 @@ class CONVECTION3DPA : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index dcd7a29a5..5bb2aa6a1 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -113,5 +113,48 @@ void DEL_DOT_VEC_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_div, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DEL_DOT_VEC_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + DEL_DOT_VEC_2D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(DEL_DOT_VEC_2D_BODY_INDEX); + 
RAJAPERF_COUNTERS_LOOP_BODY(DEL_DOT_VEC_2D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index 14bd78533..8e1e75738 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -112,6 +112,7 @@ class DEL_DOT_VEC_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/DIFFUSION3DPA.cpp b/src/apps/DIFFUSION3DPA.cpp index 705622b30..a01caeb7f 100644 --- a/src/apps/DIFFUSION3DPA.cpp +++ b/src/apps/DIFFUSION3DPA.cpp @@ -103,5 +103,129 @@ void DIFFUSION3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DIFFUSION3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + DIFFUSION3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_1); + } + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, 
DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_2); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_4); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_5); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, x, DPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_6); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_7); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qz, z, DPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_8); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dz, z, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, DPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, DPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFFUSION3DPA_9); + } + } + 
} + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/DIFFUSION3DPA.hpp b/src/apps/DIFFUSION3DPA.hpp index b03e90ea0..52fffdbba 100644 --- a/src/apps/DIFFUSION3DPA.hpp +++ b/src/apps/DIFFUSION3DPA.hpp @@ -254,58 +254,58 @@ const bool symmetric = true; (((q)<=(d)) ? -1.0 : 1.0) #define DIFFUSION3DPA_0_GPU \ - constexpr int MQ1 = DPA_Q1D; \ - constexpr int MD1 = DPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - RAJA_TEAM_SHARED double sm0[3][MDQ*MDQ*MDQ]; \ - RAJA_TEAM_SHARED double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + constexpr auto MQ1 = DPA_Q1D; \ + constexpr auto MD1 = DPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 sBG; \ + Real_array2_ref B(sBG); \ + Real_array2_ref G(sBG); \ + Real_array2_ref Bt(sBG); \ + Real_array2_ref Gt(sBG); \ + RAJA_TEAM_SHARED Real_array4<3, MDQ, MDQ, MDQ> sm0; \ + RAJA_TEAM_SHARED Real_array4<3, MDQ, MDQ, MDQ> sm1; \ + Real_array3_ref s_X(sm0[2]); \ + Real_array3_ref DDQ0(sm0[0]); \ + Real_array3_ref DDQ1(sm0[1]); \ + Real_array3_ref DQQ0(sm1[0]); \ + Real_array3_ref DQQ1(sm1[1]); \ + Real_array3_ref DQQ2(sm1[2]); \ + Real_array3_ref QQQ0(sm0[0]); \ + Real_array3_ref QQQ1(sm0[1]); \ + Real_array3_ref QQQ2(sm0[2]); \ + Real_array3_ref QQD0(sm1[0]); \ + Real_array3_ref QQD1(sm1[1]); \ + Real_array3_ref QQD2(sm1[2]); \ + Real_array3_ref QDD0(sm0[0]); \ + Real_array3_ref QDD1(sm0[1]); \ + Real_array3_ref QDD2(sm0[2]); #define DIFFUSION3DPA_0_CPU \ - constexpr int MQ1 = DPA_Q1D; \ - constexpr int MD1 = DPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - double sBG[MQ1*MD1]; \ - double (*B)[MD1] = (double (*)[MD1]) sBG; \ - double (*G)[MD1] = (double (*)[MD1]) sBG; \ - double (*Bt)[MQ1] = (double (*)[MQ1]) sBG; \ - double (*Gt)[MQ1] = (double (*)[MQ1]) sBG; \ - double sm0[3][MDQ*MDQ*MDQ]; \ - double sm1[3][MDQ*MDQ*MDQ]; \ - double (*s_X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); \ - double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0); \ - double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1); \ - double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0); \ - double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1); \ - double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2); \ - double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0); \ - double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1); \ - double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2); \ - double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0); \ - double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1); \ - double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2); \ - double (*QDD0)[MD1][MD1] = (double 
(*)[MD1][MD1]) (sm0+0); \ - double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1); \ - double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2); + constexpr auto MQ1 = DPA_Q1D; \ + constexpr auto MD1 = DPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_array2 sBG; \ + Real_array2_ref B(sBG); \ + Real_array2_ref G(sBG); \ + Real_array2_ref Bt(sBG); \ + Real_array2_ref Gt(sBG); \ + Real_array4<3, MDQ, MDQ, MDQ> sm0; \ + Real_array4<3, MDQ, MDQ, MDQ> sm1; \ + Real_array3_ref s_X(sm0[2]); \ + Real_array3_ref DDQ0(sm0[0]); \ + Real_array3_ref DDQ1(sm0[1]); \ + Real_array3_ref DQQ0(sm1[0]); \ + Real_array3_ref DQQ1(sm1[1]); \ + Real_array3_ref DQQ2(sm1[2]); \ + Real_array3_ref QQQ0(sm0[0]); \ + Real_array3_ref QQQ1(sm0[1]); \ + Real_array3_ref QQQ2(sm0[2]); \ + Real_array3_ref QQD0(sm1[0]); \ + Real_array3_ref QQD1(sm1[1]); \ + Real_array3_ref QQD2(sm1[2]); \ + Real_array3_ref QDD0(sm0[0]); \ + Real_array3_ref QDD1(sm0[1]); \ + Real_array3_ref QDD2(sm0[2]); #define DIFFUSION3DPA_1 \ s_X[dz][dy][dx] = DPA_X(dx,dy,dz,e); @@ -461,6 +461,7 @@ class DIFFUSION3DPA : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index cdcc75a7f..8a3865cdc 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -133,5 +133,67 @@ void ENERGY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vnewc, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ENERGY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ENERGY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY2); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY4); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY5); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ENERGY_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index 079ff07d2..0cdc06158 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -197,6 +197,7 @@ class ENERGY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index e250cb91e..99ecb6e11 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -88,5 +88,52 @@ void FIR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_out, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types 
(Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIR::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIR_COEFF; + + FIR_DATA_SETUP; + + Real_type coeff[FIR_COEFFLEN]; + std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff)); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIR_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 11e5c8e2c..fbb4eec1d 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -72,6 +72,7 @@ class FIR : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp index 9cc621909..443efbf3e 100644 --- a/src/apps/LTIMES_NOVIEW.cpp +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -106,5 +106,50 @@ void LTIMES_NOVIEW::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_psidat, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void LTIMES_NOVIEW::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + 
RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + LTIMES_NOVIEW_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type z = 0; z < num_z; ++z )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type g = 0; g < num_g; ++g )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type m = 0; m < num_m; ++m )) { + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type d = 0; d < num_d; ++d )) { + RAJAPERF_COUNTERS_LOOP_BODY(LTIMES_NOVIEW_BODY); + } + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 8c0d4652b..811295c9e 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -60,6 +60,7 @@ class LTIMES_NOVIEW : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/MASS3DEA.cpp b/src/apps/MASS3DEA.cpp index 322ed9163..fb800bb91 100644 --- a/src/apps/MASS3DEA.cpp +++ b/src/apps/MASS3DEA.cpp @@ -93,5 +93,73 @@ void MASS3DEA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_M, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MASS3DEA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + MASS3DEA_DATA_SETUP; + ); + + 
RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, x, MEA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, y, MEA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_1); + } + } + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_2_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k1, x, MEA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k2, y, MEA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(k3, z, MEA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_3); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i1, x, MEA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i2, y, MEA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(i3, z, MEA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DEA_4); + } + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DEA.hpp b/src/apps/MASS3DEA.hpp index 1b3ca0074..465f8bf88 100644 --- a/src/apps/MASS3DEA.hpp +++ b/src/apps/MASS3DEA.hpp @@ -97,17 +97,17 @@ D[qx + MEA_Q1D * qy + MEA_Q1D * MEA_Q1D * qz + \ MEA_Q1D * MEA_Q1D * MEA_Q1D * e] -#define MASS3DEA_0 RAJA_TEAM_SHARED double s_B[MEA_Q1D][MEA_D1D]; +#define MASS3DEA_0 RAJA_TEAM_SHARED Real_array2 s_B; -#define MASS3DEA_0_CPU double s_B[MEA_Q1D][MEA_D1D]; +#define MASS3DEA_0_CPU Real_array2 s_B; #define MASS3DEA_1 s_B[q][d] = MEA_B(q, d); #define MASS3DEA_2 \ - RAJA_TEAM_SHARED double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; + RAJA_TEAM_SHARED Real_array3 s_D; #define MASS3DEA_2_CPU \ - double s_D[MEA_Q1D][MEA_Q1D][MEA_Q1D]; + Real_array3 s_D; #define MASS3DEA_3 s_D[k1][k2][k3] = MEA_D(k1, k2, k3, e); @@ -146,6 +146,7 @@ class MASS3DEA : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void 
updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/MASS3DPA.cpp b/src/apps/MASS3DPA.cpp index 2c8ef5c39..22c5e3d42 100644 --- a/src/apps/MASS3DPA.cpp +++ b/src/apps/MASS3DPA.cpp @@ -99,5 +99,112 @@ void MASS3DPA::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_Y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MASS3DPA::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + MASS3DPA_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type e = 0; e < NE; ++e)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_0_CPU); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_1); + } + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_2); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_3); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, MPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_4); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, MPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qx, x, 
MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_5); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(d, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(q, x, MPA_Q1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_6); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(qy, y, MPA_Q1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_7); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_8); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dy, y, MPA_D1D)) { + RAJAPERF_COUNTERS_PAR_LOOP(CPU_FOREACH(dx, x, MPA_D1D)) { + RAJAPERF_COUNTERS_LOOP_BODY(MASS3DPA_9); + } + } + + } // element loop + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index 5fe683b5c..f6b6332b6 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -181,36 +181,36 @@ Index_type NE = m_NE; D[qx + MPA_Q1D * qy + MPA_Q1D * MPA_Q1D * qz + MPA_Q1D * MPA_Q1D * MPA_Q1D * e] #define MASS3DPA_0_CPU \ - constexpr int MQ1 = MPA_Q1D; \ - constexpr int MD1 = MPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ - double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - double sm0[MDQ * MDQ * MDQ]; \ - double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + constexpr auto MQ1 = MPA_Q1D; \ + constexpr auto MD1 = MPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ + Real_array2 sDQ; \ + Real_array2_ref Bsmem(sDQ); \ + Real_array2_ref Btsmem(sDQ); \ + Real_array3 sm0; \ + Real_array3 sm1; \ + Real_array3_ref Xsmem(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASS3DPA_0_GPU \ - constexpr int MQ1 = MPA_Q1D; \ - constexpr int MD1 = MPA_D1D; \ - constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ - RAJA_TEAM_SHARED double sDQ[MQ1 * MD1]; \ - double(*Bsmem)[MD1] = (double(*)[MD1])sDQ; \ - double(*Btsmem)[MQ1] = (double(*)[MQ1])sDQ; \ - RAJA_TEAM_SHARED double sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED double sm1[MDQ * MDQ * MDQ]; \ - double(*Xsmem)[MD1][MD1] = (double(*)[MD1][MD1])sm0; \ - double(*DDQ)[MD1][MQ1] = (double(*)[MD1][MQ1])sm1; \ - double(*DQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm0; \ - double(*QQQ)[MQ1][MQ1] = (double(*)[MQ1][MQ1])sm1; \ - double(*QQD)[MQ1][MD1] = (double(*)[MQ1][MD1])sm0; \ - double(*QDD)[MD1][MD1] = (double(*)[MD1][MD1])sm1; + constexpr auto MQ1 = MPA_Q1D; \ + constexpr auto MD1 = MPA_D1D; \ + constexpr auto MDQ = (MQ1 > MD1) ? 
MQ1 : MD1; \ + RAJA_TEAM_SHARED Real_array2 sDQ; \ + Real_array2_ref Bsmem(sDQ); \ + Real_array2_ref Btsmem(sDQ); \ + RAJA_TEAM_SHARED Real_array3 sm0; \ + RAJA_TEAM_SHARED Real_array3 sm1; \ + Real_array3_ref Xsmem(sm0); \ + Real_array3_ref DDQ(sm1); \ + Real_array3_ref DQQ(sm0); \ + Real_array3_ref QQQ(sm1); \ + Real_array3_ref QQD(sm0); \ + Real_array3_ref QDD(sm1); #define MASS3DPA_1 \ RAJAPERF_UNROLL(MD1) \ @@ -357,6 +357,7 @@ class MASS3DPA : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/MATVEC_3D_STENCIL.cpp b/src/apps/MATVEC_3D_STENCIL.cpp index 081a648af..7179dd0aa 100644 --- a/src/apps/MATVEC_3D_STENCIL.cpp +++ b/src/apps/MATVEC_3D_STENCIL.cpp @@ -182,5 +182,48 @@ void MATVEC_3D_STENCIL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MATVEC_3D_STENCIL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + MATVEC_3D_STENCIL_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(MATVEC_3D_STENCIL_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(MATVEC_3D_STENCIL_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } 
// end namespace apps } // end namespace rajaperf diff --git a/src/apps/MATVEC_3D_STENCIL.hpp b/src/apps/MATVEC_3D_STENCIL.hpp index e65ea1dad..9c73bb19c 100644 --- a/src/apps/MATVEC_3D_STENCIL.hpp +++ b/src/apps/MATVEC_3D_STENCIL.hpp @@ -131,6 +131,7 @@ class MATVEC_3D_STENCIL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/NODAL_ACCUMULATION_3D.cpp b/src/apps/NODAL_ACCUMULATION_3D.cpp index 3ffcce23d..544cde71a 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.cpp +++ b/src/apps/NODAL_ACCUMULATION_3D.cpp @@ -102,5 +102,48 @@ void NODAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void NODAL_ACCUMULATION_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + NODAL_ACCUMULATION_3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(NODAL_ACCUMULATION_3D_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(NODAL_ACCUMULATION_3D_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/NODAL_ACCUMULATION_3D.hpp
b/src/apps/NODAL_ACCUMULATION_3D.hpp index c2ad0a1dc..c41abadd1 100644 --- a/src/apps/NODAL_ACCUMULATION_3D.hpp +++ b/src/apps/NODAL_ACCUMULATION_3D.hpp @@ -79,6 +79,7 @@ class NODAL_ACCUMULATION_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index a51f43729..f16cb9bf7 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -96,5 +96,51 @@ void PRESSURE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vnewc, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PRESSURE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PRESSURE_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PRESSURE_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PRESSURE_OPT_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index fb51c7e90..4929bf427 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -44,6 +44,13 @@ if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 
; \ if ( p_new[i] < pmin ) p_new[i] = pmin ; +#define PRESSURE_OPT_BODY2 \ + Real_type p = bvc[i] * e_old[i] ; \ + if ( fabs(p) < p_cut ) p = 0.0 ; \ + if ( vnewc[i] >= eosvmax ) p = 0.0 ; \ + if ( p < pmin ) p = pmin ; \ + p_new[i] = p; + #include "common/KernelBase.hpp" @@ -65,6 +72,7 @@ class PRESSURE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 11ef9030c..6f68bd530 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -108,5 +108,47 @@ void VOL3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vol, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void VOL3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + VOL3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin ; i < iend ; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(VOL3D_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 24715cbee..0ff178f19 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -136,6 +136,59 @@ \ vol[i] *= vnormq ; +#define VOL3D_OPT_BODY \ + Real_type x71 = x7[i] - x1[i] ; \ + 
Real_type x72 = x7[i] - x2[i] ; \ + Real_type x74 = x7[i] - x4[i] ; \ + Real_type x30 = x3[i] - x0[i] ; \ + Real_type x50 = x5[i] - x0[i] ; \ + Real_type x60 = x6[i] - x0[i] ; \ + \ + Real_type y71 = y7[i] - y1[i] ; \ + Real_type y72 = y7[i] - y2[i] ; \ + Real_type y74 = y7[i] - y4[i] ; \ + Real_type y30 = y3[i] - y0[i] ; \ + Real_type y50 = y5[i] - y0[i] ; \ + Real_type y60 = y6[i] - y0[i] ; \ + \ + Real_type z71 = z7[i] - z1[i] ; \ + Real_type z72 = z7[i] - z2[i] ; \ + Real_type z74 = z7[i] - z4[i] ; \ + Real_type z30 = z3[i] - z0[i] ; \ + Real_type z50 = z5[i] - z0[i] ; \ + Real_type z60 = z6[i] - z0[i] ; \ + \ + Real_type xps = x71 + x60 ; \ + Real_type yps = y71 + y60 ; \ + Real_type zps = z71 + z60 ; \ + \ + Real_type cyz = y72 * z30 - z72 * y30 ; \ + Real_type czx = z72 * x30 - x72 * z30 ; \ + Real_type cxy = x72 * y30 - y72 * x30 ; \ + Real_type v = xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x72 + x50 ; \ + yps = y72 + y50 ; \ + zps = z72 + z50 ; \ + \ + cyz = y74 * z60 - z74 * y60 ; \ + czx = z74 * x60 - x74 * z60 ; \ + cxy = x74 * y60 - y74 * x60 ; \ + v += xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x74 + x30 ; \ + yps = y74 + y30 ; \ + zps = z74 + z30 ; \ + \ + cyz = y71 * z50 - z71 * y50 ; \ + czx = z71 * x50 - x71 * z50 ; \ + cxy = x71 * y50 - y71 * x50 ; \ + v += xps * cyz + yps * czx + zps * cxy ; \ + \ + v *= vnormq ; \ + \ + vol[i] = v ; + #include "common/KernelBase.hpp" @@ -158,6 +211,7 @@ class VOL3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/apps/ZONAL_ACCUMULATION_3D.cpp b/src/apps/ZONAL_ACCUMULATION_3D.cpp index 775e4dc18..eef222050 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.cpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.cpp @@ -104,5 +104,48 @@ void 
ZONAL_ACCUMULATION_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(t deallocData(m_real_zones, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ZONAL_ACCUMULATION_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + ZONAL_ACCUMULATION_3D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ii = ibegin ; ii < iend ; ++ii )) { + RAJAPERF_COUNTERS_LOOP_BODY(ZONAL_ACCUMULATION_3D_BODY_INDEX); + RAJAPERF_COUNTERS_LOOP_BODY(ZONAL_ACCUMULATION_3D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/ZONAL_ACCUMULATION_3D.hpp b/src/apps/ZONAL_ACCUMULATION_3D.hpp index 758682764..34cded75c 100644 --- a/src/apps/ZONAL_ACCUMULATION_3D.hpp +++ b/src/apps/ZONAL_ACCUMULATION_3D.hpp @@ -75,6 +75,7 @@ class ZONAL_ACCUMULATION_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/ARRAY_OF_PTRS.cpp b/src/basic/ARRAY_OF_PTRS.cpp index 9b7577d05..b3518435d 100644 --- a/src/basic/ARRAY_OF_PTRS.cpp +++ b/src/basic/ARRAY_OF_PTRS.cpp @@ -85,5 +85,47 @@ void ARRAY_OF_PTRS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions 
past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ARRAY_OF_PTRS::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ARRAY_OF_PTRS_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ARRAY_OF_PTRS_BODY(x)); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/ARRAY_OF_PTRS.hpp b/src/basic/ARRAY_OF_PTRS.hpp index bbebbf25e..6d5f4c76c 100644 --- a/src/basic/ARRAY_OF_PTRS.hpp +++ b/src/basic/ARRAY_OF_PTRS.hpp @@ -64,6 +64,7 @@ class ARRAY_OF_PTRS : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/COPY8.cpp b/src/basic/COPY8.cpp index 5ce1685cd..0a0e57e86 100644 --- a/src/basic/COPY8.cpp +++ b/src/basic/COPY8.cpp @@ -117,5 +117,47 @@ void COPY8::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y7, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void COPY8::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, 
tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + COPY8_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(COPY8_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/COPY8.hpp b/src/basic/COPY8.hpp index 1afa1bcb9..de7a4a007 100644 --- a/src/basic/COPY8.hpp +++ b/src/basic/COPY8.hpp @@ -73,6 +73,7 @@ class COPY8 : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 2cdd97d99..82fcb0eae 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -85,5 +85,47 @@ void DAXPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DAXPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DAXPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 
tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 43c2f6a90..fead77739 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -46,6 +46,7 @@ class DAXPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/DAXPY_ATOMIC.cpp b/src/basic/DAXPY_ATOMIC.cpp index 87fa48d49..7785bb3c1 100644 --- a/src/basic/DAXPY_ATOMIC.cpp +++ b/src/basic/DAXPY_ATOMIC.cpp @@ -83,5 +83,47 @@ void DAXPY_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DAXPY_ATOMIC::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DAXPY_ATOMIC_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(RAJAPERF_ATOMIC_ADD_COUNTING(y[i], a * x[i]);); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/DAXPY_ATOMIC.hpp b/src/basic/DAXPY_ATOMIC.hpp index 4b8d91dcf..89b150ad2 100644 --- a/src/basic/DAXPY_ATOMIC.hpp +++ b/src/basic/DAXPY_ATOMIC.hpp @@ -46,6 +46,7 @@ class 
DAXPY_ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/EMPTY.cpp b/src/basic/EMPTY.cpp index f7532de8b..163faccc8 100644 --- a/src/basic/EMPTY.cpp +++ b/src/basic/EMPTY.cpp @@ -73,5 +73,47 @@ void EMPTY::tearDown(VariantID RAJAPERF_UNUSED_ARG(vid), size_t RAJAPERF_UNUSED_ { } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void EMPTY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + EMPTY_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(EMPTY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/EMPTY.hpp b/src/basic/EMPTY.hpp index 5e7a1b156..61e4cd6b0 100644 --- a/src/basic/EMPTY.hpp +++ b/src/basic/EMPTY.hpp @@ -50,6 +50,7 @@ class EMPTY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index f3e264a02..ce66d528a 100644 
--- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -97,5 +97,47 @@ void IF_QUAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void IF_QUAD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + IF_QUAD_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(IF_QUAD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index 151b4ad8c..58dcb3b1d 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -63,6 +63,7 @@ class IF_QUAD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INDEXLIST.cpp b/src/basic/INDEXLIST.cpp index d4ab1d659..7cb0c9f4d 100644 --- a/src/basic/INDEXLIST.cpp +++ b/src/basic/INDEXLIST.cpp @@ -83,5 +83,57 @@ void INDEXLIST::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_list, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include 
"common/CountingMacros.hpp" + +void INDEXLIST::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type count = 0; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INDEXLIST_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_len = count; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST.hpp b/src/basic/INDEXLIST.hpp index 7ec1414bf..774bd85b7 100644 --- a/src/basic/INDEXLIST.hpp +++ b/src/basic/INDEXLIST.hpp @@ -54,6 +54,7 @@ class INDEXLIST : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INDEXLIST_3LOOP.cpp b/src/basic/INDEXLIST_3LOOP.cpp index 733397f2e..08a715287 100644 --- a/src/basic/INDEXLIST_3LOOP.cpp +++ b/src/basic/INDEXLIST_3LOOP.cpp @@ -91,5 +91,77 @@ void INDEXLIST_3LOOP::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_id deallocData(m_list, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INDEXLIST_3LOOP::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + 
RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INDEXLIST_3LOOP_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + INDEXLIST_3LOOP_COUNTS_SETUP(DataSpace::Host); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(counts[i] = (INDEXLIST_3LOOP_CONDITIONAL) ? 1 : 0); + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type count = 0; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend+1; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY( + Index_type inc = counts[i]; + counts[i] = count; + count += inc; + ); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INDEXLIST_3LOOP_MAKE_LIST); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_len = counts[iend]; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + INDEXLIST_3LOOP_COUNTS_TEARDOWN(DataSpace::Host); + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INDEXLIST_3LOOP.hpp b/src/basic/INDEXLIST_3LOOP.hpp index 5a9e1e7ab..f5e9e3661 100644 --- a/src/basic/INDEXLIST_3LOOP.hpp +++ b/src/basic/INDEXLIST_3LOOP.hpp @@ -72,6 +72,7 @@ class INDEXLIST_3LOOP : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index f18cfcbba..293d71ac8 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -92,5 +92,47 @@ void 
INIT3::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_in2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT3::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INIT3_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT3_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 872edc6c0..b6e966233 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -28,6 +28,12 @@ #define INIT3_BODY \ out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; +#define INIT3_OPT_BODY \ + Real_type tmp = - in1[i] - in2[i]; \ + out1[i] = tmp ; \ + out2[i] = tmp ; \ + out3[i] = tmp ; + #include "common/KernelBase.hpp" @@ -49,6 +55,7 @@ class INIT3 : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 3790de2b8..f458297c1 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -84,5 +84,47 @@ void INIT_VIEW1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_a, vid); } + +// Only 
define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT_VIEW1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INIT_VIEW1D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT_VIEW1D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 05daf479b..06a6c9c9e 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -60,6 +60,7 @@ class INIT_VIEW1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp index 3e028af6c..66a32aac9 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.cpp +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -84,5 +84,47 @@ void INIT_VIEW1D_OFFSET::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune deallocData(m_a, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INIT_VIEW1D_OFFSET::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 
0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize()+1; + + INIT_VIEW1D_OFFSET_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INIT_VIEW1D_OFFSET_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp index 01a6712d9..cf422cd6c 100644 --- a/src/basic/INIT_VIEW1D_OFFSET.hpp +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -59,6 +59,7 @@ class INIT_VIEW1D_OFFSET : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/MAT_MAT_SHARED-Cuda.cpp b/src/basic/MAT_MAT_SHARED-Cuda.cpp index bad2c1cd0..297f81a35 100644 --- a/src/basic/MAT_MAT_SHARED-Cuda.cpp +++ b/src/basic/MAT_MAT_SHARED-Cuda.cpp @@ -39,7 +39,9 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, __syncthreads(); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } __syncthreads(); } @@ -128,7 +130,11 @@ void MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + }; { Index_type tx = threadIdx.x; @@ -241,7 +247,9 @@ void 
MAT_MAT_SHARED::runCudaVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Hip.cpp b/src/basic/MAT_MAT_SHARED-Hip.cpp index f2c29f04e..0d54e8c35 100644 --- a/src/basic/MAT_MAT_SHARED-Hip.cpp +++ b/src/basic/MAT_MAT_SHARED-Hip.cpp @@ -39,7 +39,9 @@ __global__ void mat_mat_shared(Index_type N, Real_ptr C, Real_ptr A, __syncthreads(); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } __syncthreads(); } @@ -128,7 +130,11 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) __syncthreads(); auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(tile_size) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } + }; { Index_type tx = threadIdx.x; @@ -240,7 +246,9 @@ void MAT_MAT_SHARED::runHipVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-OMP.cpp b/src/basic/MAT_MAT_SHARED-OMP.cpp index b4563cc52..31762b191 100644 --- a/src/basic/MAT_MAT_SHARED-OMP.cpp +++ b/src/basic/MAT_MAT_SHARED-OMP.cpp @@ -61,7 +61,9 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } } } @@ -118,7 +120,11 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( } 
auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -215,7 +221,9 @@ void MAT_MAT_SHARED::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG( [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Seq.cpp b/src/basic/MAT_MAT_SHARED-Seq.cpp index 1dabaef8f..8e03155cf 100644 --- a/src/basic/MAT_MAT_SHARED-Seq.cpp +++ b/src/basic/MAT_MAT_SHARED-Seq.cpp @@ -52,7 +52,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (Index_type ty = 0; ty < TL_SZ; ++ty) { for (Index_type tx = 0; tx < TL_SZ; ++tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } } @@ -112,7 +114,11 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } auto inner_y_3 = [&](Index_type ty) { - auto inner_x_3 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_3(TL_SZ) }; + auto inner_x_3 = [&](Index_type tx) { + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_3(tx); @@ -125,7 +131,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun } auto inner_y_4 = [&](Index_type ty) { - auto inner_x_4 = [&](Index_type tx) { MAT_MAT_SHARED_BODY_4(TL_SZ) }; + auto inner_x_4 = [&](Index_type tx) { + MAT_MAT_SHARED_BODY_4(TL_SZ) + }; for (Index_type tx = 0; tx < TL_SZ; ++tx) { inner_x_4(tx); @@ -210,7 +218,9 @@ void MAT_MAT_SHARED::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun [&](Index_type ty) { RAJA::loop(ctx, 
RAJA::RangeSegment(0, TL_SZ), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(TL_SZ) + for (Index_type n = 0; n < TL_SZ; ++n) { + MAT_MAT_SHARED_BODY_3(TL_SZ) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED-Sycl.cpp b/src/basic/MAT_MAT_SHARED-Sycl.cpp index c4c15ae40..2e6a53203 100644 --- a/src/basic/MAT_MAT_SHARED-Sycl.cpp +++ b/src/basic/MAT_MAT_SHARED-Sycl.cpp @@ -49,9 +49,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) qu->submit([&](::sycl::handler& h) { - ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); - ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); - ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor As(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Bs(::sycl::range<2>(tile_size, tile_size), h); + ::sycl::local_accessor Cs(::sycl::range<2>(tile_size, tile_size), h); h.parallel_for (::sycl::nd_range<3>(gridSize, workGroupSize), @@ -70,7 +70,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) itm.barrier(::sycl::access::fence_space::local_space); - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } itm.barrier(::sycl::access::fence_space::local_space); } @@ -90,7 +92,7 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) constexpr bool async = true; const int local_mats = 3; - constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(double); + constexpr size_t shmem = tile_size * tile_size * local_mats * sizeof(Real_type); using launch_policy = RAJA::LaunchPolicy>; @@ -118,12 +120,12 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) //We only support dynamic shared memory in Sycl //Thus requiring a different setup than other backends //which use static shared memory - double * As_ptr = ctx.getSharedMemory(tile_size * tile_size); - double * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); - double * Cs_ptr = 
ctx.getSharedMemory(tile_size * tile_size); - double (*As)[tile_size] = (double (*)[tile_size]) As_ptr; - double (*Bs)[tile_size] = (double (*)[tile_size]) Bs_ptr; - double (*Cs)[tile_size] = (double (*)[tile_size]) Cs_ptr; + Real_type * As_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type * Bs_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type * Cs_ptr = ctx.getSharedMemory(tile_size * tile_size); + Real_type (*As)[tile_size] = (Real_type (*)[tile_size]) As_ptr; + Real_type (*Bs)[tile_size] = (Real_type (*)[tile_size]) Bs_ptr; + Real_type (*Cs)[tile_size] = (Real_type (*)[tile_size]) Cs_ptr; RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type ty) { @@ -154,7 +156,9 @@ void MAT_MAT_SHARED::runSyclVariantImpl(VariantID vid) [&](Index_type ty) { RAJA::loop(ctx, RAJA::RangeSegment(0, tile_size), [&](Index_type tx) { - MAT_MAT_SHARED_BODY_3(tile_size) + for (Index_type n = 0; n < tile_size; ++n) { + MAT_MAT_SHARED_BODY_3(tile_size) + } } ); // RAJA::loop } diff --git a/src/basic/MAT_MAT_SHARED.cpp b/src/basic/MAT_MAT_SHARED.cpp index 4c93b90eb..8574168ec 100644 --- a/src/basic/MAT_MAT_SHARED.cpp +++ b/src/basic/MAT_MAT_SHARED.cpp @@ -87,5 +87,88 @@ void MAT_MAT_SHARED::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_C, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MAT_MAT_SHARED::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type N = m_N; + + MAT_MAT_SHARED_DATA_SETUP; + const Index_type Nx = RAJA_DIVIDE_CEILING_INT(N, TL_SZ); + const Index_type Ny = RAJA_DIVIDE_CEILING_INT(N, TL_SZ); + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for 
(Index_type by = 0; by < Ny; ++by)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type bx = 0; bx < Nx; ++bx)) { + RAJAPERF_COUNTERS_TEAM_CONTEXT(); + + //Work around for when compiling with CLANG and HIP + //See notes in MAT_MAT_SHARED.hpp + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(TL_SZ)); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_1(TL_SZ)); + } + } + + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < (TL_SZ + N - 1) / TL_SZ; ++k)) { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_2(TL_SZ)); + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type n = 0; n < TL_SZ; ++n)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_3(TL_SZ)); + } + } + } + + RAJAPERF_COUNTERS_TEAM_SYNC(); + + } // Sequential loop + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type ty = 0; ty < TL_SZ; ++ty)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type tx = 0; tx < TL_SZ; ++tx)) { + RAJAPERF_COUNTERS_LOOP_BODY(MAT_MAT_SHARED_BODY_4(TL_SZ)); + } + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MAT_MAT_SHARED.hpp b/src/basic/MAT_MAT_SHARED.hpp index f77408006..768960603 100644 --- a/src/basic/MAT_MAT_SHARED.hpp +++ b/src/basic/MAT_MAT_SHARED.hpp @@ -85,14 +85,14 @@ constexpr rajaperf::Index_type TL_SZ = 16; so it doesn't see these kind of problems. 
*/ #define MAT_MAT_SHARED_BODY_0_CLANG_HIP_CPU(tile_size) \ - Real_type As[tile_size][tile_size]; \ - Real_type Bs[tile_size][tile_size]; \ - Real_type Cs[tile_size][tile_size]; + Real_array2 As; \ + Real_array2 Bs; \ + Real_array2 Cs; #define MAT_MAT_SHARED_BODY_0(tile_size) \ - RAJA_TEAM_SHARED Real_type As[tile_size][tile_size]; \ - RAJA_TEAM_SHARED Real_type Bs[tile_size][tile_size]; \ - RAJA_TEAM_SHARED Real_type Cs[tile_size][tile_size]; + RAJA_TEAM_SHARED Real_array2 As; \ + RAJA_TEAM_SHARED Real_array2 Bs; \ + RAJA_TEAM_SHARED Real_array2 Cs; #define MAT_MAT_SHARED_BODY_1(tile_size) \ Cs[ty][tx] = 0; @@ -110,7 +110,6 @@ constexpr rajaperf::Index_type TL_SZ = 16; Bs[ty][tx] = 0.0; #define MAT_MAT_SHARED_BODY_3(tile_size) \ - for (Index_type n = 0; n < tile_size; ++n) \ Cs[ty][tx] += As[ty][n] * Bs[n][tx]; #define MAT_MAT_SHARED_BODY_4(tile_size) \ @@ -133,6 +132,7 @@ class MAT_MAT_SHARED : public KernelBase { void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index aaed61b01..323d842e8 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -92,5 +92,47 @@ void MULADDSUB::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_in2, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MULADDSUB::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = 
getActualProblemSize(); + + MULADDSUB_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MULADDSUB_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 260d07212..8c951280f 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -52,6 +52,7 @@ class MULADDSUB : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/MULTI_REDUCE.cpp b/src/basic/MULTI_REDUCE.cpp index bc3114929..f14ef648b 100644 --- a/src/basic/MULTI_REDUCE.cpp +++ b/src/basic/MULTI_REDUCE.cpp @@ -149,5 +149,65 @@ void MULTI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(DataSpace::Host, m_values_final); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MULTI_REDUCE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MULTI_REDUCE_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_SETUP_VALUES; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_INIT_VALUES; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < 
iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MULTI_REDUCE_BODY(RAJAPERF_ATOMIC_ADD_COUNTING)); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_FINALIZE_VALUES; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + MULTI_REDUCE_TEARDOWN_VALUES; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/MULTI_REDUCE.hpp b/src/basic/MULTI_REDUCE.hpp index 1b6c8e4fc..87f7fe031 100644 --- a/src/basic/MULTI_REDUCE.hpp +++ b/src/basic/MULTI_REDUCE.hpp @@ -89,6 +89,7 @@ class MULTI_REDUCE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index f0d98b337..f9ae4c717 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -93,5 +93,48 @@ void NESTED_INIT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_array, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void NESTED_INIT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + NESTED_INIT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 0; k < nk; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; ++j )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; ++i )) { + 
RAJAPERF_COUNTERS_LOOP_BODY(NESTED_INIT_BODY); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 40208565d..655bacf52 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -52,6 +52,7 @@ class NESTED_INIT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/PI_ATOMIC.cpp b/src/basic/PI_ATOMIC.cpp index 8f9bb64f0..f50a7eaa9 100644 --- a/src/basic/PI_ATOMIC.cpp +++ b/src/basic/PI_ATOMIC.cpp @@ -81,5 +81,60 @@ void PI_ATOMIC::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PI_ATOMIC::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PI_ATOMIC_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + *pi = m_pi_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_type x = (Real_type(i) + 0.5) * dx; + RAJAPERF_ATOMIC_ADD_COUNTING(*pi, dx / (1.0 + x * x)); + ); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_pi_final = *pi * 4.0; + ); + + } + + } + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/PI_ATOMIC.hpp b/src/basic/PI_ATOMIC.hpp index 3c8433b0d..ec4c11f0b 100644 --- a/src/basic/PI_ATOMIC.hpp +++ b/src/basic/PI_ATOMIC.hpp @@ -59,6 +59,7 @@ class PI_ATOMIC : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/PI_REDUCE.cpp b/src/basic/PI_REDUCE.cpp index 1ad505ad0..20c81c8a6 100644 --- a/src/basic/PI_REDUCE.cpp +++ b/src/basic/PI_REDUCE.cpp @@ -81,5 +81,57 @@ void PI_REDUCE::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PI_REDUCE::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PI_REDUCE_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type pi = m_pi_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PI_REDUCE_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_pi = 4.0 * pi; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git 
a/src/basic/PI_REDUCE.hpp b/src/basic/PI_REDUCE.hpp index d585333c6..fae40b6fa 100644 --- a/src/basic/PI_REDUCE.hpp +++ b/src/basic/PI_REDUCE.hpp @@ -50,6 +50,7 @@ class PI_REDUCE : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index e3ca630d4..8cadd2fbd 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -96,5 +96,61 @@ void REDUCE3_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_vec, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void REDUCE3_INT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE3_INT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(REDUCE3_INT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_vsum = vsum; + m_vmin = vmin; + m_vmax = vmax; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 
ce8323a89..3ca502749 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -64,6 +64,7 @@ class REDUCE3_INT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/REDUCE_STRUCT.cpp b/src/basic/REDUCE_STRUCT.cpp index 2608191dc..307763499 100644 --- a/src/basic/REDUCE_STRUCT.cpp +++ b/src/basic/REDUCE_STRUCT.cpp @@ -102,5 +102,64 @@ void REDUCE_STRUCT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void REDUCE_STRUCT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + REDUCE_STRUCT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type xsum = m_init_sum; Real_type ysum = m_init_sum; + Real_type xmin = m_init_min; Real_type ymin = m_init_min; + Real_type xmax = m_init_max; Real_type ymax = m_init_max; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(REDUCE_STRUCT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + points.SetCenter(xsum/(points.N), ysum/(points.N)); + points.SetXMin(xmin); + points.SetXMax(xmax); + points.SetYMin(ymin); + points.SetYMax(ymax); + m_points = points; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + 
); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/REDUCE_STRUCT.hpp b/src/basic/REDUCE_STRUCT.hpp index d1d289f91..63db1d62d 100644 --- a/src/basic/REDUCE_STRUCT.hpp +++ b/src/basic/REDUCE_STRUCT.hpp @@ -80,6 +80,7 @@ class REDUCE_STRUCT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/basic/TRAP_INT-func.hpp b/src/basic/TRAP_INT-func.hpp index 4d62fa1c0..ac1f8b49e 100644 --- a/src/basic/TRAP_INT-func.hpp +++ b/src/basic/TRAP_INT-func.hpp @@ -28,6 +28,20 @@ Real_type trap_int_func(Real_type x, denom = 1.0/sqrt(denom); return denom; } +/// +RAJA_INLINE +RAJA_HOST_DEVICE +Real_type trap_int_opt_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type xmxp = x - xp; + Real_type ymyp = y - yp; + Real_type denom = xmxp*xmxp + ymyp*ymyp; + denom = 1.0/sqrt(denom); + return denom; +} } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index a5a8fbfee..97ed1bfa3 100644 --- a/src/basic/TRAP_INT.cpp +++ b/src/basic/TRAP_INT.cpp @@ -92,5 +92,69 @@ void TRAP_INT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) (void) vid; } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + + +} // end namespace basic +} // end namespace rajaperf + +// This shouldn't result in ODR violations as the argument types have changed +#include "TRAP_INT-func.hpp" + +namespace rajaperf +{ +namespace basic +{ + +void TRAP_INT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + 
RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + TRAP_INT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type sumx = m_sumx_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRAP_INT_OPT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_sumx += sumx * h; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index 27a606695..ee191cccc 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -40,6 +40,10 @@ Real_type x = x0 + i*h; \ sumx += trap_int_func(x, y, xp, yp); +#define TRAP_INT_OPT_BODY \ + Real_type x = x0 + i*h; \ + sumx += trap_int_opt_func(x, y, xp, yp); + #include "common/KernelBase.hpp" @@ -61,6 +65,7 @@ class TRAP_INT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_PACKING.cpp b/src/comm/HALO_PACKING.cpp index 2f883926e..e970758ee 100644 --- a/src/comm/HALO_PACKING.cpp +++ b/src/comm/HALO_PACKING.cpp @@ -109,5 +109,99 @@ void HALO_PACKING::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HALO_PACKING::setCountedAttributes() +{ + VariantID vid = 
VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr var = vars[v]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_PACK_BODY); + } + RAJAPERF_COUNTERS_LOOP_BODY( + buffer += len; + ); + } + + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr send_buffer = send_buffers[l]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(send_buffer[i] = buffer[i]); + } + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + } + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr recv_buffer = recv_buffers[l]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(buffer[i] = recv_buffer[i]); + } + } + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr var = vars[v]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_UNPACK_BODY); + } + RAJAPERF_COUNTERS_LOOP_BODY( + buffer += len; + ); + } + } + RAJAPERF_COUNTERS_PAR_SYNC(); + + 
} + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING.hpp b/src/comm/HALO_PACKING.hpp index ab458280f..05cdd50ec 100644 --- a/src/comm/HALO_PACKING.hpp +++ b/src/comm/HALO_PACKING.hpp @@ -80,6 +80,7 @@ class HALO_PACKING : public HALO_base void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_PACKING_FUSED.cpp b/src/comm/HALO_PACKING_FUSED.cpp index 6ffc5ac90..7a9607548 100644 --- a/src/comm/HALO_PACKING_FUSED.cpp +++ b/src/comm/HALO_PACKING_FUSED.cpp @@ -109,5 +109,137 @@ void HALO_PACKING_FUSED::tearDown(VariantID vid, size_t tune_idx) tearDown_base(vid, tune_idx); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HALO_PACKING_FUSED::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_FUSED_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_FUSED_MANUAL_FUSER_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type pack_index = 0; + ); + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = pack_buffers[l]; + Int_ptr list = pack_index_lists[l]; + Index_type len = pack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + 
Real_ptr var = vars[v]; + pack_ptr_holders[pack_index] = ptr_holder{buffer, list, var}; + pack_lens[pack_index] = len; + pack_index += 1; + buffer += len; + ); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < pack_index; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY( + ptr_holder pack_ptrs = pack_ptr_holders[j]; + Real_ptr buffer = pack_ptrs.buffer; + Int_ptr list = pack_ptrs.list; + Real_ptr var = pack_ptrs.var; + Index_type len = pack_lens[j]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_PACK_BODY); + } + } + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Index_type len = pack_index_list_lengths[l]; + Real_ptr send_buffer = send_buffers[l]; + Real_ptr buffer = pack_buffers[l]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(send_buffer[i] = buffer[i]); + } + } + } + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Index_type unpack_index = 0; + ); + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type l = 0; l < num_neighbors; ++l)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr buffer = unpack_buffers[l]; + Int_ptr list = unpack_index_lists[l]; + Index_type len = unpack_index_list_lengths[l]; + ); + RAJAPERF_COUNTERS_IF(if (separate_buffers)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr recv_buffer = recv_buffers[l]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len*num_vars; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(buffer[i] = recv_buffer[i]); + } + } + + RAJAPERF_COUNTERS_OUTER_LOOP(for (Index_type v = 0; v < num_vars; ++v)) { + RAJAPERF_COUNTERS_LOOP_BODY( + Real_ptr var = vars[v]; + unpack_ptr_holders[unpack_index] = ptr_holder{buffer, list, var}; + unpack_lens[unpack_index] = len; + unpack_index += 1; + buffer += len; + ); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < 
unpack_index; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY( + ptr_holder unpack_ptrs = unpack_ptr_holders[j]; + Real_ptr buffer = unpack_ptrs.buffer; + Int_ptr list = unpack_ptrs.list; + Real_ptr var = unpack_ptrs.var; + Index_type len = unpack_lens[j]; + ); + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < len; i++)) { + RAJAPERF_COUNTERS_LOOP_BODY(HALO_UNPACK_BODY); + } + } + RAJAPERF_COUNTERS_PAR_SYNC(); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + HALO_PACKING_FUSED_MANUAL_FUSER_TEARDOWN; + ); + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace comm } // end namespace rajaperf diff --git a/src/comm/HALO_PACKING_FUSED.hpp b/src/comm/HALO_PACKING_FUSED.hpp index 9d8c04994..7360f6f75 100644 --- a/src/comm/HALO_PACKING_FUSED.hpp +++ b/src/comm/HALO_PACKING_FUSED.hpp @@ -58,9 +58,9 @@ Real_ptr_ptr recv_buffers = m_recv_buffers; #define HALO_PACKING_FUSED_MANUAL_FUSER_SETUP \ - ptr_holder* pack_ptr_holders = nullptr; \ + RAJAPERF_WRAPPER(ptr_holder*) pack_ptr_holders = nullptr; \ Index_ptr pack_lens = nullptr; \ - ptr_holder* unpack_ptr_holders = nullptr; \ + RAJAPERF_WRAPPER(ptr_holder*) unpack_ptr_holders = nullptr; \ Index_ptr unpack_lens = nullptr; \ allocData(DataSpace::Host, pack_ptr_holders, num_neighbors * num_vars); \ allocData(DataSpace::Host, pack_lens, num_neighbors * num_vars); \ @@ -130,6 +130,7 @@ class HALO_PACKING_FUSED : public HALO_base void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/comm/HALO_base.hpp b/src/comm/HALO_base.hpp index 1e0567e8b..c3b70cd3f 100644 --- a/src/comm/HALO_base.hpp +++ b/src/comm/HALO_base.hpp @@ -126,7 +126,7 @@ class HALO_base : public KernelBase Index_type k_max; }; - static const int s_num_neighbors = 26; 
+ static inline constexpr int s_num_neighbors = 26; static const int s_boundary_offsets[s_num_neighbors][3]; static Index_type s_grid_dims_default[3]; diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index d12ee75ae..ffbb2881e 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -85,5 +85,47 @@ void DIFF_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_cx, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DIFF_PREDICT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DIFF_PREDICT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DIFF_PREDICT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index 60abcb31b..980eef80e 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -87,6 +87,7 @@ class DIFF_PREDICT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 5c4340142..fddeed891 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -98,5 +98,47 @@ 
void EOS::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_u, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void EOS::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + EOS_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(EOS_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index b0bb91050..b4b433588 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -56,6 +56,7 @@ class EOS : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 650a3ecff..192150c92 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -84,5 +84,47 @@ void FIRST_DIFF::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_DIFF::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx 
= 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIRST_DIFF_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_DIFF_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index d85348be5..4292a5bd3 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -46,6 +46,7 @@ class FIRST_DIFF : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/FIRST_MIN.cpp b/src/lcals/FIRST_MIN.cpp index 4dcdfdaba..ed0668b45 100644 --- a/src/lcals/FIRST_MIN.cpp +++ b/src/lcals/FIRST_MIN.cpp @@ -95,5 +95,57 @@ void FIRST_MIN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_x, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_MIN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + FIRST_MIN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + 
FIRST_MIN_MINLOC_INIT; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_MIN_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_minloc = mymin.loc; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_MIN.hpp b/src/lcals/FIRST_MIN.hpp index fa4804859..684387483 100644 --- a/src/lcals/FIRST_MIN.hpp +++ b/src/lcals/FIRST_MIN.hpp @@ -73,6 +73,7 @@ class FIRST_MIN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index c8f920406..90793110e 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -84,5 +84,47 @@ void FIRST_SUM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_y, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void FIRST_SUM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = getActualProblemSize(); + + FIRST_SUM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(FIRST_SUM_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + 
RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index e47362a69..3bd92301f 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -49,6 +49,7 @@ class FIRST_SUM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/GEN_LIN_RECUR.cpp b/src/lcals/GEN_LIN_RECUR.cpp index aeec387d5..89dd53da0 100644 --- a/src/lcals/GEN_LIN_RECUR.cpp +++ b/src/lcals/GEN_LIN_RECUR.cpp @@ -97,5 +97,49 @@ void GEN_LIN_RECUR::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_sb, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void GEN_LIN_RECUR::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + GEN_LIN_RECUR_DATA_SETUP; + const Index_type iend = N+1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 0; k < N; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(GEN_LIN_RECUR_OPT_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(GEN_LIN_RECUR_OPT_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/GEN_LIN_RECUR.hpp b/src/lcals/GEN_LIN_RECUR.hpp index 6996a59b6..056d56dd3 100644 --- a/src/lcals/GEN_LIN_RECUR.hpp +++ 
b/src/lcals/GEN_LIN_RECUR.hpp @@ -50,6 +50,20 @@ stb5[k] = b5[k+kb5i] - stb5[k]; +#define GEN_LIN_RECUR_OPT_BODY1 \ + Real_type tmp; \ + Real_type stb = stb5[k]; \ + b5[k+kb5i] = tmp = sa[k] + stb*sb[k]; \ + stb5[k] = tmp - stb; + +#define GEN_LIN_RECUR_OPT_BODY2 \ + Index_type k = N - i ; \ + Real_type tmp; \ + Real_type stb = stb5[k]; \ + b5[k+kb5i] = tmp = sa[k] + stb*sb[k]; \ + stb5[k] = tmp - stb; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -70,6 +84,7 @@ class GEN_LIN_RECUR : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index fb05917a5..efc8e73a2 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -95,5 +95,47 @@ void HYDRO_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HYDRO_1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + HYDRO_1D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_1D_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_1D.hpp 
b/src/lcals/HYDRO_1D.hpp index 28fcb3f93..57d56823f 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -51,6 +51,7 @@ class HYDRO_1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/HYDRO_2D.cpp b/src/lcals/HYDRO_2D.cpp index 307470a16..f615007ca 100644 --- a/src/lcals/HYDRO_2D.cpp +++ b/src/lcals/HYDRO_2D.cpp @@ -129,5 +129,63 @@ void HYDRO_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_zz, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void HYDRO_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type kbeg = 1; + const Index_type kend = m_kn - 1; + const Index_type jbeg = 1; + const Index_type jend = m_jn - 1; + + HYDRO_2D_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY2); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = kbeg; k < kend; ++k )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = jbeg; j < jend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(HYDRO_2D_BODY3); + } + } + + } + + } + + 
RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/HYDRO_2D.hpp b/src/lcals/HYDRO_2D.hpp index 098c46cad..6631fe637 100644 --- a/src/lcals/HYDRO_2D.hpp +++ b/src/lcals/HYDRO_2D.hpp @@ -145,6 +145,7 @@ class HYDRO_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index feb9dda68..c2a2030d2 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -111,5 +111,47 @@ void INT_PREDICT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_px, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void INT_PREDICT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + INT_PREDICT_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(INT_PREDICT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 28d7ebe9e..639f674dc 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ 
-66,6 +66,7 @@ class INT_PREDICT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index b33b7ea8b..95543f3d9 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -88,5 +88,47 @@ void PLANCKIAN::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_w, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void PLANCKIAN::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + PLANCKIAN_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(PLANCKIAN_OPT_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 08412db4d..dc6bb3ee6 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -31,6 +31,12 @@ w[i] = x[i] / ( exp( y[i] ) - 1.0 ); +#define PLANCKIAN_OPT_BODY \ + Real_type tmp; \ + y[i] = tmp = u[i] / v[i]; \ + w[i] = x[i] / ( exp( tmp ) - 1.0 ); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -51,6 +57,7 @@ class PLANCKIAN : public KernelBase void setUp(VariantID vid, size_t tune_idx); void 
updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/lcals/TRIDIAG_ELIM.cpp b/src/lcals/TRIDIAG_ELIM.cpp index d1210efaa..cfd9fe14a 100644 --- a/src/lcals/TRIDIAG_ELIM.cpp +++ b/src/lcals/TRIDIAG_ELIM.cpp @@ -88,5 +88,47 @@ void TRIDIAG_ELIM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void TRIDIAG_ELIM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 1; + const Index_type iend = m_N; + + TRIDIAG_ELIM_DATA_SETUP; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRIDIAG_ELIM_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace lcals } // end namespace rajaperf diff --git a/src/lcals/TRIDIAG_ELIM.hpp b/src/lcals/TRIDIAG_ELIM.hpp index b940abac8..7860762d7 100644 --- a/src/lcals/TRIDIAG_ELIM.hpp +++ b/src/lcals/TRIDIAG_ELIM.hpp @@ -51,6 +51,7 @@ class TRIDIAG_ELIM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 
e7455414d..917f8b700 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -116,5 +116,60 @@ void POLYBENCH_2MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_D, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_2MM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_2MM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; k++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY3); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_2MM_BODY6); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 077c91716..df15b971b 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -121,6 +121,7 @@ class POLYBENCH_2MM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t 
tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_3MM.cpp b/src/polybench/POLYBENCH_3MM.cpp index 4c4c8546a..0edeaf58a 100644 --- a/src/polybench/POLYBENCH_3MM.cpp +++ b/src/polybench/POLYBENCH_3MM.cpp @@ -129,5 +129,70 @@ void POLYBENCH_3MM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_G, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_3MM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_3MM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; k++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY3); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type m = 0; m < nm; m++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY6); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type l = 0; l < nl; l++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY7); + RAJAPERF_COUNTERS_SEQ_LOOP(for 
(Index_type j = 0; j < nj; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY8); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_3MM_BODY9); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_3MM.hpp b/src/polybench/POLYBENCH_3MM.hpp index 936714cfa..f2fa3830b 100644 --- a/src/polybench/POLYBENCH_3MM.hpp +++ b/src/polybench/POLYBENCH_3MM.hpp @@ -147,6 +147,7 @@ class POLYBENCH_3MM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_ADI.cpp b/src/polybench/POLYBENCH_ADI.cpp index 562984feb..11c340f23 100644 --- a/src/polybench/POLYBENCH_ADI.cpp +++ b/src/polybench/POLYBENCH_ADI.cpp @@ -105,5 +105,62 @@ void POLYBENCH_ADI::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) deallocData(m_Q, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_ADI::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_ADI_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < n-1; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 1; j < n-1; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY4); + 
RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = n-2; k >= 1; --k)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY5); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < n-1; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY6); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 1; j < n-1; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY7); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY8); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = n-2; k >= 1; --k)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ADI_OPT_BODY9); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ADI.hpp b/src/polybench/POLYBENCH_ADI.hpp index ad51e7429..8e71fb18a 100644 --- a/src/polybench/POLYBENCH_ADI.hpp +++ b/src/polybench/POLYBENCH_ADI.hpp @@ -122,6 +122,49 @@ U[i * n + k] = P[i * n + k] * U[i * n + k +1] + Q[i * n + k]; +#define POLYBENCH_ADI_OPT_BODY2 \ + V[0 * n + i] = 1.0; \ + Real_type last_P = 0.0; \ + Real_type last_Q = 1.0; \ + P[i * n + 0] = last_P; \ + Q[i * n + 0] = last_Q; + +#define POLYBENCH_ADI_OPT_BODY3 \ + Real_type tmp_div = a * last_P + b; \ + P[i * n + j] = last_P = -c / tmp_div; \ + Q[i * n + j] = last_Q = (-d * U[j * n + i-1] + (1.0 + 2.0*d) * U[j * n + i] - \ + f * U[j * n + i + 1] - a * last_Q) / \ + tmp_div; + +#define POLYBENCH_ADI_OPT_BODY4 \ + Real_type last_V = 1.0; \ + V[(n-1) * n + i] = last_V; + +#define POLYBENCH_ADI_OPT_BODY5 \ + V[k * n + i] = last_V = P[i * n + k] * last_V + Q[i * n + k]; + +#define POLYBENCH_ADI_OPT_BODY6 \ + U[i * n + 0] = 1.0; \ + Real_type last_P = 0.0; \ + Real_type last_Q = 1.0; \ + P[i * n + 0] = last_P; \ + Q[i * n + 0] = last_Q; + +#define POLYBENCH_ADI_OPT_BODY7 \ + Real_type tmp_div = d * last_P + e; \ + P[i * n + j] = last_P = -f / tmp_div; \ + Q[i * n + j] = last_Q = (-a * V[(i-1) * n + j] + (1.0 + 2.0*a) * V[i 
* n + j] - \ + c * V[(i + 1) * n + j] - d * last_Q) / \ + tmp_div; + +#define POLYBENCH_ADI_OPT_BODY8 \ + Real_type last_U = 1.0; \ + U[i * n + n-1] = last_U; + +#define POLYBENCH_ADI_OPT_BODY9 \ + U[i * n + k] = last_U = P[i * n + k] * last_U + Q[i * n + k]; + + #define POLYBENCH_ADI_BODY2_RAJA \ Vview(0, i) = 1.0; \ Pview(i, 0) = 0.0; \ @@ -188,6 +231,7 @@ class POLYBENCH_ADI : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_ATAX.cpp b/src/polybench/POLYBENCH_ATAX.cpp index a18fab318..493176659 100644 --- a/src/polybench/POLYBENCH_ATAX.cpp +++ b/src/polybench/POLYBENCH_ATAX.cpp @@ -102,5 +102,56 @@ void POLYBENCH_ATAX::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_A, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_ATAX::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_ATAX_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type i = 0; i < N; 
++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_ATAX_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_ATAX.hpp b/src/polybench/POLYBENCH_ATAX.hpp index bd896bf08..a4e4c0505 100644 --- a/src/polybench/POLYBENCH_ATAX.hpp +++ b/src/polybench/POLYBENCH_ATAX.hpp @@ -109,6 +109,7 @@ class POLYBENCH_ATAX : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_FDTD_2D.cpp b/src/polybench/POLYBENCH_FDTD_2D.cpp index b86d29622..5d1114a1b 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.cpp +++ b/src/polybench/POLYBENCH_FDTD_2D.cpp @@ -124,5 +124,61 @@ void POLYBENCH_FDTD_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_hz, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_FDTD_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_FDTD_2D_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY1); + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < nx; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny; j++)) { + 
RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY2); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < nx; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ny; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY3); + } + } + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < nx - 1; i++)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < ny - 1; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FDTD_2D_BODY4); + } + } + + t = (t+1) % m_tsteps; + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FDTD_2D.hpp b/src/polybench/POLYBENCH_FDTD_2D.hpp index 75e5c2a17..b17882930 100644 --- a/src/polybench/POLYBENCH_FDTD_2D.hpp +++ b/src/polybench/POLYBENCH_FDTD_2D.hpp @@ -106,6 +106,7 @@ class POLYBENCH_FDTD_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp index cbb4542eb..542f18974 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.cpp @@ -92,5 +92,48 @@ void POLYBENCH_FLOYD_WARSHALL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_AR deallocData(m_pout, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_FLOYD_WARSHALL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { 
+ RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_FLOYD_WARSHALL_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < N; ++k)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < N; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_FLOYD_WARSHALL_BODY); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp index a6c1e9133..08818c57c 100644 --- a/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp +++ b/src/polybench/POLYBENCH_FLOYD_WARSHALL.hpp @@ -70,6 +70,7 @@ class POLYBENCH_FLOYD_WARSHALL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_GEMM.cpp b/src/polybench/POLYBENCH_GEMM.cpp index 34d093fb9..ad7799b67 100644 --- a/src/polybench/POLYBENCH_GEMM.cpp +++ b/src/polybench/POLYBENCH_GEMM.cpp @@ -104,5 +104,51 @@ void POLYBENCH_GEMM::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx deallocData(m_C, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GEMM::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GEMM_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < ni; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < nj; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY1); + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type k = 0; k < nk; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMM_BODY4); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMM.hpp b/src/polybench/POLYBENCH_GEMM.hpp index ff4817e37..e208babe5 100644 --- a/src/polybench/POLYBENCH_GEMM.hpp +++ b/src/polybench/POLYBENCH_GEMM.hpp @@ -98,6 +98,7 @@ class POLYBENCH_GEMM : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_GEMVER.cpp b/src/polybench/POLYBENCH_GEMVER.cpp index 1b542f9a3..6b3ef7baf 100644 --- a/src/polybench/POLYBENCH_GEMVER.cpp +++ b/src/polybench/POLYBENCH_GEMVER.cpp @@ -132,5 +132,66 @@ void POLYBENCH_GEMVER::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_i deallocData(m_z, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GEMVER::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GEMVER_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + 
RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY2); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY3); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY4); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY5); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < n; i++ )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY6); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < n; j++)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY7); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GEMVER_BODY8); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace basic } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GEMVER.hpp b/src/polybench/POLYBENCH_GEMVER.hpp index 6bfb141b3..743b35749 100644 --- a/src/polybench/POLYBENCH_GEMVER.hpp +++ b/src/polybench/POLYBENCH_GEMVER.hpp @@ -149,6 +149,7 @@ class POLYBENCH_GEMVER : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_GESUMMV.cpp b/src/polybench/POLYBENCH_GESUMMV.cpp index 7ba2fd8fc..bfb86ebd4 100644 --- a/src/polybench/POLYBENCH_GESUMMV.cpp +++ b/src/polybench/POLYBENCH_GESUMMV.cpp @@ -95,5 +95,48 @@ void POLYBENCH_GESUMMV::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_B, vid); } + +// 
Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_GESUMMV::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_GESUMMV_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_GESUMMV_BODY3); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_GESUMMV.hpp b/src/polybench/POLYBENCH_GESUMMV.hpp index 816869351..9559e31d9 100644 --- a/src/polybench/POLYBENCH_GESUMMV.hpp +++ b/src/polybench/POLYBENCH_GESUMMV.hpp @@ -92,6 +92,7 @@ class POLYBENCH_GESUMMV : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_HEAT_3D.cpp b/src/polybench/POLYBENCH_HEAT_3D.cpp index cec6a1383..1ef5a00fb 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.cpp +++ b/src/polybench/POLYBENCH_HEAT_3D.cpp @@ -101,5 +101,58 @@ void POLYBENCH_HEAT_3D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_ deallocData(m_Binit, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, 
etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_HEAT_3D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_HEAT_3D_DATA_SETUP; + + const Index_type ijkend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijkend; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijkend; ++j )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 1; k < ijkend; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_HEAT_3D_BODY1); + } + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijkend; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijkend; ++j )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type k = 1; k < ijkend; ++k )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_HEAT_3D_BODY2); + } + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_HEAT_3D.hpp b/src/polybench/POLYBENCH_HEAT_3D.hpp index 7a8c6c635..1c14b5f57 100644 --- a/src/polybench/POLYBENCH_HEAT_3D.hpp +++ b/src/polybench/POLYBENCH_HEAT_3D.hpp @@ -110,6 +110,7 @@ class POLYBENCH_HEAT_3D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_JACOBI_1D.cpp b/src/polybench/POLYBENCH_JACOBI_1D.cpp index ebef81138..5530acab6 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.cpp @@ -98,5 +98,50 @@ void 
POLYBENCH_JACOBI_1D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun deallocData(m_Binit, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_JACOBI_1D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_JACOBI_1D_DATA_SETUP; + + const Index_type iend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_1D_BODY1); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < iend; ++i)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_1D_BODY2); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_1D.hpp b/src/polybench/POLYBENCH_JACOBI_1D.hpp index fe113ffcc..6206c016d 100644 --- a/src/polybench/POLYBENCH_JACOBI_1D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_1D.hpp @@ -58,6 +58,7 @@ class POLYBENCH_JACOBI_1D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_JACOBI_2D.cpp b/src/polybench/POLYBENCH_JACOBI_2D.cpp index e7da05444..60e821046 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.cpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.cpp @@ -100,5 +100,54 @@ void POLYBENCH_JACOBI_2D::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun deallocData(m_Binit, vid); } 
+ +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_JACOBI_2D::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_JACOBI_2D_DATA_SETUP; + + const Index_type ijend = N-1; + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijend; ++i )) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijend; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_2D_BODY1); + } + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 1; i < ijend; ++i)) { + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type j = 1; j < ijend; ++j)) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_JACOBI_2D_BODY2); + } + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, 0); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_JACOBI_2D.hpp b/src/polybench/POLYBENCH_JACOBI_2D.hpp index d6083cbac..5e5866bea 100644 --- a/src/polybench/POLYBENCH_JACOBI_2D.hpp +++ b/src/polybench/POLYBENCH_JACOBI_2D.hpp @@ -77,6 +77,7 @@ class POLYBENCH_JACOBI_2D : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/polybench/POLYBENCH_MVT.cpp b/src/polybench/POLYBENCH_MVT.cpp index 3e4a55fec..46ce45712 100644 --- a/src/polybench/POLYBENCH_MVT.cpp +++ b/src/polybench/POLYBENCH_MVT.cpp @@ -104,5 +104,56 @@ void POLYBENCH_MVT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx) 
deallocData(m_A, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void POLYBENCH_MVT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + POLYBENCH_MVT_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY1); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY2); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY3); + } + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 0; i < N; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY4); + RAJAPERF_COUNTERS_SEQ_LOOP(for (Index_type j = 0; j < N; ++j )) { + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY5); + } + RAJAPERF_COUNTERS_LOOP_BODY(POLYBENCH_MVT_BODY6); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_MVT.hpp b/src/polybench/POLYBENCH_MVT.hpp index 514870844..83a4612f2 100644 --- a/src/polybench/POLYBENCH_MVT.hpp +++ b/src/polybench/POLYBENCH_MVT.hpp @@ -113,6 +113,7 @@ class POLYBENCH_MVT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 9078659f1..b40b397bd 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -86,5 +86,47 @@ 
void ADD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void ADD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + ADD_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(ADD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 761ce64d7..99a6d94a1 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -46,6 +46,7 @@ class ADD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index cfee91918..6364af0d4 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -84,5 +84,47 @@ void COPY::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void COPY::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + 
RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + COPY_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(COPY_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index e24757da9..6bf533720 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -45,6 +45,7 @@ class COPY : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 4af896a88..287fda7c3 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -87,5 +87,57 @@ void DOT::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_b, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void DOT::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + DOT_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_CODE_WRAPPER( + Real_type dot = m_dot_init; + ); + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = 
ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(DOT_BODY); + } + + RAJAPERF_COUNTERS_PAR_SYNC(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + m_dot += dot; + ); + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index 1229d861e..b4efdc3d6 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -45,6 +45,7 @@ class DOT : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 2c5e2a594..1de04eb01 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -85,5 +85,47 @@ void MUL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void MUL::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + MUL_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(MUL_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 
1e04b2c53..9bad4e557 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -46,6 +46,7 @@ class MUL : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID vid, size_t tune_idx); diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 395dce001..9814345ec 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -91,5 +91,47 @@ void TRIAD::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) deallocData(m_c, vid); } + +// Only define setCountedAttributes functions past this point +// BEWARE: data types (Index_type, Real_ptr, etc) become wrappers past this point +#include "common/CountingMacros.hpp" + +void TRIAD::setCountedAttributes() +{ + VariantID vid = VariantID::Base_Seq; + size_t tune_idx = 0; + + RAJAPERF_COUNTERS_INITIALIZE(); + + RAJAPERF_COUNTERS_CODE_WRAPPER( + setUp(vid, tune_idx); + ); + + { + RAJAPERF_COUNTERS_CODE_WRAPPER( + const Index_type ibegin = 0; + const Index_type iend = getActualProblemSize(); + + TRIAD_DATA_SETUP + ); + + RAJAPERF_COUNTERS_REP_SCOPE() + { + + RAJAPERF_COUNTERS_PAR_LOOP(for (Index_type i = ibegin; i < iend; ++i )) { + RAJAPERF_COUNTERS_LOOP_BODY(TRIAD_BODY); + } + + } + + } + + RAJAPERF_COUNTERS_CODE_WRAPPER( + tearDown(vid, tune_idx); + ); + + RAJAPERF_COUNTERS_FINALIZE(); +} + } // end namespace stream } // end namespace rajaperf diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index f901314a4..4ed11ce9a 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -47,6 +47,7 @@ class TRIAD : public KernelBase void setUp(VariantID vid, size_t tune_idx); void updateChecksum(VariantID vid, size_t tune_idx); void tearDown(VariantID vid, size_t tune_idx); + void setCountedAttributes(); void runSeqVariant(VariantID vid, size_t tune_idx); void runOpenMPVariant(VariantID 
vid, size_t tune_idx);