diff --git a/.gitmodules b/.gitmodules
index e6a012fbe..b60b989ef 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,7 @@
 [submodule "tpl/RAJA"]
 	path = tpl/RAJA
 	url = https://github.com/LLNL/RAJA.git
+[submodule "tpl/RAJAvec"]
+	path = tpl/RAJAvec
+	url = https://github.com/LLNL/RAJA.git
+	branch = feature/kunen1/vector
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 75d53ca9a..1e907c8b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,14 +13,28 @@ cmake_minimum_required(VERSION 3.9)
 option(ENABLE_RAJA_SEQUENTIAL "Run sequential variants of RAJA kernels. Disable
 this, and all other variants, to run _only_ raw C loops." On)
 
+option(ENABLE_RAJA_VECTORIZATION "Run vectorized variants of RAJA kernels. Disable
+this, and all other variants, to run _only_ raw C loops." Off)
+
+# -mavx2 is an x86-64 code-generation flag; turn this off for compilers or
+# architectures that reject it (the blueos/XL build script does so).
+option(PERFSUITE_ENABLE_AVX2 "Append -mavx2 to CMAKE_CXX_FLAGS" On)
+
 #
 # Initialize the BLT build system
 #
 
+message(STATUS "ENABLE_RAJA_SEQUENTIAL: ${ENABLE_RAJA_SEQUENTIAL}")
+message(STATUS "ENABLE_RAJA_VECTORIZATION: ${ENABLE_RAJA_VECTORIZATION}")
+
 if (PERFSUITE_ENABLE_WARNINGS)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
 endif()
 
+if (PERFSUITE_ENABLE_AVX2)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
+endif()
+
 set(ENABLE_TESTS Off CACHE BOOL "Enable BLT and RAJA tests")
 set(ENABLE_EXAMPLES Off CACHE BOOL "Enable RAJA examples")
 set(ENABLE_EXERCISES Off CACHE BOOL "Enable RAJA exercises")
@@ -48,12 +62,12 @@ set(RAJA_DATA_ALIGN 64)
 
 
 # exclude RAJA make targets from top-level build...
-add_subdirectory(tpl/RAJA)
+add_subdirectory(tpl/RAJAvec)
 
-get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJA PROPERTY INCLUDE_DIRECTORIES)
+get_property(RAJA_INCLUDE_DIRS DIRECTORY tpl/RAJAvec PROPERTY INCLUDE_DIRECTORIES)
 include_directories(${RAJA_INCLUDE_DIRS})
 
 
 #
 # Setup variables to pass to Perf suite
 #
@@ -63,14 +77,19 @@ include_directories(${RAJA_INCLUDE_DIRS})
 # performance issues in the xl compiler.
 #
 if (ENABLE_RAJA_SEQUENTIAL)
   add_definitions(-DRUN_RAJA_SEQ)
 endif ()
+# The kernel sources still build the RAJA_Vec variants under RUN_RAJA_SEQ;
+# RUN_RAJA_VEC is defined so they can be gated on this option separately.
+if (ENABLE_RAJA_VECTORIZATION)
+  add_definitions(-DRUN_RAJA_VEC)
+endif ()
 if (ENABLE_OPENMP)
   add_definitions(-DRUN_OPENMP)
 endif ()
 
 set(RAJA_PERFSUITE_VERSION_MAJOR 0)
-set(RAJA_PERFSUITE_VERSION_MINOR 8)
+set(RAJA_PERFSUITE_VERSION_MINOR 9)
 set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0)
 
 set(RAJA_PERFSUITE_DEPENDS RAJA)
diff --git a/scripts/lc-builds/blueos_xl-2020.09.17.sh b/scripts/lc-builds/blueos_xl-2020.09.17.sh
index 1205c5cae..67bb5e441 100755
--- a/scripts/lc-builds/blueos_xl-2020.09.17.sh
+++ b/scripts/lc-builds/blueos_xl-2020.09.17.sh
@@ -10,16 +10,35 @@
 BUILD_SUFFIX=lc_blueos-xl_2020.09.17
 RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/blueos/xl_X.cmake
 
-rm -rf build_${BUILD_SUFFIX} 2>/dev/null
-mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+# The first argument names the build variant: "seq" passes
+# ENABLE_RAJA_VECTORIZATION=Off and uses the tpl/RAJA host-config; anything
+# else turns it On and uses the tpl/RAJAvec host-config.
+VARIANT=$1
+case "$VARIANT" in
+  ""|-*) ;;    # no variant given: forward every argument to cmake as before
+  *) shift ;;  # drop the variant so "$@" holds only cmake arguments
+esac
+
+rm -rf "build_${BUILD_SUFFIX}_${VARIANT}" 2>/dev/null
+mkdir "build_${BUILD_SUFFIX}_${VARIANT}" && cd "build_${BUILD_SUFFIX}_${VARIANT}" || exit 1
 
 module load cmake/3.14.5
 
+if [ "$VARIANT" = "seq" ]; then
+  argV="Off"
+  RAJA_TPL=../tpl/RAJA
+else
+  argV="On"
+  RAJA_TPL=../tpl/RAJAvec
+fi
+RAJA_HOSTCONFIG=${RAJA_TPL}/host-configs/lc-builds/blueos/xl_X.cmake
+
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/xl/xl-2020.09.17/bin/xlc++_r \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
+  -DENABLE_RAJA_VECTORIZATION=${argV} \
+  -DPERFSUITE_ENABLE_AVX2=Off \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..
diff --git a/scripts/lc-builds/toss3_clang10.0.1.sh b/scripts/lc-builds/toss3_clang10.0.1.sh
index 9b069228e..cc0a5bc43 100755
--- a/scripts/lc-builds/toss3_clang10.0.1.sh
+++ b/scripts/lc-builds/toss3_clang10.0.1.sh
@@ -8,18 +8,36 @@
 ###############################################################################
 
 BUILD_SUFFIX=lc_toss3-clang-10.0.1
-RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
 
-rm -rf build_${BUILD_SUFFIX} 2>/dev/null
-mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+# The first argument names the build variant: "seq" passes
+# ENABLE_RAJA_VECTORIZATION=Off and uses the tpl/RAJA host-config; anything
+# else turns it On and uses the tpl/RAJAvec host-config.
+VARIANT=$1
+case "$VARIANT" in
+  ""|-*) ;;    # no variant given: forward every argument to cmake as before
+  *) shift ;;  # drop the variant so "$@" holds only cmake arguments
+esac
+
+rm -rf "build_${BUILD_SUFFIX}_${VARIANT}" 2>/dev/null
+mkdir "build_${BUILD_SUFFIX}_${VARIANT}" && cd "build_${BUILD_SUFFIX}_${VARIANT}" || exit 1
 
 module load cmake/3.14.5
 
+if [ "$VARIANT" = "seq" ]; then
+  argV="Off"
+  RAJA_TPL=../tpl/RAJA
+else
+  argV="On"
+  RAJA_TPL=../tpl/RAJAvec
+fi
+RAJA_HOSTCONFIG=${RAJA_TPL}/host-configs/lc-builds/toss3/clang_X.cmake
+
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-10.0.1/bin/clang++ \
+  -DCMAKE_CXX_FLAGS=-ffp-contract=fast \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
+  -DENABLE_RAJA_VECTORIZATION=${argV} \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..
diff --git a/scripts/lc-builds/toss3_clang9.0.0.sh b/scripts/lc-builds/toss3_clang9.0.0.sh
index 88678d230..5e0f3e195 100755
--- a/scripts/lc-builds/toss3_clang9.0.0.sh
+++ b/scripts/lc-builds/toss3_clang9.0.0.sh
@@ -11,16 +11,34 @@
 BUILD_SUFFIX=lc_toss3-clang-9.0.0
 RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/clang_X.cmake
 
-rm -rf build_${BUILD_SUFFIX} 2>/dev/null
-mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+# The first argument names the build variant: "seq" passes
+# ENABLE_RAJA_VECTORIZATION=Off and uses the tpl/RAJA host-config; anything
+# else turns it On and uses the tpl/RAJAvec host-config.
+VARIANT=$1
+case "$VARIANT" in
+  ""|-*) ;;    # no variant given: forward every argument to cmake as before
+  *) shift ;;  # drop the variant so "$@" holds only cmake arguments
+esac
+
+rm -rf "build_${BUILD_SUFFIX}_${VARIANT}" 2>/dev/null
+mkdir "build_${BUILD_SUFFIX}_${VARIANT}" && cd "build_${BUILD_SUFFIX}_${VARIANT}" || exit 1
 
 module load cmake/3.14.5
 
+if [ "$VARIANT" = "seq" ]; then
+  argV="Off"
+  RAJA_TPL=../tpl/RAJA
+else
+  argV="On"
+  RAJA_TPL=../tpl/RAJAvec
+fi
+RAJA_HOSTCONFIG=${RAJA_TPL}/host-configs/lc-builds/toss3/clang_X.cmake
+
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-9.0.0/bin/clang++ \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
+  -DENABLE_RAJA_VECTORIZATION=${argV} \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..
diff --git a/scripts/lc-builds/toss3_gcc8.1.0.sh b/scripts/lc-builds/toss3_gcc8.1.0.sh
index 4d52cc356..63feb0ca1 100755
--- a/scripts/lc-builds/toss3_gcc8.1.0.sh
+++ b/scripts/lc-builds/toss3_gcc8.1.0.sh
@@ -11,16 +11,34 @@
 BUILD_SUFFIX=lc_toss3-gcc-8.1.0
 RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/gcc_X.cmake
 
-rm -rf build_${BUILD_SUFFIX} 2>/dev/null
-mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+# The first argument names the build variant: "seq" passes
+# ENABLE_RAJA_VECTORIZATION=Off and uses the tpl/RAJA host-config; anything
+# else turns it On and uses the tpl/RAJAvec host-config.
+VARIANT=$1
+case "$VARIANT" in
+  ""|-*) ;;    # no variant given: forward every argument to cmake as before
+  *) shift ;;  # drop the variant so "$@" holds only cmake arguments
+esac
+
+rm -rf "build_${BUILD_SUFFIX}_${VARIANT}" 2>/dev/null
+mkdir "build_${BUILD_SUFFIX}_${VARIANT}" && cd "build_${BUILD_SUFFIX}_${VARIANT}" || exit 1
 
 module load cmake/3.14.5
 
+if [ "$VARIANT" = "seq" ]; then
+  argV="Off"
+  RAJA_TPL=../tpl/RAJA
+else
+  argV="On"
+  RAJA_TPL=../tpl/RAJAvec
+fi
+RAJA_HOSTCONFIG=${RAJA_TPL}/host-configs/lc-builds/toss3/gcc_X.cmake
+
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-8.1.0/bin/g++ \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
+  -DENABLE_RAJA_VECTORIZATION=${argV} \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..
diff --git a/scripts/lc-builds/toss3_icpc19.1.0.sh b/scripts/lc-builds/toss3_icpc19.1.0.sh
index 00df84d16..030b9253a 100755
--- a/scripts/lc-builds/toss3_icpc19.1.0.sh
+++ b/scripts/lc-builds/toss3_icpc19.1.0.sh
@@ -11,16 +11,35 @@
 BUILD_SUFFIX=lc_toss3-icpc-19.1.0
 RAJA_HOSTCONFIG=../tpl/RAJA/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
 
-rm -rf build_${BUILD_SUFFIX} 2>/dev/null
-mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+# The first argument names the build variant: "seq" passes
+# ENABLE_RAJA_VECTORIZATION=Off and uses the tpl/RAJA host-config; anything
+# else turns it On and uses the tpl/RAJAvec host-config.
+VARIANT=$1
+case "$VARIANT" in
+  ""|-*) ;;    # no variant given: forward every argument to cmake as before
+  *) shift ;;  # drop the variant so "$@" holds only cmake arguments
+esac
+
+rm -rf "build_${BUILD_SUFFIX}_${VARIANT}" 2>/dev/null
+mkdir "build_${BUILD_SUFFIX}_${VARIANT}" && cd "build_${BUILD_SUFFIX}_${VARIANT}" || exit 1
 
 module load cmake/3.14.5
 
+if [ "$VARIANT" = "seq" ]; then
+  argV="Off"
+  RAJA_TPL=../tpl/RAJA
+else
+  argV="On"
+  RAJA_TPL=../tpl/RAJAvec
+fi
+RAJA_HOSTCONFIG=${RAJA_TPL}/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
+
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-19.1.0/bin/icpc \
+  -DCMAKE_CXX_FLAGS="-xCORE-AVX2" \
   -C ${RAJA_HOSTCONFIG} \
   -DENABLE_OPENMP=On \
+  -DENABLE_RAJA_VECTORIZATION=${argV} \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..
diff --git a/src/basic/DAXPY-Seq.cpp b/src/basic/DAXPY-Seq.cpp index 4c792a28f..0b795dbeb 100644 --- a/src/basic/DAXPY-Seq.cpp +++ b/src/basic/DAXPY-Seq.cpp @@ -76,7 +76,50 @@ void DAXPY::runSeqVariant(VariantID vid) break; } + + case RAJA_Vec : { + +#if(0) + DAXPY_DATA_VEC_SETUP; + + auto daxpy_vec_lam = [=](RAJA::VectorIndex i) { + DAXPY_VEC_BODY; + }; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), daxpy_vec_lam); + } + stopTimer(); +#endif + +#if(0) + DAXPY_DATA_VEC_SETUP2; + + auto daxpy_vec_lam = [=](RAJA::VectorIndex i) { + DAXPY_VEC_BODY2; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), daxpy_vec_lam); + } + stopTimer(); +#endif + +#if(1) + DAXPY_DATA_VEC_SETUP3; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + DAXPY_VEC_BODY3; + } + stopTimer(); #endif + break; + } +#endif //RUN_RAJA_VEC default : { std::cout << "\n DAXPY : Unknown variant id = " << vid << std::endl; diff --git a/src/basic/DAXPY.cpp b/src/basic/DAXPY.cpp index 3edd8da78..5b5788b08 100644 --- a/src/basic/DAXPY.cpp +++ b/src/basic/DAXPY.cpp @@ -27,6 +27,7 @@ DAXPY::DAXPY(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/basic/DAXPY.hpp b/src/basic/DAXPY.hpp index 1d1a56c96..6b822a0a5 100644 --- a/src/basic/DAXPY.hpp +++ b/src/basic/DAXPY.hpp @@ -20,11 +20,53 @@ #define DAXPY_DATA_SETUP \ Real_ptr x = m_x; \ Real_ptr y = m_y; \ - Real_type a = m_a; + const Real_type a = m_a; #define DAXPY_BODY \ y[i] += a * x[i] ; +#define DAXPY_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I");\ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> X(x, iend); \ + RAJA::TypedView, I> Y(y, iend); + +#define 
DAXPY_DATA_VEC_SETUP2 \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> Xview(x, iend); \ + RAJA::TypedView, I> Yview(y, iend); \ + RAJA::forall> (RAJA::TypedRangeSegment(ibegin, iend),\ + [=](RAJA::VectorIndex i) { \ + vector_t X(0), Y(0); \ + for(int j = 0; j < i.size(); ++j) { \ + X.set(j, *(x + (**i) + j)); \ + Y.set(j, *(y + (**i) + j)); \ + } \ + Xview(i) = X; \ + Yview(i) = Y; \ + }); + +#define DAXPY_DATA_VEC_SETUP3 \ + RAJA_INDEX_VALUE_T(I, Int_type, "I");\ + using element_t = RAJA::StreamVector::element_type; \ + element_t X[iend], Y[iend]; \ + for(int i = 0; i < iend; ++i) { \ + X[i] = x[i]; \ + Y[i] = y[i]; \ + } + +#define DAXPY_VEC_BODY \ + Y(i) += a * X(i); + +#define DAXPY_VEC_BODY2 \ + Yview(i) += a*Xview(i); + +#define DAXPY_VEC_BODY3 \ + for(int i = 0;i < iend; ++i){ \ + Y[i] += a * X[i]; \ + y[i] = Y[i]; \ + } #include "common/KernelBase.hpp" diff --git a/src/basic/INIT3-Seq.cpp b/src/basic/INIT3-Seq.cpp index 1483bc9df..0d427f03c 100644 --- a/src/basic/INIT3-Seq.cpp +++ b/src/basic/INIT3-Seq.cpp @@ -76,6 +76,25 @@ void INIT3::runSeqVariant(VariantID vid) break; } + + case RAJA_Vec : { + INIT3_VEC_SETUP; + + auto init3_vec_lam = [=](RAJA::VectorIndex i) { + INIT3_VEC_BODY; + }; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), init3_vec_lam); + + } + stopTimer(); + + break; + } + #endif // RUN_RAJA_SEQ default : { diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 1adbe906c..baedc297f 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -27,6 +27,7 @@ INIT3::INIT3(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 1a0216344..d9ea2b48f 100644 --- 
a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -25,9 +25,20 @@ Real_ptr in1 = m_in1; \ Real_ptr in2 = m_in2; +#define INIT3_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> O1(out1, iend); \ + RAJA::TypedView, I> O2(out2, iend); \ + RAJA::TypedView, I> O3(out3, iend); \ + RAJA::TypedView, I> I1(in1, iend); \ + RAJA::TypedView, I> I2(in2, iend); + #define INIT3_BODY \ out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; +#define INIT3_VEC_BODY \ + O1(i) = O2(i) = O3(i) = -1 * I1(i) - I2(i); #include "common/KernelBase.hpp" diff --git a/src/basic/INIT_VIEW1D-Seq.cpp b/src/basic/INIT_VIEW1D-Seq.cpp index 89f186b9b..5289f6a76 100644 --- a/src/basic/INIT_VIEW1D-Seq.cpp +++ b/src/basic/INIT_VIEW1D-Seq.cpp @@ -82,6 +82,19 @@ void INIT_VIEW1D::runSeqVariant(VariantID vid) break; } + + case RAJA_Vec : { + + INIT_VIEW1D_DATA_VEC_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + INIT_VIEW1D_VEC_BODY; + } + stopTimer(); + + break; + } #endif // RUN_RAJA_SEQ default : { diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp index 8f8fed084..a2e220e7c 100644 --- a/src/basic/INIT_VIEW1D.cpp +++ b/src/basic/INIT_VIEW1D.cpp @@ -27,6 +27,7 @@ INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp index 77a9eebe6..cb0ecf463 100644 --- a/src/basic/INIT_VIEW1D.hpp +++ b/src/basic/INIT_VIEW1D.hpp @@ -39,6 +39,20 @@ const RAJA::Layout<1> my_layout(iend); \ ViewType view(a, my_layout); +#define INIT_VIEW1D_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> Aview(a, iend); + +#define INIT_VIEW1D_VEC_BODY \ + RAJA::forall> 
(RAJA::TypedRangeSegment(ibegin, iend),\ + [=](RAJA::VectorIndex i) { \ + vector_t A(0); \ + for(int j = 0; j < i.size(); ++j) { \ + A.set(j, (**i + j + 1) * v); \ + } \ + Aview(i) = A; \ + }); #include "common/KernelBase.hpp" diff --git a/src/basic/MULADDSUB-Seq.cpp b/src/basic/MULADDSUB-Seq.cpp index dfda9d17b..62a6c9622 100644 --- a/src/basic/MULADDSUB-Seq.cpp +++ b/src/basic/MULADDSUB-Seq.cpp @@ -76,6 +76,27 @@ void MULADDSUB::runSeqVariant(VariantID vid) break; } + + case RAJA_Vec : { + + MULADDSUB_DATA_VEC_SETUP; + + auto mas_lam = [=](RAJA::VectorIndex i) { + MULADDSUB_VEC_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), mas_lam); + + } + stopTimer(); + + break; + } + #endif // RUN_RAJA_SEQ default : { diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index fd46c3718..7568ad3ea 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -27,6 +27,7 @@ MULADDSUB::MULADDSUB(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index 48a664f63..9a9a7b376 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -31,6 +31,19 @@ out2[i] = in1[i] + in2[i] ; \ out3[i] = in1[i] - in2[i] ; +#define MULADDSUB_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> O1(out1, iend); \ + RAJA::TypedView, I> O2(out2, iend); \ + RAJA::TypedView, I> O3(out3, iend); \ + RAJA::TypedView, I> I1(in1, iend); \ + RAJA::TypedView, I> I2(in2, iend); + +#define MULADDSUB_VEC_BODY \ + O1(i) = I1(i) * I2(i); \ + O2(i) = I1(i) + I2(i); \ + O3(i) = I1(i) - I2(i); #include "common/KernelBase.hpp" diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp 
index f339cbe3a..ae34b5a56 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -97,6 +97,7 @@ void KernelBase::runKernel(VariantID vid) case Base_Seq : case Lambda_Seq : case RAJA_Seq : + case RAJA_Vec : { #if defined(RUN_RAJA_SEQ) runSeqVariant(vid); diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index a5581e196..6c01e7c1c 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -213,6 +213,7 @@ static const std::string VariantNames [] = std::string("Base_Seq"), std::string("Lambda_Seq"), std::string("RAJA_Seq"), + std::string("RAJA_Vec"), std::string("Base_OpenMP"), std::string("Lambda_OpenMP"), @@ -302,7 +303,8 @@ bool isVariantAvailable(VariantID vid) } #if defined(RUN_RAJA_SEQ) if ( vid == Lambda_Seq || - vid == RAJA_Seq ) { + vid == RAJA_Seq || + vid == RAJA_Vec) { ret_val = true; } #endif diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 30675df3e..d6f964126 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -184,6 +184,7 @@ enum VariantID { Base_Seq = 0, Lambda_Seq, RAJA_Seq, + RAJA_Vec, Base_OpenMP, Lambda_OpenMP, diff --git a/src/lcals/FIRST_SUM-Seq.cpp b/src/lcals/FIRST_SUM-Seq.cpp index bc05f7e5f..e45f00516 100644 --- a/src/lcals/FIRST_SUM-Seq.cpp +++ b/src/lcals/FIRST_SUM-Seq.cpp @@ -76,6 +76,19 @@ void FIRST_SUM::runSeqVariant(VariantID vid) break; } + + case RAJA_Vec : { + + FIRST_SUM_VEC_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + FIRST_SUM_VEC_BODY; + } + stopTimer(); + + break; + } #endif // RUN_RAJA_SEQ default : { diff --git a/src/lcals/FIRST_SUM.cpp b/src/lcals/FIRST_SUM.cpp index c849efdf7..f98793414 100644 --- a/src/lcals/FIRST_SUM.cpp +++ b/src/lcals/FIRST_SUM.cpp @@ -27,6 +27,7 @@ FIRST_SUM::FIRST_SUM(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( 
Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/lcals/FIRST_SUM.hpp b/src/lcals/FIRST_SUM.hpp index 76b2c3552..14a9e78ec 100644 --- a/src/lcals/FIRST_SUM.hpp +++ b/src/lcals/FIRST_SUM.hpp @@ -25,6 +25,21 @@ Real_ptr x = m_x; \ Real_ptr y = m_y; +#define FIRST_SUM_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using element_t = RAJA::StreamVector::element_type; \ + element_t X[iend], Y[iend]; \ + for(int i = 0; i < iend; ++i) { \ + X[i] = x[i]; \ + Y[i] = y[i]; \ + } + +#define FIRST_SUM_VEC_BODY \ + for(int i = 1; i < iend; ++i) { \ + X[i] = Y[i-1] + Y[i]; \ + x[i] = X[i]; \ + } + #define FIRST_SUM_BODY \ x[i] = y[i-1] + y[i]; diff --git a/src/polybench/POLYBENCH_2MM-Seq.cpp b/src/polybench/POLYBENCH_2MM-Seq.cpp index 4485ef2d8..64fe36800 100644 --- a/src/polybench/POLYBENCH_2MM-Seq.cpp +++ b/src/polybench/POLYBENCH_2MM-Seq.cpp @@ -156,10 +156,10 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) RAJA::RangeSegment{0, nk}), RAJA::tuple{0.0}, - poly_2mm_lam1, - poly_2mm_lam2, + poly_2mm_lam1, + poly_2mm_lam2, poly_2mm_lam3 - ); + ); RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment{0, ni}, @@ -177,6 +177,19 @@ void POLYBENCH_2MM::runSeqVariant(VariantID vid) break; } + case RAJA_Vec : { + + POLYBENCH_2MM_DATA_VEC_SETUP; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + POLYBENCH_2MM_VEC_BODY1; + } + stopTimer(); + + break; + + } #endif // RUN_RAJA_SEQ default : { diff --git a/src/polybench/POLYBENCH_2MM.cpp b/src/polybench/POLYBENCH_2MM.cpp index 3bac5f0ad..a81d4dcc7 100644 --- a/src/polybench/POLYBENCH_2MM.cpp +++ b/src/polybench/POLYBENCH_2MM.cpp @@ -58,6 +58,7 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 
1171fb185..22c77f0eb 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -55,7 +55,7 @@ Real_type dot = 0.0; #define POLYBENCH_2MM_BODY2 \ - dot += alpha * A[k + i*nk] * B[j + k*nj]; + dot += alpha * A[k + i*nk] * B[j + k*nj];\ #define POLYBENCH_2MM_BODY3 \ tmp[j + i*nj] = dot; @@ -99,6 +99,97 @@ using VIEW_TYPE = RAJA::View(nj, nl)); \ VIEW_TYPE Dview(D, RAJA::Layout<2>(ni, nl)); +#define POLYBENCH_2MM_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(II, Int_type, "II"); \ + RAJA_INDEX_VALUE_T(IJ, Int_type, "IJ"); \ + RAJA_INDEX_VALUE_T(IK, Int_type, "IK"); \ + RAJA_INDEX_VALUE_T(IL, Int_type, "IL"); \ + using matrix_t = RAJA::RegisterMatrix; \ + std::array perm {{0,1}}; \ + RAJA::TypedView, II, IK> Aview(A, RAJA::make_permuted_layout({{ni, nk}}, perm)); \ + RAJA::TypedView, IK, IJ> Bview(B, RAJA::make_permuted_layout({{nk, nj}}, perm)); \ + RAJA::TypedView, II, IJ> Tmpview(tmp, RAJA::make_permuted_layout({{ni, nj}}, perm)); \ + RAJA::TypedView, IJ, IL> Cview(C, RAJA::make_permuted_layout({{nj, nl}}, perm)); \ + RAJA::TypedView, II, IL> Dview(D, RAJA::make_permuted_layout({{ni, nl}}, perm)); \ + using RowA = RAJA::RowIndex; \ + using ColA = RAJA::ColIndex; \ + using ColB = RAJA::ColIndex; \ + using RowT = RAJA::RowIndex; \ + using ColT = RAJA::ColIndex; \ + using ColC = RAJA::ColIndex; \ + using EXECPOL = \ + RAJA::KernelPolicy< \ + RAJA::statement::For<2, RAJA::matrix_col_exec, \ + RAJA::statement::For<1, RAJA::matrix_col_exec, \ + RAJA::statement::For<0, RAJA::matrix_row_exec, \ + RAJA::statement::Lambda<0> \ + > \ + > \ + > \ + >; \ + for(int i = 0; i < ni*nk; i++) { \ + A[i] = A[i] * alpha; \ + } + +#define POLYBENCH_2MM_VEC_BODY1 \ + std::memset(tmp, 0, ni*nj* sizeof(Real_type));\ + for(int i = 0; i < ni*nl; i++) { \ + D[i] = beta; \ + } \ + auto segments1 = RAJA::make_tuple(RAJA::TypedRangeSegment(0, ni),\ + RAJA::TypedRangeSegment(0, nk),\ + RAJA::TypedRangeSegment(0, nj)); \ + RAJA::kernel( segments1, \ + [=] (RowA i, ColA k, ColB j) 
{ \ + Tmpview(i, j) += Aview(i, k) * Bview(toRowIndex(k), j); \ + } \ + );\ + auto segments2 = RAJA::make_tuple(RAJA::TypedRangeSegment(0, ni),\ + RAJA::TypedRangeSegment(0, nj),\ + RAJA::TypedRangeSegment(0, nl)); \ + RAJA::kernel( segments2, \ + [=] (RowT i, ColT j, ColC l) { \ + Dview(i, l) += Tmpview(i, j) * Cview(toRowIndex(j), l); \ + } \ + ); +// for (II i(0); i < ni; i++ ) { \ +// for (IL l(0); l < nl; l++) { \ +// dot = 0; \ +// for (IJ j(0); j < nj; j++) { \ +// dot += Tmpview(i,j) * Cview(j,l); \ +// std::cout << "Tmp and C view: " << Tmpview(i,j) << " " << Cview(j,l) << " " << dot << std::endl; \ +// } \ +// std::cout << dot << " " << Dview(i,l) << std::endl;\ +// } \ +// } +// for (II i(0); i < ni; i++ ) { \ +// for (IJ j(0); j < nj; j++) { \ +// dot = 0; \ +// for (IK k(0); k < nk; k++) { \ +// dot += Aview(i,k) * Bview(k,j); \ +// std::cout << "A and B view: " << Aview(i,k) << " " << Bview(k,j) << " " << dot << std::endl; \ +// } \ +// std::cout << dot << " " << Tmpview(i,j) << std::endl;\ +// } \ +// } + +//#define POLYBENCH_2MM_VEC_BODY1 \ +// for (Index_type i = 0; i < ni; i++ ) { \ +// for (Index_type j = 0; j < nj; j++) { \ +// dot = 0.; \ +// for (Index_type k = 0; k < nk; k++) { \ +// dot += alpha * a.get(i,k) * b.get(k,j); \ +// std::cout << "dot: " << dot << " " << a.get(i,k) << " " << a.get(k,i) << " " << A[k+i*nk] << " " << b.get(j,k) << " " << b.get(k,j) << " " << B[j+k*nj] << std::endl; \ +// } \ +// c.set(i,j,dot); \ +// } \ +// } + //std::cout << "dot: " << dot << " " << a.get(i,k) << " " << a.get(k,i) << " " << A[i+k*ni] << " " << b.get(j,k) << " " << b.get(k,j) << " " << B[k+j*nk] << std::endl; \ + //auto c = a * b; \ + //for (Index_type i = 0; i < ni; i++ ) { \ + // for (Index_type j = 0; j < nj; j++) { \ + // } \ + //} #include "common/KernelBase.hpp" diff --git a/src/polybench/POLYBENCH_GEMVER-OMP.cpp b/src/polybench/POLYBENCH_GEMVER-OMP.cpp index 843ad1608..60b3ef761 100644 --- a/src/polybench/POLYBENCH_GEMVER-OMP.cpp +++ 
b/src/polybench/POLYBENCH_GEMVER-OMP.cpp @@ -198,8 +198,8 @@ void POLYBENCH_GEMVER::runOpenMPVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), poly_gemver_lam1 ); diff --git a/src/polybench/POLYBENCH_GEMVER-Seq.cpp b/src/polybench/POLYBENCH_GEMVER-Seq.cpp index a1563e84b..46d15ef5f 100644 --- a/src/polybench/POLYBENCH_GEMVER-Seq.cpp +++ b/src/polybench/POLYBENCH_GEMVER-Seq.cpp @@ -189,8 +189,8 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment{0, n}, - RAJA::RangeSegment{0, n}), + RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), poly_gemver_lam1 ); @@ -204,7 +204,7 @@ void POLYBENCH_GEMVER::runSeqVariant(VariantID vid) poly_gemver_lam4 ); - RAJA::forall (RAJA::RangeSegment{0, n}, + RAJA::forall (RAJA::RangeSegment(0, n), poly_gemver_lam5 ); diff --git a/src/stream/ADD-Seq.cpp b/src/stream/ADD-Seq.cpp index 3ef8b81c9..387746f86 100644 --- a/src/stream/ADD-Seq.cpp +++ b/src/stream/ADD-Seq.cpp @@ -76,7 +76,27 @@ void ADD::runSeqVariant(VariantID vid) break; } -#endif // RUN_RAJA_SEQ + + case RAJA_Vec : { + + ADD_DATA_VEC_SETUP; + + auto add_vec_lam = [=](RAJA::VectorIndex i) { + ADD_VEC_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), add_vec_lam); + + } + stopTimer(); + + break; + } +#endif //RUN_RAJA_SEQ default : { std::cout << "\n ADD : Unknown variant id = " << vid << std::endl; diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 00cb4ecf0..76287c8d7 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -27,6 +27,7 @@ ADD::ADD(const RunParams& params) setVariantDefined( Base_Seq 
); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 542c253b0..f54ccc981 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -20,11 +20,27 @@ #define ADD_DATA_SETUP \ Real_ptr a = m_a; \ Real_ptr b = m_b; \ - Real_ptr c = m_c; + Real_ptr c = m_c; \ + RAJA_INDEX_VALUE_T(I, int, "I");\ + using vector_t = RAJA::StreamVector;\ + using VecI = RAJA::VectorIndex;\ + RAJA::TypedView, I> A(a, getRunSize()); \ + RAJA::TypedView, I> B(b, getRunSize()); \ + RAJA::TypedView, I> C(c, getRunSize()); + #define ADD_BODY \ c[i] = a[i] + b[i]; +#define ADD_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> A(a, iend); \ + RAJA::TypedView, I> B(b, iend); \ + RAJA::TypedView, I> C(c, iend); + +#define ADD_VEC_BODY \ + C(i) = A(i) + B(i); #include "common/KernelBase.hpp" diff --git a/src/stream/COPY-Seq.cpp b/src/stream/COPY-Seq.cpp index 68bc51e4d..45e218418 100644 --- a/src/stream/COPY-Seq.cpp +++ b/src/stream/COPY-Seq.cpp @@ -76,6 +76,26 @@ void COPY::runSeqVariant(VariantID vid) break; } + case RAJA_Vec : { + + COPY_DATA_VEC_SETUP; + + auto copy_vec_lam = [=](RAJA::VectorIndex i) { + COPY_VEC_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), copy_vec_lam); + + } + stopTimer(); + + break; + } + #endif // RUN_RAJA_SEQ default : { diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index 8da987150..761e2e15b 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -27,6 +27,7 @@ COPY::COPY(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git 
a/src/stream/COPY.hpp b/src/stream/COPY.hpp index c0e2c455a..566e962b0 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -21,9 +21,17 @@ Real_ptr a = m_a; \ Real_ptr c = m_c; +#define COPY_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> A(a, iend); \ + RAJA::TypedView, I> C(c, iend); + #define COPY_BODY \ c[i] = a[i] ; +#define COPY_VEC_BODY \ + C(i) = A(i); #include "common/KernelBase.hpp" diff --git a/src/stream/MUL-Seq.cpp b/src/stream/MUL-Seq.cpp index 7b36935e1..53bc6b8fa 100644 --- a/src/stream/MUL-Seq.cpp +++ b/src/stream/MUL-Seq.cpp @@ -76,6 +76,26 @@ void MUL::runSeqVariant(VariantID vid) break; } + + case RAJA_Vec : { + + MUL_DATA_VEC_SETUP; + + auto mul_vec_lam = [=](RAJA::VectorIndex i) { + MUL_VEC_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), mul_vec_lam); + + } + stopTimer(); + + break; + } #endif // RUN_RAJA_SEQ default : { diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 0dc16b674..8b17d8962 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -27,6 +27,7 @@ MUL::MUL(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 25943fcbe..33e77ed8f 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -22,9 +22,17 @@ Real_ptr c = m_c; \ Real_type alpha = m_alpha; +#define MUL_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> B(b, iend); \ + RAJA::TypedView, I> C(c, iend); + #define MUL_BODY \ b[i] = alpha * c[i] ; +#define MUL_VEC_BODY \ + B(i) = alpha * C(i); #include "common/KernelBase.hpp" diff --git a/src/stream/TRIAD-Seq.cpp b/src/stream/TRIAD-Seq.cpp 
index 06885d50c..3829d18f1 100644 --- a/src/stream/TRIAD-Seq.cpp +++ b/src/stream/TRIAD-Seq.cpp @@ -76,6 +76,27 @@ void TRIAD::runSeqVariant(VariantID vid) break; } + + case RAJA_Vec : { + + TRIAD_DATA_VEC_SETUP; + + auto triad_vec_lam = [=](RAJA::VectorIndex i) { + TRIAD_VEC_BODY; + }; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::TypedRangeSegment(ibegin, iend), triad_vec_lam); + + } + stopTimer(); + + break; + } + #endif // RUN_RAJA_SEQ default : { diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index 3e42feeda..33eb8b3a3 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -27,6 +27,7 @@ TRIAD::TRIAD(const RunParams& params) setVariantDefined( Base_Seq ); setVariantDefined( Lambda_Seq ); setVariantDefined( RAJA_Seq ); + setVariantDefined( RAJA_Vec ); setVariantDefined( Base_OpenMP ); setVariantDefined( Lambda_OpenMP ); diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 23ffd168a..3952b71f6 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -23,9 +23,18 @@ Real_ptr c = m_c; \ Real_type alpha = m_alpha; +#define TRIAD_DATA_VEC_SETUP \ + RAJA_INDEX_VALUE_T(I, Int_type, "I"); \ + using vector_t = RAJA::StreamVector; \ + RAJA::TypedView, I> A(a, iend); \ + RAJA::TypedView, I> B(b, iend); \ + RAJA::TypedView, I> C(c, iend); + #define TRIAD_BODY \ a[i] = b[i] + alpha * c[i] ; +#define TRIAD_VEC_BODY \ + A(i) = B(i) + alpha * C(i); #include "common/KernelBase.hpp" diff --git a/tpl/RAJAvec b/tpl/RAJAvec new file mode 160000 index 000000000..e3dc2f988 --- /dev/null +++ b/tpl/RAJAvec @@ -0,0 +1 @@ +Subproject commit e3dc2f988042f785298719b530867e92192a3c0e