diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp
index a5aa923d05..c245e3818f 100644
--- a/include/RAJA/util/reduce.hpp
+++ b/include/RAJA/util/reduce.hpp
@@ -91,6 +91,14 @@ struct LeftFoldReduce
     m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val));
   }
 
+  /*!
+      \brief combine a value into the reducer
+  */
+  RAJA_HOST_DEVICE RAJA_INLINE void operator+=(T val)
+  {
+    combine(std::move(val));
+  }
+
 private:
   BinaryOp m_op;
   T m_accumulated_value;
@@ -214,6 +222,14 @@ struct BinaryTreeReduce
     ++m_count;
   }
 
+  /*!
+      \brief combine a value into the reducer
+  */
+  RAJA_HOST_DEVICE RAJA_INLINE void operator+=(T val)
+  {
+    combine(std::move(val));
+  }
+
 private:
 
   BinaryOp m_op;
@@ -241,6 +257,80 @@ struct BinaryTreeReduce
   }
 };
 
+/*!
+    \brief Reduce class that does a Kahan (compensated) summation.
+*/
+template <typename T>
+struct KahanSum
+{
+  static_assert(std::is_floating_point_v<T>, "T must be a floating point type");
+
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit KahanSum(
+      T init = T()) noexcept
+      : m_accumulated_value(std::move(init)),
+        m_accumulated_carry(T())
+  {}
+
+  KahanSum(KahanSum const&) = delete;
+  KahanSum& operator=(KahanSum const&) = delete;
+  KahanSum(KahanSum&&) = delete;
+  KahanSum& operator=(KahanSum&&) = delete;
+
+  ~KahanSum() = default;
+
+  /*!
+      \brief reset the combined value of the reducer to the identity
+  */
+  RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
+  {
+    m_accumulated_value = T();
+    m_accumulated_carry = T();
+  }
+
+  /*!
+      \brief return the combined value and clear the reducer
+  */
+  RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
+  {
+    T accumulated_value = std::move(m_accumulated_value);
+
+    clear();
+
+    return accumulated_value;
+  }
+
+  /*!
+      \brief return the combined value
+  */
+  RAJA_HOST_DEVICE RAJA_INLINE T get() { return m_accumulated_value; }
+
+  /*!
+      \brief combine a value into the reducer
+  */
+  RAJA_HOST_DEVICE RAJA_INLINE void combine(T val)
+  {
+    // volatile is used to prevent compiler optimizations that assume
+    // floating-point operations are associative
+    T y = val - m_accumulated_carry;
+    volatile T t = m_accumulated_value + y;
+    volatile T z = t - m_accumulated_value;
+    m_accumulated_carry = z - y;
+    m_accumulated_value = t;
+  }
+
+  /*!
+      \brief combine a value into the reducer
+  */
+  RAJA_HOST_DEVICE RAJA_INLINE void operator+=(T val)
+  {
+    combine(std::move(val));
+  }
+
+private:
+  T m_accumulated_value;
+  T m_accumulated_carry;
+};
+
 template <typename T, typename BinaryOp>
 using HighAccuracyReduce =
     std::conditional_t<std::is_floating_point<T>::value,
@@ -291,6 +381,24 @@ binary_tree_reduce(Iter begin, Iter end, T init, BinaryOp op)
   return reducer.get_and_clear();
 }
 
+/*!
+    \brief Combine a range into a single value using a Kahan sum
+    in O(N) operations and O(1) extra memory
+*/
+template <typename Iter, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T kahan_sum(Iter begin, Iter end, T init)
+{
+  KahanSum<T> reducer(std::move(init));
+
+  for (; begin != end; ++begin)
+  {
+
+    reducer.combine(*begin);
+  }
+
+  return reducer.get_and_clear();
+}
+
 /*!
     \brief reducer that uses a high accuracy implementation when round-off
     error is a concern, or a faster algorithm with it is not a concern
@@ -358,6 +466,22 @@ RAJA_HOST_DEVICE RAJA_INLINE
                             std::move(op));
 }
 
+/*!
+    \brief Sum a given range to a single value
+    using Kahan (compensated) summation in O(N) operations and O(1) extra memory
+    see https://en.wikipedia.org/wiki/Kahan_summation_algorithm
+*/
+template <typename Container,
+          typename T = RAJA::detail::ContainerVal<Container>>
+RAJA_HOST_DEVICE RAJA_INLINE concepts::
+    enable_if_t<T, type_traits::is_range<Container>, std::is_floating_point<T>>
+    kahan_sum(Container&& c, T init = T())
+{
+  using std::begin;
+  using std::end;
+
+  return detail::kahan_sum(begin(c), end(c), std::move(init));
+}
+
 /*!
     \brief Reduce given range to a single value
     using an algorithm with high accuracy when floating point round off is a
diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt
index 2c241b6f6d..ecd6027055 100644
--- a/test/unit/algorithm/CMakeLists.txt
+++ b/test/unit/algorithm/CMakeLists.txt
@@ -89,7 +89,7 @@ if(RAJA_ENABLE_HIP)
 endif()
 
 
-set( UTIL_REDUCES BinaryTree Accumulate )
+set( UTIL_REDUCES BinaryTree Accumulate Kahan )
 
 RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Sequential Default "${UTIL_REDUCES}" )
diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp
index 4e3f9fb795..4c685767c7 100644
--- a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp
@@ -37,14 +37,45 @@
 
 // tag classes to differentiate reduce by attributes and apply correct testing
-struct left_fold_reduce_tag { };
-struct unordered_reduce_tag { };
+struct left_fold_reduce_tag
+{
+  static constexpr const char* name() { return "left fold reduce"; }
+};
+struct unordered_reduce_tag
+{
+  static constexpr const char* name() { return "unordered reduce"; }
+};
 
-struct reduce_interface_tag { };
+struct sum_interface_tag
+{
+  static constexpr const char* name() { return "sum interface"; }
+};
+struct reduce_interface_tag
+{
+  static constexpr const char* name() { return "reduce interface"; }
+};
 
-struct reduce_default_interface_tag { };
-struct reduce_init_interface_tag { };
-struct reduce_init_op_interface_tag { };
+struct any_types_tag {
+  template < typename >
+  static constexpr bool matches() { return true; }
+};
+struct floating_point_types_tag {
+  template < typename T >
+  static constexpr bool matches() { return std::is_floating_point_v<T>; }
+};
+
+struct reduce_default_interface_tag
+{
+  static constexpr const char* name() { return "called with ()"; }
+};
+struct reduce_init_interface_tag
+{
+  static constexpr const char* name() { return "called with (init)"; }
+};
+struct reduce_init_op_interface_tag
+{
+  static constexpr const char* name() { return "called with (init, op)"; }
+};
 
 
 // synchronize based on a RAJA execution policy
@@ -85,10 +116,7 @@ struct PolicySynchronize
 
 
 template <typename test_policy, typename ValType>
-struct ReduceData;
-
-template <typename ValType>
-struct ReduceData<test_seq, ValType>
+struct ReduceData
 {
   ValType* values = nullptr;
   ValType* reduced_value = nullptr;
@@ -133,100 +161,92 @@ struct ReduceData
 
 
 template <typename test_policy, typename ValType,
           typename T, typename BinaryOp,
           typename Reducer, typename InterfaceTag>
-void doReduce(ReduceData<test_policy, ValType> & data,
+bool doReduce(ReduceData<test_policy, ValType> & data,
               RAJA::Index_type N,
               T,
               BinaryOp,
-              Reducer reducer, reduce_interface_tag, reduce_default_interface_tag)
+              Reducer reducer, InterfaceTag, reduce_default_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
   reducer(data.reduced_value, RAJA::make_span(data.values, N));
   reducer.synchronize();
+  return true;
 }
 
 template <typename test_policy, typename ValType,
           typename T, typename BinaryOp,
           typename Reducer, typename InterfaceTag>
-void doReduce(ReduceData<test_policy, ValType> & data,
-              RAJA::Index_type N,
-              T init,
-              BinaryOp,
-              Reducer reducer, reduce_interface_tag, reduce_init_interface_tag)
+bool doReduce(ReduceData<test_policy, ValType> & data,
+              RAJA::Index_type N,
+              T init,
+              BinaryOp,
+              Reducer reducer, InterfaceTag, reduce_init_interface_tag)
 {
   data.copy_data(N);
   data.resource().wait();
   reducer(data.reduced_value, RAJA::make_span(data.values, N), init);
   reducer.synchronize();
+  return true;
 }
 
 template <typename test_policy, typename ValType,
           typename T, typename BinaryOp, typename Reducer>
-void doReduce(ReduceData<test_policy, ValType> & data,
-              RAJA::Index_type N,
-              T init,
-              BinaryOp op,
-              Reducer reducer, reduce_interface_tag, reduce_init_op_interface_tag)
+bool doReduce(ReduceData<test_policy, ValType> &,
+              RAJA::Index_type,
+              T,
+              BinaryOp,
+              Reducer, sum_interface_tag, reduce_init_op_interface_tag)
 {
-  data.copy_data(N);
-  data.resource().wait();
-  reducer(data.reduced_value, RAJA::make_span(data.values, N), init, op);
-  reducer.synchronize();
+  // can't do this with the sum interface
+  return false;
 }
 
-
-template <typename test_policy, typename ValType,
-          typename T, typename BinaryOp,
-          typename TestReducer, typename BinaryOpInterface>
-::testing::AssertionResult testReduce(
-    const char* test_name,
-    const unsigned seed,
-    ReduceData<test_policy, ValType> & data,
-    RAJA::Index_type N,
-    T init,
-    BinaryOp op,
-    TestReducer test_reducer, left_fold_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci)
+template <typename test_policy, typename ValType,
+          typename T, typename BinaryOp,
+          typename Reducer>
+bool doReduce(ReduceData<test_policy, ValType> & data,
+              RAJA::Index_type N,
+              T init,
+              BinaryOp op,
+              Reducer reducer, reduce_interface_tag, reduce_init_op_interface_tag)
 {
-  doReduce(data, N, init, op, test_reducer, si, ci);
-
-  T reduced_check_value = init;
-  for (RAJA::Index_type i = 0; i < N; i++) {
-    reduced_check_value = op(std::move(reduced_check_value), data.values[i]);
-  }
-
-  if (reduced_check_value != *data.reduced_value) {
-    return ::testing::AssertionFailure()
-           << test_reducer.name() << " (left fold reduce) " << test_name
-           << " (with N " << N << " with seed " << seed << ")"
-           << " incorrect " << *data.reduced_value
-           << ", expected " << reduced_check_value;
-  }
-
-  return ::testing::AssertionSuccess();
+  data.copy_data(N);
+  data.resource().wait();
+  reducer(data.reduced_value, RAJA::make_span(data.values, N), init, op);
+  reducer.synchronize();
+  return true;
 }
 
+
 template <typename test_policy, typename ValType,
           typename T, typename BinaryOp, typename TestReducer,
-          typename BinaryOpInterface>
+          typename OrderTag,
+          typename DataInterfaceTag,
+          typename TestInterfaceTag>
 ::testing::AssertionResult testReduce(
     const char* test_name,
     const unsigned seed,
     ReduceData<test_policy, ValType> & data,
     RAJA::Index_type N,
     T init,
    BinaryOp op,
-    TestReducer test_reducer, unordered_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci)
+    TestReducer test_reducer, OrderTag, DataInterfaceTag di, TestInterfaceTag ti)
 {
-  doReduce(data, N, init, op, test_reducer, si, ci);
+  bool did_reduce = doReduce(data, N, init, op, test_reducer, di, ti);
+  if (!did_reduce) {
+    return ::testing::AssertionSuccess();
+  }
 
   T reduced_check_value = init;
   for (RAJA::Index_type i = 0; i < N; i++) {
@@ -235,7 +255,10 @@ ::testing::AssertionResult testReduce(
 
   if (reduced_check_value != *data.reduced_value) {
     return ::testing::AssertionFailure()
-           << test_reducer.name() << " (unordered reduce) " << test_name
+           << test_reducer.name()
+           << " (" << OrderTag::name() << ")"
+           << " (" << TestInterfaceTag::name() << ")"
+           << " " << test_name
            << " (with N " << N << " with seed " << seed << ")"
            << " incorrect " << *data.reduced_value
            << ", expected " << reduced_check_value;
@@ -252,24 +275,26 @@ void testReducerInterfaces(unsigned seed, RAJA::Index_type MaxN, Reducer reducer
 {
   using reduce_category    = typename Reducer::reduce_category ;
   using interface_category = typename Reducer::reduce_interface ;
-  using no_init_operator = reduce_default_interface_tag;
-  using init_no_operator = reduce_init_interface_tag;
-  using init_operator    = reduce_init_op_interface_tag;
-
-  std::mt19937 rng(seed);
-  RAJA::Index_type N = std::uniform_int_distribution<RAJA::Index_type>((MaxN+1)/2, MaxN)(rng);
-  std::uniform_int_distribution<RAJA::Index_type> dist(-N, N);
-
-  ReduceData<test_policy, ValType> data(N, res, [&](){ return dist(rng); });
-
-  ASSERT_TRUE(testReduce("default", seed, data, N, RAJA::operators::plus<ValType>::identity(), RAJA::operators::plus<ValType>{},
-                         reducer, reduce_category{}, interface_category{}, no_init_operator{}));
-  ASSERT_TRUE(testReduce("init", seed, data, N, ValType(N), RAJA::operators::plus<ValType>{},
-                         reducer, reduce_category{}, interface_category{}, init_no_operator{}));
-  ASSERT_TRUE(testReduce("minimum", seed, data, N, ValType(0), RAJA::operators::minimum<ValType>{},
-                         reducer, reduce_category{}, interface_category{}, init_operator{}));
-  ASSERT_TRUE(testReduce("Maximum", seed, data, N, ValType(0), RAJA::operators::maximum<ValType>{},
-                         reducer, reduce_category{}, interface_category{}, init_operator{}));
+  using types_category     = typename Reducer::reduce_types ;
+
+  if constexpr (types_category::template matches<ValType>()) {
+
+    std::mt19937 rng(seed);
+    RAJA::Index_type N = std::uniform_int_distribution<RAJA::Index_type>((MaxN+1)/2, MaxN)(rng);
+    std::uniform_int_distribution<RAJA::Index_type> dist(-N, N);
+
+    ReduceData<test_policy, ValType> data(N, res, [&](){ return dist(rng); });
+
+    EXPECT_TRUE(testReduce("default", seed, data, N, RAJA::operators::plus<ValType>::identity(), RAJA::operators::plus<ValType>{},
+                           reducer, reduce_category{}, interface_category{}, reduce_default_interface_tag{}));
+    EXPECT_TRUE(testReduce("init", seed, data, N, ValType(N), RAJA::operators::plus<ValType>{},
+                           reducer, reduce_category{}, interface_category{}, reduce_init_interface_tag{}));
+    EXPECT_TRUE(testReduce("minimum", seed, data, N, ValType(0), RAJA::operators::minimum<ValType>{},
+                           reducer, reduce_category{}, interface_category{}, reduce_init_op_interface_tag{}));
+    EXPECT_TRUE(testReduce("maximum", seed, data, N, ValType(0), RAJA::operators::maximum<ValType>{},
+                           reducer, reduce_category{}, interface_category{}, reduce_init_op_interface_tag{}));
+
+  }
 }
diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp
--- a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp
+++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp
 template < typename test_policy, typename platform = test_platform >
 struct Accumulate;
 
+template < typename test_policy, typename platform = test_platform >
+struct KahanSum;
+
 
 template < typename test_policy >
 struct BinaryTreeReduce
@@ -40,6 +43,7 @@ struct BinaryTreeReduce
 {
   using reduce_category = unordered_reduce_tag;
   using reduce_interface = reduce_interface_tag;
+  using reduce_types = any_types_tag;
 
   const char* name()
   {
@@ -59,6 +63,7 @@ struct Accumulate
 {
   using reduce_category = left_fold_reduce_tag;
   using reduce_interface = reduce_interface_tag;
+  using reduce_types = any_types_tag;
 
   const char* name()
   {
@@ -72,6 +77,26 @@ struct Accumulate
   }
 };
 
+template < typename test_policy >
+struct KahanSum
+  : ForoneSynchronize
+{
+  using reduce_category = unordered_reduce_tag;
+  using reduce_interface = sum_interface_tag;
+  using reduce_types = floating_point_types_tag;
+
+  const char* name()
+  {
+    return "RAJA::kahan_sum";
+  }
+
+  template < typename T, typename... Args >
+  void operator()(T* reduced_value, Args&&... args)
+  {
+    *reduced_value = RAJA::kahan_sum(std::forward<Args>(args)...);
+  }
+};
+
 
 #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
 
 template < typename test_policy >
 struct BinaryTreeReduce
@@ -80,6 +105,7 @@ struct BinaryTreeReduce
 {
   using reduce_category = unordered_reduce_tag;
   using reduce_interface = reduce_interface_tag;
+  using reduce_types = any_types_tag;
 
   std::string m_name;
 
@@ -123,6 +149,7 @@ struct Accumulate
 {
   using reduce_category = left_fold_reduce_tag;
   using reduce_interface = reduce_interface_tag;
+  using reduce_types = any_types_tag;
 
   std::string m_name;
 
@@ -160,6 +187,43 @@ struct Accumulate
   }
 };
 
+
+template < typename test_policy >
+struct KahanSum
+  : ForoneSynchronize
+{
+  using reduce_category = unordered_reduce_tag;
+  using reduce_interface = sum_interface_tag;
+  using reduce_types = floating_point_types_tag;
+
+  std::string m_name;
+
+  KahanSum()
+    : m_name(std::string("RAJA::kahan_sum<") + test_policy_info<test_policy>::name() + std::string(">"))
+  { }
+
+  const char* name()
+  {
+    return m_name.c_str();
+  }
+
+  template < typename T, typename Container >
+  void operator()(T* reduced_value, Container&& c)
+  {
+    forone<test_policy>( [=] RAJA_DEVICE() {
+      *reduced_value = RAJA::kahan_sum(c);
+    });
+  }
+
+  template < typename T, typename Container >
+  void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal<Container> init)
+  {
+    forone<test_policy>( [=] RAJA_DEVICE() {
+      *reduced_value = RAJA::kahan_sum(c, init);
+    });
+  }
+};
+
 #endif
 
 
@@ -173,6 +237,11 @@ using SequentialAccumulateReduceReducers =
     Accumulate<test_seq>
   >;
 
+using SequentialKahanReduceReducers =
+  camp::list<
+    KahanSum<test_seq>
+  >;
+
 
 #if defined(RAJA_ENABLE_CUDA)
 
 using CudaBinaryTreeReduceReducers =
@@ -185,6 +254,11 @@ using CudaAccumulateReduceReducers =
    Accumulate<test_cuda>
  >;
 
+using CudaKahanReduceReducers =
+  camp::list<
+    KahanSum<test_cuda>
+  >;
+
 #endif
 
 #if defined(RAJA_ENABLE_HIP)
 
@@ -199,6 +273,11 @@ using HipAccumulateReduceReducers =
    Accumulate<test_hip>
  >;
 
+using HipKahanReduceReducers =
+  camp::list<
+    KahanSum<test_hip>
+  >;
+
 #endif
 
 #endif //__TEST_ALGORITHM_UTIL_REDUCE_HPP__
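
For context on the compensated-summation step this patch adds, the short standalone sketch below mirrors the y/t/z/carry update used by KahanSum::combine above (including the volatile locals) and contrasts it with a naive left-fold float sum on inputs small enough to be lost to round-off. The KahanAccumulator name and the specific constants are illustrative assumptions only; they are not part of the RAJA API changed by this diff.

#include <cstdio>

// Standalone stand-in mirroring the combine() update in this patch:
//   y = val - carry; t = sum + y; carry = (t - sum) - y; sum = t;
// "KahanAccumulator" is a hypothetical name used for illustration only.
struct KahanAccumulator {
  float sum = 0.0f;
  float carry = 0.0f;

  void combine(float val) {
    // volatile keeps the compiler from re-associating the floating-point
    // operations and folding the correction term away
    float y = val - carry;
    volatile float t = sum + y;
    volatile float z = t - sum;
    carry = z - y;
    sum = t;
  }
};

int main() {
  // Add 10 million values of 1.0e-8f to an accumulator that starts at 1.0f.
  // Each addend is below half an ulp of 1.0f, so a plain left-fold float sum
  // never moves, while the compensated sum recovers the missing ~0.1.
  float naive = 1.0f;
  KahanAccumulator kahan;
  kahan.combine(1.0f);

  for (int i = 0; i < 10000000; ++i) {
    naive += 1.0e-8f;        // lost to round-off every iteration
    kahan.combine(1.0e-8f);  // tracked through the carry term
  }

  // Expected output is roughly: naive = 1.0000000  kahan = 1.1000000
  std::printf("naive = %.7f  kahan = %.7f\n", naive, kahan.sum);
  return 0;
}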