diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h index 9da7185be84..581160fccb1 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h @@ -894,4 +894,13 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) { } #endif // !__OPENMP_AMDGCN__ +#define HIP_IMPL_GENERATE_SCAN_FUNC(OP, TYPE_ALIAS, TYPE) \ + extern "C" __device__ __attribute__((const)) TYPE __ockl_wfscan_ ## OP ## _ ## TYPE_ALIAS(TYPE, bool);\ +\ + template \ + __device__ __forceinline__ TYPE scan_ ## OP(TYPE val)\ + {\ + return __ockl_wfscan_ ## OP ## _ ## TYPE_ALIAS(val, Inclusive);\ + } + #endif diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index af5c73288f9..e5e200c99e9 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -22,6 +22,11 @@ #endif namespace cooperative_groups { + class thread_group; +namespace impl { + template +__CG_QUALIFIER__ unsigned long long groupMask(const TyGroup&); +} /** \brief The base type of all cooperative group types. * @@ -32,12 +37,13 @@ namespace cooperative_groups { * on Microsoft Windows. */ class thread_group { + template + friend __CG_QUALIFIER__ unsigned long long cooperative_groups::impl::groupMask(const TyGroup&); protected: __hip_uint32_t _type; //! Type of the thread_group. __hip_uint32_t _num_threads; //! Total number of threads in the thread_group. __hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types, //! LSB represents lane 0, and MSB represents lane 63 - //! Construct a thread group, and set thread group type and other essential //! thread group properties. This generic thread group is directly constructed //! only when the group is supposed to contain only the calling thread @@ -387,6 +393,7 @@ class coalesced_group : public thread_group { friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size); friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred); + template friend __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile& tgrp, bool pred); @@ -1044,6 +1051,36 @@ template class thread_block_tile_internal __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g) : thread_block_tile_type() {} }; + +// becomes to std::true_type if the group is tiled and has a size known at compile time +template +struct isTiledGroup : __hip_internal::false_type { +}; + +template +struct isTiledGroup> + : __hip_internal::integral_constant { +}; + +// returns the size of tile_group provided it is known at compile time +template +struct tiledGroupSize : __hip_internal::integral_constant { + +}; +template +struct tiledGroupSize> + : __hip_internal::integral_constant { +}; + +template +struct isCoalescedGroup : __hip_internal::false_type { +}; + +template <> +struct isCoalescedGroup : __hip_internal::true_type { +}; } // namespace impl /** \brief Group type - thread_block_tile @@ -1287,6 +1324,7 @@ __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile struct plus { __CG_QUALIFIER__ T operator()(T lhs, T rhs) const @@ -1334,6 +1372,153 @@ struct bit_or { return lhs | rhs; } }; + +namespace impl { +// when instantiated with two parameter types, allows to know if there are the same, regardless +// of const/volatile qualifiers +template +using is_param_type_same = __hip_internal::is_same, + typename __hip_internal::remove_cvref>; + +template +struct isArithmeticFunc : __hip_internal::false_type { +}; + +template +struct isArithmeticFunc> : __hip_internal::true_type { +}; + +template +struct isArithmeticFunc> : __hip_internal::true_type { +}; + +template +struct isArithmeticFunc> : __hip_internal::true_type { +}; + +template +struct isBooleanFunc : __hip_internal::false_type { +}; + +template +struct isBooleanFunc> : __hip_internal::true_type { +}; + +template +struct isBooleanFunc> : __hip_internal::true_type { +}; + +template +struct isBooleanFunc> : __hip_internal::true_type { +}; + +// this is the value to return in exclusive_scan, for lane 0 +template +struct CGIdentity { + __CG_QUALIFIER__ T operator()() + { + T result = {}; + return result; + } +}; + +template +struct CGIdentity> { + __CG_QUALIFIER__ T operator()() + { + T result {}; + return ~result; + } +}; + +template +struct CGIdentity> { + __CG_QUALIFIER__ T operator()() + { + // CUDA would return 0 in this case. But in our case we mimic what __ockl_wfscan_* + // would do + return __hip_internal::NumericLimits::maximum(); + } +}; + +template +struct CGIdentity> { + __CG_QUALIFIER__ T operator()() + { + return __hip_internal::NumericLimits::minimum(); + } +}; + +// calculates the necessary warp mask for cooperative groups that support reduce(), or +// inclusive/exlcusive_scan() +template +__CG_QUALIFIER__ unsigned long long groupMask(const TyGroup& group) +{ + unsigned long long mask = ~0ull; + + if constexpr (impl::isCoalescedGroup::value) { + mask = group.coalesced_info.member_mask; + } else { + // we cannot simply just use the __activemask() here, because more than one tile could have active + // threads at a time; we need to mask away the threads that not part of this tile first + mask >>= (64 - group.num_threads()); + mask <<= (((internal::workgroup::thread_rank() % warpSize) / group.num_threads()) * group.num_threads()); + } + + return mask; +} + +// backward permute implementation for cooperative group operations, i.e. +// for up to 32 bytes (__hip_ds_bpermute() can only do 4 bytes at a time, +// this function calls it (or the floating point version) multiple times to +// implement it for bigger sizes +template ::type = 0> +__CG_QUALIFIER__ void bPermute(T&, T, int from) +{ +} + +// trivial case: the type fits within the permute size +template ::type = 0> +__CG_QUALIFIER__ void bPermute(T& permuteResult, T result, int from) +{ + auto backwardPermute = [](int index, T arg) { + if constexpr (__hip_internal::is_floating_point::value && + sizeof(T) <= 4) { + return __hip_ds_bpermutef(index, arg); + } else { + return __hip_ds_bpermute(index, arg); + } + }; + + if constexpr (sizeof(T) == 2) { + union { + int i; + T f; + } tmp; + + tmp.f = result; + tmp.i = __hip_ds_bpermute(from << 2, tmp.i); + permuteResult = tmp.f; + } else if constexpr (sizeof(T) == 4) { + auto bPermuteResult = backwardPermute(from << 2, result); + __builtin_memcpy(&permuteResult, &bPermuteResult, sizeof(result)); + } else { + static_assert(__hip_internal::is_void::value, "Unexpected type"); + } +} + +// Overload when we need multiple ds_permute, because one is not enough +template +__CG_QUALIFIER__ void bPermute(T permuteResult[NumPermutes], T result[NumPermutes], int from) +{ + // ds_bpermute only deals with 32-bit sizes, so for other sizes + // we need to call the permute multiple times + for (int i = 0; i < NumPermutes; i++) { + permuteResult[i] = __hip_ds_bpermute(from << 2, result[i]); + } +} + +} // namespace impl #endif /** diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h index bdcacd4274f..a0a3b60f406 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h @@ -18,10 +18,6 @@ namespace cooperative_groups { namespace impl { - template - using is_param_type_same = __hip_internal::is_same, - typename __hip_internal::remove_cvref>; - template struct has_add : __hip_internal::false_type { }; @@ -76,25 +72,6 @@ namespace impl { __hip_internal::void_t(0ull, T {}))> > : __hip_internal::true_type {}; - // we can call reduce() only the block tiles that have a compile-time size - template - struct isTiledGroup : __hip_internal::false_type { - }; - - template - struct isTiledGroup> - : __hip_internal::integral_constant { - }; - - template - struct isCoalescedGroup : __hip_internal::false_type { - }; - - template <> - struct isCoalescedGroup : __hip_internal::true_type { - }; } /** \ingroup CooperativeGAPI @@ -126,15 +103,7 @@ __CG_QUALIFIER__ auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> de unsigned long long mask = ~0ull; - // we cannot simply just use the __activemask() here, because more than one tile could have active - // threads at a time; we need to mask away the threads that not part of this tile first - if constexpr (!__hip_internal::is_same::value) { - mask >>= (64 - group.num_threads()); - mask <<= (((threadIdx.x % warpSize) / group.num_threads()) * group.num_threads()); - } - - // for coalesced_groups, the mask is simply the activemask - mask &= __activemask(); + mask = impl::groupMask(group); if constexpr (__hip_internal::is_same>::value && impl::has_add::value) { diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h new file mode 100644 index 00000000000..3a9c2eabdcd --- /dev/null +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h @@ -0,0 +1,304 @@ +#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H +#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H + +#if __cplusplus +#if !defined(__HIPCC_RTC__) +#include +#include +#endif + +#if !defined(__HIPCC_RTC__) +#include +#endif + +namespace cooperative_groups { +namespace impl { + // these functions allow to make use of C++ function overloads, instead of having to code + // a big if-constexpr according to operand type + HIP_IMPL_GENERATE_SCAN_FUNC(add, i32, int); + HIP_IMPL_GENERATE_SCAN_FUNC(add, u32, unsigned int); + + HIP_IMPL_GENERATE_SCAN_FUNC(min, i32, int); + HIP_IMPL_GENERATE_SCAN_FUNC(min, u32, unsigned int); + + HIP_IMPL_GENERATE_SCAN_FUNC(max, i32, int); + HIP_IMPL_GENERATE_SCAN_FUNC(max, u32, unsigned int); + + HIP_IMPL_GENERATE_SCAN_FUNC(and, i32, int); + HIP_IMPL_GENERATE_SCAN_FUNC(and, u32, unsigned int); + + HIP_IMPL_GENERATE_SCAN_FUNC(or, i32, int); + HIP_IMPL_GENERATE_SCAN_FUNC(or, u32, unsigned int); + + HIP_IMPL_GENERATE_SCAN_FUNC(xor, i32, int); + HIP_IMPL_GENERATE_SCAN_FUNC(xor, u32, unsigned int); + + // extra types. Unlike cg::reduce() which depends on reduce_*_sync() functions being defined with + // HIP_ENABLE_EXTRA_WARP_SYNC_TYPES to be able to use the ockl intrinsics, for scan we always + // define them here + HIP_IMPL_GENERATE_SCAN_FUNC(add, i64, long long); + HIP_IMPL_GENERATE_SCAN_FUNC(add, u64, unsigned long long); + HIP_IMPL_GENERATE_SCAN_FUNC(add, f32, float); + HIP_IMPL_GENERATE_SCAN_FUNC(add, f64, double); + + HIP_IMPL_GENERATE_SCAN_FUNC(min, i64, long long); + HIP_IMPL_GENERATE_SCAN_FUNC(min, u64, unsigned long long); + HIP_IMPL_GENERATE_SCAN_FUNC(min, f32, float); + HIP_IMPL_GENERATE_SCAN_FUNC(min, f64, double); + + HIP_IMPL_GENERATE_SCAN_FUNC(max, i64, long long); + HIP_IMPL_GENERATE_SCAN_FUNC(max, u64, unsigned long long); + HIP_IMPL_GENERATE_SCAN_FUNC(max, f32, float); + HIP_IMPL_GENERATE_SCAN_FUNC(max, f64, double); + + HIP_IMPL_GENERATE_SCAN_FUNC(and, i64, long long); + HIP_IMPL_GENERATE_SCAN_FUNC(and, u64, unsigned long long); + + HIP_IMPL_GENERATE_SCAN_FUNC(or, i64, long long); + HIP_IMPL_GENERATE_SCAN_FUNC(or, u64, unsigned long long); + + HIP_IMPL_GENERATE_SCAN_FUNC(xor, i64, long long); + HIP_IMPL_GENERATE_SCAN_FUNC(xor, u64, unsigned long long); + + // not all types could be used with wfscan (e.g. user defined types), this predicate + // indicates whether that is the case + template + struct has_arithmetic_scan : __hip_internal::false_type { + }; + + // all the arithmetic operations accept the same types, so we just based it on the overload being + // present for scan_add + template + struct has_arithmetic_scan(T {}))> + > : __hip_internal::true_type { + }; + + template + struct has_boolean_scan : __hip_internal::false_type { + }; + + // all the arithmetic operations accept the same types, so we just based it on the overload being + // present for scan_and + template + struct has_boolean_scan(T {}))> + > : __hip_internal::true_type { + }; + + // given cooperative_groups template parameter, calls the right impl::scan function (that would contain __ockl_wfscan_*) + // and that is overloaded by type + template + __CG_QUALIFIER__ TyVal call_scan(const TyVal& val) + { + using Val = typename __hip_internal::remove_cvref::type; + + if constexpr (__hip_internal::is_same>::value) { + return impl::scan_add(val); + } else if constexpr (__hip_internal::is_same>::value) { + return impl::scan_min(val); + } else if constexpr (__hip_internal::is_same>::value) { + return impl::scan_max(val); + } else if constexpr (__hip_internal::is_same>::value) { + return impl::scan_and(val); + } else if constexpr (__hip_internal::is_same>::value) { + return impl::scan_or(val); + } else if constexpr (__hip_internal::is_same>::value) { + return impl::scan_xor(val); + } + } + + template + __CG_QUALIFIER__ auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) + { + using Op = typename __hip_internal::remove_cvref::type; + using Val = typename __hip_internal::remove_cvref::type; + + constexpr bool isPrimitiveType = impl::has_arithmetic_scan::value; + using permuteType = typename __hip_internal::conditional::type; + + // the number of backward permutes will be: size(Val) / 4 rounded up + static constexpr int kNumOfPermutes = (sizeof(Val) <= 4)? + 1 : + (sizeof(Val) + sizeof(unsigned int) - 1) / sizeof(unsigned int); + static_assert(cooperative_groups::impl::is_param_type_same::value, "Operator input and output types differ"); + static_assert(__hip_internal::is_trivially_copyable::value, "val must be trivially copyable"); + static_assert(sizeof(Val) <= 32, "scan only operate on values of size up to 32 bytes"); + + if constexpr (!cooperative_groups::impl::isTiledGroup::value && !cooperative_groups::impl::isCoalescedGroup::value) { + static_assert(__hip_internal::is_void::value, "This group does not exclusively represent a tile"); + } + + unsigned int maskNumBits; + int numIterations; + // next bit to aggregate with + unsigned int maskIdx; + unsigned int laneId = __lane_id(); + int nextBit = laneId; + unsigned long long mask = ~0ull; + + mask = impl::groupMask(group); + maskNumBits = __popcll(mask); + maskIdx = __popcll(((1ull << laneId) - 1) & mask); + + if (laneId) { + mask <<= 64 - laneId; + } + +#ifdef __OPTIMIZE__ // at the time of this writing the ockl wfscan functions do not compile when + // using -O0 + if (impl::isTiledGroup::value) { + // for tiled_groups we know at compile time that whether we can call the ockl intrinsics or + // not; if the block tile is actually the whole warp + if (impl::tiledGroupSize::value == warpSize) { + if constexpr (impl::isArithmeticFunc::value && impl::has_arithmetic_scan::value || + impl::isBooleanFunc::value && impl::has_boolean_scan::value) { + return impl::call_scan(val); + } + } + } else if constexpr (impl::isCoalescedGroup::value) { + // for the coalesced_group case we do need to check at runtime, adding a slight overhead on + // this branch + if (maskNumBits == warpSize) { + if constexpr (impl::isArithmeticFunc::value && impl::has_arithmetic_scan::value || + impl::isBooleanFunc::value && impl::has_boolean_scan::value) { + return impl::call_scan(val); + } + } + } +#endif + + // unsigned int[N] is used in some cases, e.g. when T is wider than 32-bit + using ResultType = typename __hip_internal::conditional< + isPrimitiveType && (sizeof(Val) == 4 || sizeof(Val) == 2), permuteType, + permuteType[kNumOfPermutes]>::type; + static constexpr int alignment = alignof(Val) <= 4? 4 : alignof(Val); + alignas(alignment) ResultType result; + alignas(alignment) ResultType permuteResult; + + if constexpr (isPrimitiveType && (sizeof(Val) == 2 || sizeof(Val) == 4)) { + result = val; + } else { + __builtin_memcpy(result, &val, sizeof(Val)); + } + + // the number of iterations needs to be at least log2(number of bits on) + numIterations = sizeof(int) * 8 - __clz(maskNumBits); + + if constexpr (impl::isTiledGroup::value) { + // the number of bits in the mask is always a power of 2 when + // tiled blocks are used and in that case we need an iteration less + numIterations -= 1; + } else { + // in the coalesced_threads case it depends, we are not sure whether + // it is a power of 2, or not so we check + if (!(maskNumBits & (maskNumBits - 1))) { + numIterations -= 1; + } + } + + int modulo = 1; + + while (numIterations) { + int offset = modulo >> 1; + int increment = modulo - offset; + int nextPos = maskIdx - offset - increment; + bool insideLanes = nextPos >= 0; + + if (insideLanes) { + int next; + + // find the position to aggregate with + for (int i = 0; i < increment; i++) { + next = __builtin_clzll(mask) + 1; + mask <<= next; + nextBit -= next; + } + } + + // clamp index; if out of bounds, the thread read its own value + nextBit = (nextBit < (laneId & ~(warpSize - 1))) ? laneId : nextBit; + bPermute(permuteResult, result, nextBit); + + if (insideLanes) { + if constexpr (!isPrimitiveType) { + Val toReturn; + toReturn = op(*reinterpret_cast(permuteResult), *reinterpret_cast(result)); + __builtin_memcpy(result, &toReturn, sizeof(Val)); + } else if constexpr (sizeof(Val) == 4 || sizeof(Val) == 2) { + result = op(permuteResult, result); + } else if constexpr (sizeof(Val) == 8) { + Val tmp; + unsigned long long rhs = + (static_cast(permuteResult[1]) << 32) | permuteResult[0]; + __builtin_memcpy(&tmp, result, sizeof(Val)); + tmp = op(*reinterpret_cast(&rhs), tmp); + __builtin_memcpy(result, &tmp, sizeof(Val)); + } + } + + modulo <<= 1; + numIterations--; + } + + if constexpr (Inclusive) { + return *reinterpret_cast(&result); + } else { + int nextBit = laneId; + + mask = impl::groupMask(group); + + if (laneId) { + mask <<= 64 - laneId; + nextBit -= mask? __builtin_clzll(mask) + 1 : 0; + } else { + mask = 0ull; + } + + // clamp index; if out of bounds, the thread read its own value + nextBit = (nextBit < (laneId & ~(warpSize - 1))) ? laneId : nextBit; + bPermute(permuteResult, result, nextBit); + + if (mask) { + return *reinterpret_cast(&permuteResult); + } else { + impl::CGIdentity identity; + + return identity(); + } + } + } +} + + template + __CG_QUALIFIER__ auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) + -> decltype(op(val, val)) + { + return impl::scan(group, val, op); + } + + template + __CG_QUALIFIER__ auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) + -> decltype(op(val, val)) + { + return impl::scan(group, val, op); + } + + template + __CG_QUALIFIER__ auto inclusive_scan(const TyGroup& group, TyVal&& val) + { + using Val = typename __hip_internal::remove_cvref::type; + + return impl::scan(group, val, cooperative_groups::plus()); + } + + template + __CG_QUALIFIER__ auto exclusive_scan(const TyGroup& group, TyVal&& val) + { + using Val = typename __hip_internal::remove_cvref::type; + + return impl::scan(group, val, cooperative_groups::plus()); + } +} +#endif // __cplusplus +#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h index d1ba8e16a09..e4ccbc37da7 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h @@ -940,6 +940,20 @@ inline __device__ __half unsafeAtomicAdd(__half* address, __half value) { if (is_lower) return __low2half(out); return __high2half(out); } + +namespace __hip_internal { +template <> +struct NumericLimits<__half> { + static constexpr __half maximum() { + __half_raw raw { .x = 0x7C00U }; + return __half(raw); + } + static constexpr __half minimum() { + __half_raw raw { .x = 0xFC00U }; + return __half(raw); + } +}; +} // namespace __hip_internal #endif // defined(__clang__) && defined(__HIP__) // Math functions @@ -1163,8 +1177,17 @@ template __device__ inline __half __reduce_max_sync(MaskT mask, return __reduce_op_sync(mask, val, op, wfReduce); } +#endif -#endif // __HIP_NO_HALF_OPERATORS__ +#if !defined(__HIP_NO_HALF_OPERATORS__) +namespace cooperative_groups { +namespace impl { +HIP_IMPL_GENERATE_SCAN_FUNC(add, f16, __half); +HIP_IMPL_GENERATE_SCAN_FUNC(min, f16, __half); +HIP_IMPL_GENERATE_SCAN_FUNC(max, f16, __half); +} +} +#endif #endif // defined(__cplusplus) #elif defined(__GNUC__) || defined(_MSC_VER) diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 535da836851..930c2853c3b 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -314,6 +314,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf 1 : (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int); static constexpr auto kMaskNumBits = sizeof(MaskT) * 8; + static constexpr auto alignment = alignof(T) <= 4? 4 : alignof(T); static_assert(__hip_internal::is_integral::value && sizeof(MaskT) == 8, "The mask must be a 64-bit integer. " "Implicitly promoting a smaller integer is almost always an error."); @@ -330,9 +331,13 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf int lastLane = kMaskNumBits - leadingZeroes - 1; int maskNumBits; int numIterations; + // unsigned int[N] is used in some cases, e.g. when T is wider than 32-bit - typename __hip_internal::conditional::type result, permuteResult; + using ResultType = typename __hip_internal::conditional< + isPrimitiveType && (sizeof(T) == 4 || sizeof(T) == 2), + permuteType, permuteType[kNumOfPermutes]>::type; + alignas(alignment) ResultType result; + alignas(alignment) ResultType permuteResult; auto backwardPermute = [](int index, permuteType arg) { if constexpr (__hip_internal::is_floating_point::value && sizeof(T) <= 4) { @@ -372,7 +377,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf if constexpr (isPrimitiveType && (sizeof(T) == 2 || sizeof(T) == 4)) { result = val; } else { - __builtin_memcpy(result, &val, sizeof(result)); + __builtin_memcpy(result, &val, sizeof(T)); } // add the values from the lanes using a reduction tree (first the threads with even-numbered diff --git a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h index f0d1b573753..bc81d4ace54 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h +++ b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h @@ -309,6 +309,50 @@ template constexpr index_sequence make_index_sequence_value(index_sequence) { return {}; } + +// An equivalent of std::numeric_limits::max() and lowest(). Note that the +// class name and methods have been changed intentionally to reflect the fact that is not +// a one-to-one replacement of std::numeric_limits and also to avoid a name collision with +// the Win32 max() macro +template +struct NumericLimits; + +template <> +struct NumericLimits { + static constexpr int maximum() { return 0x7FFFFFFF; } + static constexpr int minimum() { return ~0x7FFFFFFF; } +}; + +template <> +struct NumericLimits { + static constexpr unsigned int maximum() { return 0xFFFFFFFFu; } + static constexpr unsigned int minimum() { return 0u; } +}; + +template <> +struct NumericLimits { + static constexpr long long maximum() { return 0x7FFFFFFFFFFFFFFFLL; } + static constexpr long long minimum() { return ~0x7FFFFFFFFFFFFFFFLL; } +}; + +template <> +struct NumericLimits { + static constexpr unsigned long long maximum() { return 0xFFFFFFFFFFFFFFFFull; } + static constexpr unsigned long long minimum() { return 0ull; } +}; + +template <> +struct NumericLimits { + static constexpr float maximum() { return __builtin_bit_cast(float, 0x7f800000); } + static constexpr float minimum() { return -maximum(); } +}; + +template <> +struct NumericLimits { + static constexpr double maximum() { return __builtin_bit_cast(double, 0x7FF0000000000000LL); } + static constexpr double minimum() { return -maximum(); } +}; + } // namespace __hip_internal typedef __hip_internal::uint8_t __hip_uint8_t; typedef __hip_internal::uint16_t __hip_uint16_t; diff --git a/projects/clr/hipamd/src/hiprtc/CMakeLists.txt b/projects/clr/hipamd/src/hiprtc/CMakeLists.txt index ca021702f98..872115e9182 100644 --- a/projects/clr/hipamd/src/hiprtc/CMakeLists.txt +++ b/projects/clr/hipamd/src/hiprtc/CMakeLists.txt @@ -165,6 +165,7 @@ ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_warp_sync_functions.h ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/hip_cooperative_groups_helper.h ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups.h ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h +${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_unsafe_atomics.h ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_atomic.h ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/math_fwd.h diff --git a/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml b/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml index 8df088707f1..ce7ba045700 100644 --- a/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml +++ b/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml @@ -208,6 +208,7 @@ cooperativeGrps: Unit_Thread_Block_Coalesced_Reduce_arithmetic: *level_2 Unit_Thread_Block_Tile_Reduce_Basic: *level_2 Unit_Thread_Block_Tile_Reduce_Custom_Op: *level_2 + Unit_Thread_Block_Tile_Scan_Custom_Op: *level_2 Unit_Thread_Block_Tile_Reduce_Random_arithmetic: *level_2 Unit_Thread_Block_Tile_Reduce_Random_boolean: *level_2 Unit_Thread_Block_Coalesced_Reduce_arithmetic: @@ -222,6 +223,10 @@ cooperativeGrps: <<: *level_2 # Rock_Window_Failures_on_gfx1151 disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Scan_All_Parameter_Sizes: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] Unit_Thread_Block_Tile_Reduce_Basic: <<: *level_2 # Rock_Window_Failures_on_gfx1151 @@ -230,6 +235,10 @@ cooperativeGrps: <<: *level_2 # Rock_Window_Failures_on_gfx1151 disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Scan_Custom_Op: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] Unit_Thread_Block_Tile_Reduce_Standard_Op_Custom_Type: <<: *level_2 # Rock_Window_Failures_on_gfx1151 @@ -238,6 +247,10 @@ cooperativeGrps: <<: *level_2 # Rock_Window_Failures_on_gfx1151 disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Scan_Trivially_Copyable_Parameters: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] Unit_Thread_Block_Tile_Reduce_All_Parameter_Sizes: <<: *level_2 # Rock_Window_Failures_on_gfx1151 @@ -250,3 +263,39 @@ cooperativeGrps: <<: *level_2 # Rock_Window_Failures_on_gfx1151 disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Multi_Dimensional_Reduce: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Inclusive_Scan_Basic: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Exclusive_Scan_Basic: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Scan_Random_arithmetic: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_Scan_Random_boolean: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Tile_2D_3D_Blocks: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Coalesced_Scan_arithmetic: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Coalesced_Scan_boolean: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] + Unit_Thread_Block_Scan_partition: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows, amd_wsl] diff --git a/projects/hip-tests/catch/config/configs/unit/rtc.yaml b/projects/hip-tests/catch/config/configs/unit/rtc.yaml index 425dcdb4a06..a22da00d221 100644 --- a/projects/hip-tests/catch/config/configs/unit/rtc.yaml +++ b/projects/hip-tests/catch/config/configs/unit/rtc.yaml @@ -215,3 +215,9 @@ rtc: tags: [compile] Unit_Rtc_CoopReduce: <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows] + Unit_Rtc_CoopScan: + <<: *level_2 + # Rock_Window_Failures_on_gfx1151 + disabled: [amd_windows] diff --git a/projects/hip-tests/catch/include/warp_common.hh b/projects/hip-tests/catch/include/warp_common.hh index 67ff46d696b..1d8a72c3452 100644 --- a/projects/hip-tests/catch/include/warp_common.hh +++ b/projects/hip-tests/catch/include/warp_common.hh @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -247,6 +248,14 @@ struct MaxOfAbsolute { } }; +template +struct NonCommutativeOp { + T __host__ __device__ operator()(T i, T j) + { + return std::abs(i) * j; + } +}; + // typeid(T).name() does seem to return a very descriptive name for primitive types, // at least on clang, so we roll out an equivalent template @@ -302,6 +311,8 @@ const char* opToString() #endif else if constexpr (std::is_same>::value) return "MaxOfAbsolute"; + else if constexpr (std::is_same>::value) + return "NonCommutativeOp"; else { return "unknown operator"; } @@ -315,7 +326,7 @@ void genRandomMasks(LinearAllocGuard& d_buf, { // masks must be != 0, hence passing 1 as the 'a' distribution parameter int wavefrontSize = getWarpSize(); - std::uniform_int_distribution dist(1, wavefrontSize == 64? ~0ull : (1ul << 32) - 1); + std::uniform_int_distribution dist(1, wavefrontSize == 64? ~0ull : (1ull << 32) - 1); std::uniform_int_distribution distNoHoles(1, getWarpSize() - 2); int numBytes = numItems * sizeof(T); LinearAllocGuard tmp(LinearAllocs::malloc, numBytes); @@ -391,82 +402,220 @@ void genRandomBuffers(LinearAllocGuard& d_buf, HIP_CHECK(hipMemcpy(d_buf.ptr(), buf.ptr(), numBytes, hipMemcpyHostToDevice)); } +enum class AggregationType { Reduce, + InclusiveScan, + ExclusiveScan, + /// @brief an inclusive scan with the default operator, i.e. cg::plus + InclusiveScanDefault, + /// @brief an exclusive scan with the default operator, i.e. cg::plus + ExclusiveScanDefault }; + +inline bool isInclusive(AggregationType aggType) +{ + switch (aggType) { + case AggregationType::Reduce: + return true; + case AggregationType::InclusiveScan: + return true; + case AggregationType::ExclusiveScan: + return false; + case AggregationType::InclusiveScanDefault: + return true; + case AggregationType::ExclusiveScanDefault: + return false; + default: + assert(false && "Unknown aggregation type"); + return "unknown"; + } +} + +inline const char* aggregationTypeToStr(AggregationType aggType) +{ + switch (aggType) { + case AggregationType::Reduce: + return "reduce"; + case AggregationType::InclusiveScan: + return "inclusive scan"; + case AggregationType::ExclusiveScan: + return "exclusive scan"; + case AggregationType::InclusiveScanDefault: + return "inclusive scan plus"; + case AggregationType::ExclusiveScanDefault: + return "exclusive scan plus"; + default: + assert(false && "Unknown aggregation type"); + return "unknown"; + } +} + +constexpr uint64_t nextPowerOf2(uint64_t v) { + v += (v == 0); + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + return ++v; +} + +// Sets id to the expected value exclusive_scan returns for the first active lane +template +void scanIdentity(T& id) +{ + T result = {}; + + if constexpr (std::is_same>::value || std::is_same>::value) { + if constexpr (std::is_same::value) { + result = HIPRT_INF_FP16; + } else if (std::is_floating_point::value) { + result = std::numeric_limits::infinity(); + } else { + result = std::numeric_limits::max(); + } + } else if constexpr (std::is_same>::value || std::is_same>::value) { + if constexpr (std::is_same::value) { + result = -HIPRT_INF_FP16; + } else if (std::is_floating_point::value) { + result = -std::numeric_limits::infinity(); + } else { + result = std::numeric_limits::lowest(); + } + } else if constexpr (std::is_same>::value) { + result = ~result; + } else if constexpr (std::is_same>::value) { + } else { + std::memset(&result, 0, sizeof(T)); + } + + id = result; +} + // given an operation produces the expected result of the warp-wide reduction // @mask indicates the lanes that will participate in the computation +// @return the result associated the lane with the highest index that is active according to the +// mask template -T calculateExpected(const T* input, Op& op, unsigned long long mask) +T calculateExpected(T* output, + const T* input, + Op op, + unsigned long long mask, + AggregationType aggType) { T result; - int wavefrontSize = getWarpSize(); + bool inclusive = aggType != AggregationType::ExclusiveScan && aggType != AggregationType::ExclusiveScanDefault; + int lastLane = 64 - __builtin_clzll(mask) - 1; + T aggregation[64]; + // the results for the previous step of the aggregation + T lastAggregation[64]; + // identity value for this type and operator + T id {}; - if constexpr (std::is_same>::value -#if HT_AMD - || std::is_same>::value -#endif - ) { - T tmp[64] = { 0 }; + std::memset(aggregation, 0, 64 * sizeof(T)); + std::memset(lastAggregation, 0, 64 * sizeof(T)); + + scanIdentity(id); + + for (int i = 0; i < 64; i++) { + output[i] = id; + } - for (int i = 0; i < wavefrontSize; i++) { - if (mask & (1ul << i)) { - tmp[i] = input[i]; + if constexpr (std::is_same>::value || + std::is_same>::value ) { + for (int i = 0; i < lastLane + 1; i++) { + if (mask & (1ull << i)) { + aggregation[i] = input[i]; + lastAggregation[i] = input[i]; } } - for (int modulo = 2; modulo <= wavefrontSize; modulo *= 2) { - for (int i = 0; i < wavefrontSize; i += modulo) { - int j = i + modulo / 2; + for (int modulo = 2; modulo <= nextPowerOf2(lastLane + 1); modulo *= 2) { + for (int i = 0; i < lastLane + 1; i += 1) { + int j = i - modulo / 2; - if (j < wavefrontSize) - tmp[i] += tmp[j]; + if (j >= 0) { + aggregation[i] += lastAggregation[j]; + } } + std::memcpy(lastAggregation, aggregation, sizeof(lastAggregation)); } - result = tmp[0]; -#if HT_AMD + + for (int i = 0; i < lastLane + 1; i += 1) { + if (inclusive) { + output[i] = aggregation[i]; + } else if (i > 0) { + output[i] = aggregation[i - 1]; + } + } + + result = output[lastLane]; } else if constexpr (std::is_same>::value) { MinOp minOp; - return calculateExpected(input, minOp, mask); + return calculateExpected(output, input, minOp, mask, aggType); } else if constexpr (std::is_same>::value) { MaxOp maxOp; - return calculateExpected(input, maxOp, mask); + return calculateExpected(output, input, maxOp, mask, aggType); } else if constexpr (std::is_same>::value) { std::bit_xor xorOp; - return calculateExpected(input, xorOp, mask); + return calculateExpected(output, input, xorOp, mask, aggType); } else if constexpr (std::is_same>::value) { std::bit_or orOp; - return calculateExpected(input, orOp, mask); + return calculateExpected(output, input, orOp, mask, aggType); } else if constexpr (std::is_same>::value) { std::bit_and andOp; - return calculateExpected(input, andOp, mask); -#endif + return calculateExpected(output, input, andOp, mask, aggType); } else { bool initialized = false; - for (int i = 0; i < wavefrontSize; i++) { - if (mask & (1ul << i)) { - if (initialized) - result = op(input[i], result); - else { + result = id; + + for (int i = 0; i < lastLane + 1; i++) { + if (mask & (1ull << i)) { + if (initialized) { + if (inclusive) { + result = op(result, input[i]); + output[i] = result; + } else { + output[i] = result; + result = op(result, input[i]); + } + } else { result = input[i]; + + if (inclusive) { + output[i] = result; + } else { + output[i] = id; + } initialized = true; } } } } + + if (aggType == AggregationType::Reduce) { + for (int i = 0; i < lastLane + 1; i++) { + output[i] = result; + } + } + return result; } template -void printMismatch(const T& result, const T& expected, const T* input, unsigned long long mask) +void printMismatch(const T& result, const T& expected, const T* input, unsigned long long mask, int laneId) { std::ios init(NULL); init.copyfmt(std::cout); - std::cout << "\nMismatch\n"; + std::cout << "\nMismatch at lane: " << laneId << "\n"; std::cout << "Mask: 0x" << std::hex << std::setfill('0') << std::setw(16) << mask << "\n"; + std::cout << "Input:\n"; std::cout.copyfmt(init); for (int i = 0; i < getWarpSize(); i++) { - if ((1ul << i) & mask) { + if ((1ull << i) & mask) { if constexpr (std::is_same::value) { const unsigned char* ptr = reinterpret_cast(&input[i]); @@ -490,10 +639,12 @@ void printMismatch(const T& result, const T& expected, const T* input, unsigned } } -template -void compareFloatingPoint(const T& result, const T& expected, unsigned long long mask, const T* input) +template +void compareFloatingPoint(const T& result, const T& expected, unsigned long long mask, const T* input, int laneId) { using namespace Catch::Matchers; + std::string opName = opToString(); + if constexpr (std::is_same::value) { float resultFloat = __half2float(result); float expectedFloat = __half2float(expected); @@ -501,18 +652,24 @@ void compareFloatingPoint(const T& result, const T& expected, unsigned long long float relativeEpsilon = 0.1 * fmax(resultFloat, expectedFloat); float eps = 0.01f; + if constexpr (std::is_same::value) { + INFO("result: 0x" << std::hex << __half_as_ushort(result)); + } else { + INFO("result: 0x" << std::hex << result); + } + REQUIRE(!__hisnan(result)); - REQUIRE(!__hisinf(result)); if (relativeEpsilon > eps) { if (absDifference > 0.0001) { if (absDifference >= eps * fabs(fmax(resultFloat, expectedFloat))) { - printMismatch(result, expected, input, mask); + printMismatch(result, expected, input, mask, laneId); std::cout << "Relative epsilon: " << relativeEpsilon << "\n"; std::cout << "Difference: " << absDifference << "\n"; } - } + } + INFO("Operator: " << opName << " mask: 0x" << std::hex << mask); REQUIRE_THAT(__half2float(resultFloat), WithinRel(expectedFloat, eps)); } } else { @@ -524,11 +681,12 @@ void compareFloatingPoint(const T& result, const T& expected, unsigned long long if (relativeEpsilon > eps) { if (absDifference > 0.0001) { if (absDifference >= eps * fabs(fmax(result, expected))) { - printMismatch(result, expected, input, mask); + printMismatch(result, expected, input, mask, laneId); std::cout << "Relative epsilon: " << relativeEpsilon << "\n"; std::cout << "Difference: " << absDifference << "\n"; } + INFO("Operator: " << opName << " mask: 0x" << std::hex << mask); REQUIRE_THAT(result, WithinRel(expected, eps)); } } @@ -557,6 +715,7 @@ void runTestReduce(int iteration, Reduce reduce) LinearAllocGuard input, d_input; LinearAllocGuard masks, d_masks; Op op; + std::string opName = opToString>(); int numReduce = 0; genRandomBuffers(d_input, input, dist, gen, kNumReduces * wavefrontSize); @@ -566,15 +725,20 @@ void runTestReduce(int iteration, Reduce reduce) HIP_CHECK(hipMemcpy(output.ptr(), d_output.ptr(), d_output.size_bytes(), hipMemcpyDeviceToHost)); while (numReduce < kNumReduces) { + T expectedByLane[64]; T* waveInput = &input.ptr()[numReduce * wavefrontSize]; - T expected = calculateExpected(waveInput, op, masks.ptr()[numReduce]); + T expected = calculateExpected(expectedByLane, + waveInput, + op, + masks.ptr()[numReduce], + AggregationType::Reduce); int lane = 0; while (lane < wavefrontSize) { auto result = output.ptr()[numReduce * wavefrontSize + lane]; unsigned long long mask = masks.ptr()[numReduce]; - if ((1ul << lane) & mask) { + if ((1ull << lane) & mask) { if constexpr (std::is_integral::value || std::is_same, MinOp>::value || std::is_same, MaxOp>::value) { // for integral types or min/max the result should match exactly @@ -582,12 +746,13 @@ void runTestReduce(int iteration, Reduce reduce) REQUIRE(__half2float(result) == __half2float(expected)); else { if (result != expected) { - printMismatch(result, expected, waveInput, mask); + printMismatch(result, expected, waveInput, mask, lane); + INFO("Operator: " << opName << " mask: 0x" << std::hex << mask); REQUIRE(result == expected); } } } else - compareFloatingPoint(result, expected, mask, waveInput); + compareFloatingPoint>(result, expected, mask, waveInput, lane); } lane++; diff --git a/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc b/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc index 315633fb188..5af8dbc9702 100644 --- a/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc +++ b/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc @@ -10,6 +10,7 @@ #include "warp_common.hh" #include #include +#include #include #include #include @@ -32,6 +33,22 @@ static constexpr int kBlockDim = 1024; +unsigned long long fullMask, halfHighBitsOn, halfBitsOn, high16BitsOn, high8BitsOn, high4BitsOn, + allButOne; + +static void initializeMasks() +{ + int wavefrontSize = getWarpSize(); + int halfWaveSize = wavefrontSize / 2; + + fullMask = (getWarpSize() == 64)? ~0ull : (1ull << 32) - 1; + halfBitsOn = (1ull << (wavefrontSize / 2)) - 1; + halfHighBitsOn = halfBitsOn << halfWaveSize; + high16BitsOn = halfBitsOn << (wavefrontSize - 16); + high8BitsOn = halfBitsOn << (wavefrontSize - 8); + high4BitsOn = halfBitsOn << (wavefrontSize - 4); + allButOne = fullMask & ~1; +} template struct AtomicAddOp { __device__ T operator()(T* lhs, const T& rhs) { return atomicAdd(lhs, rhs); } }; @@ -80,9 +97,10 @@ __global__ void reduceAtomics(T* __restrict__ output, const T* __restrict__ inpu __syncthreads(); - uint lane = __lane_id(); + uint lane = (threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y) % + warpSize; - if (mask & (1ul << lane)) op(&result[numWarp], input[idx]); + if (mask & (1ull << lane)) op(&result[numWarp], input[idx]); __syncthreads(); @@ -94,8 +112,10 @@ __global__ void reduceOpSync(T* __restrict__ output, const T* __restrict__ input unsigned long long mask) { int idx = threadIdx.x + blockIdx.x * kBlockDim; T result; + int laneId = (threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y) % + warpSize; - if (mask & (1ul << __lane_id())) { + if (mask & (1ull << laneId)) { if constexpr (std::is_same, std::plus>::value) result = __reduce_add_sync(mask, input[idx]); else if constexpr (std::is_same, MinOp>::value) @@ -115,7 +135,7 @@ __global__ void reduceOpSync(T* __restrict__ output, const T* __restrict__ input } } -template +template __global__ void reduceCoop(T* __restrict__ output, const T* __restrict__ input) { namespace cg = cooperative_groups; @@ -126,14 +146,16 @@ __global__ void reduceCoop(T* __restrict__ output, const T* __restrict__ input) const int numTiles = warpSize / TileSize; int laneId = threadIdx.x % warpSize; - if constexpr (ExcludeFirst) { - if (laneId == 0) { - return; - } + if constexpr (aggType == AggregationType::Reduce) { + result = cg::reduce(mytile, input[idx], Functor()); + } else if constexpr (aggType == AggregationType::InclusiveScan) { + result = cg::inclusive_scan(mytile, input[idx], Functor()); + } else if constexpr (aggType == AggregationType::ExclusiveScan) { + result = cg::exclusive_scan(mytile, input[idx], Functor()); + } else { + static_assert(__hip_internal::is_void::value, "Unsupported aggType"); } - result = cg::reduce(mytile, input[idx], Functor()); - if (laneId / TileSize == numTiles - 1) { // it's always the higher order tile reduction the one that gets calculated output[idx / warpSize] = result; @@ -190,11 +212,15 @@ template class Op> class ReduceSyncBenchmark }; template class Op> -void checkResults(T* d_lhs, T* d_rhs, size_t numBytes, unsigned long long mask) { +void checkResults(T* d_lhs, T* d_rhs, + size_t numBytes, + unsigned long long mask, + AggregationType aggType) { using namespace Catch::Matchers; LinearAllocGuard h_lhs(LinearAllocs::malloc, numBytes); LinearAllocGuard h_rhs(LinearAllocs::malloc, numBytes); bool memcmpResult; + std::string opName = opToString>(); assert(numBytes % sizeof(T) == 0 && "numBytes needs to be a multiple of sizeof(T)"); HIP_CHECK(hipMemcpy(h_lhs.ptr(), d_lhs, numBytes, hipMemcpyDeviceToHost)); @@ -205,6 +231,11 @@ void checkResults(T* d_lhs, T* d_rhs, size_t numBytes, unsigned long long mask) for (int i = 0; i < numBytes / sizeof(T); i++) { auto& lhsResult = h_lhs.ptr()[i]; auto& rhsResult = h_rhs.ptr()[i]; + INFO(" mask: 0x" << std::hex << mask); + INFO(" index: " << i); + INFO(" operator: " << opName); + INFO(" when checking: " << aggregationTypeToStr(aggType)); + if constexpr (std::is_integral::value || std::is_same, MinOp>::value || std::is_same, MaxOp>::value) @@ -219,9 +250,15 @@ void checkResults(T* d_lhs, T* d_rhs, size_t numBytes, unsigned long long mask) // in this case, instead of using masks, the TileSize would define the mask // at compile time -template class Op> class CoopBenchmark - : public Benchmark> { +template class Op, AggregationType aggType> class CoopBenchmark + : public Benchmark> { public: + + CoopBenchmark() + { + initializeMasks(); + } + void operator()(T* output, const T* input, int numItems) { namespace cg = cooperative_groups; @@ -233,17 +270,17 @@ template class TIMED_SECTION(kTimerTypeEvent) { if constexpr (std::is_same, std::plus>::value) { - reduceCoop, T><<>>(output, input); + reduceCoop, T, aggType><<>>(output, input); } else if constexpr (std::is_same, MinOp>::value) { - reduceCoop, T><<>>(output, input); + reduceCoop, T, aggType><<>>(output, input); } else if constexpr (std::is_same, MaxOp>::value) { - reduceCoop, T><<>>(output, input); + reduceCoop, T, aggType><<>>(output, input); } else if constexpr (std::is_same, AndOp>::value) { - reduceCoop, T><<>>(output, input); + reduceCoop, T, aggType><<>>(output, input); } else if constexpr (std::is_same, OrOp>::value) { - reduceCoop, T><<>>(output, input); + reduceCoop, T, aggType><<>>(output, input); } else if constexpr (std::is_same, XorOp>::value) { - reduceCoop, T><<>>(output, input); + reduceCoop, T, aggType><<>>(output, input); } else { static_assert(std::is_void::value, "Unsupported operator"); } @@ -283,7 +320,47 @@ template <> struct HasAtomicOps { static constexpr bool value = false; }; +template class Op, AggregationType aggType> +void benchmarkCoop(LinearAllocGuard* d_outputCoop, + const LinearAllocGuard& d_input, + const std::map& masks, + int numItems) +{ + int wavefrontSize = getWarpSize(); + + for (const auto& mask : masks) { + printf("%s %llx\n", mask.first.c_str(), mask.second); + unsigned long long warpMask = wavefrontSize == 64? ~0ull : 0xFFFFFFFF; + + if (mask.second == (fullMask & warpMask)) { + if (wavefrontSize == 64) { + CoopBenchmark<64, T, Op, aggType> benchmark; + benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); + } else { + CoopBenchmark<32, T, Op, aggType> benchmark; + benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); + } + } else if (wavefrontSize == 64 && mask.second == (halfHighBitsOn & warpMask)) { + CoopBenchmark<32, T, Op, aggType> benchmark; + benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); + } else if (mask.second == (high16BitsOn & warpMask)) { + CoopBenchmark<16, T, Op, aggType> benchmark; + benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); + } else if (mask.second == (high8BitsOn & warpMask)) { + CoopBenchmark<8, T, Op, aggType> benchmark; + benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); + } else if (mask.second == (high4BitsOn & warpMask)) { + CoopBenchmark<4, T, Op, aggType> benchmark; + benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); + } + } +} template class Op> struct ReduceBenchmark { + ReduceBenchmark() + { + initializeMasks(); + } + void Run() { static constexpr int numMasks = 6; using distribution = typename DistributionType::type; @@ -302,12 +379,6 @@ template class Op> struct ReduceBenchmark { LinearAllocGuard* d_outputCoop = &d_outputsCoop[0]; std::mt19937_64 gen(123); distribution dist; - int halfWaveSize = wavefrontSize / 2; - unsigned long long halfBitsOn = (1ul << (wavefrontSize / 2)) - 1; - unsigned long long fullMask = -1ul, halfHighBitsOn = halfBitsOn << halfWaveSize, - high16BitsOn = halfBitsOn << (wavefrontSize - 16), - high8BitsOn = halfBitsOn << (wavefrontSize - 8), - high4BitsOn = halfBitsOn << (wavefrontSize - 4), allButOne = -1 & ~1; const char* typeStr = typeToString(); const char* opStr = opToString>(); std::map masks; @@ -372,56 +443,39 @@ template class Op> struct ReduceBenchmark { } printf("\n--- reduce cooperative groups %s %s--- \n", opStr, typeStr); - - for (const auto& mask : masks) { - printf("%s %llx\n", mask.first.c_str(), mask.second); - unsigned long warpMask = wavefrontSize == 64? ~0ull : 0xFFFFFFFF; - - if (mask.second == (fullMask & warpMask)) { - - if (wavefrontSize == 64) { - CoopBenchmark<64, false, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } else { - CoopBenchmark<32, false, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } - } else if (wavefrontSize == 64 && mask.second == (halfHighBitsOn & warpMask)) { - CoopBenchmark<32, false, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } else if (mask.second == (high16BitsOn & warpMask)) { - CoopBenchmark<16, false, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } else if (mask.second == (high8BitsOn & warpMask)) { - CoopBenchmark<8, false, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } else if (mask.second == (high4BitsOn & warpMask)) { - CoopBenchmark<4, false, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } else if (mask.second == (allButOne & warpMask)) { - if (wavefrontSize == 64) { - CoopBenchmark<64, true, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } else { - CoopBenchmark<32, true, T, Op> benchmarkCoop; - benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems); - } - } - } - + benchmarkCoop(d_outputCoop, d_input, masks, numItems); printf("\n"); if constexpr (HasAtomicOps::value) { printf("Checking results...\n"); for (const auto& mask : masks) { - checkResults(d_outputsAtomic[pos].ptr(), d_outputsReduce[pos].ptr(), outputNumBytes, - mask.second); - checkResults(d_outputsReduce[pos].ptr(), d_outputsCoop[pos].ptr(), outputNumBytes, - mask.second); + checkResults(d_outputsAtomic[pos].ptr(), + d_outputsReduce[pos].ptr(), + outputNumBytes, + mask.second, + AggregationType::Reduce); + + if (mask.second != allButOne) { + // allButOne is not supported in tiled cooperative groups; all the threads of the tile must + // be active + checkResults(d_outputsReduce[pos].ptr(), + d_outputsCoop[pos].ptr(), + outputNumBytes, + mask.second, + AggregationType::Reduce); + } pos++; } } + + printf("\n--- inclusive scan cooperative groups %s %s--- \n", opStr, typeStr); + benchmarkCoop(d_outputCoop, d_input, masks, numItems); + printf("\n"); + + printf("\n--- exclusive scan cooperative groups %s %s--- \n", opStr, typeStr); + benchmarkCoop(d_outputCoop, d_input, masks, numItems); + printf("\n"); } }; diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh index eead915c9eb..f7d5ce39f1a 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh +++ b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "../math/math_common.hh" namespace { diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc index c0231d48294..7dfb28033b4 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc @@ -570,14 +570,24 @@ void testReduceForTileSize() void* devicePtr = d_result.ptr(); void* args[] = { &devicePtr }; - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(simpleSum), gridDim, blockDim, args, 0, nullptr)); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(simpleSum), + gridDim, + blockDim, + args, + 0, + nullptr)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), h_result.size_bytes(), hipMemcpyDeviceToHost)); REQUIRE(*h_result.host_ptr() == TileSize); - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(simpleSumSubtiles), gridDim, blockDim, args, 0, nullptr)); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(simpleSumSubtiles), + gridDim, + blockDim, + args, + 0, + nullptr)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), @@ -601,10 +611,15 @@ HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Basic, int) } } +// @extraMasks when testing coalesced_threads, this can be use to simulate +// divergence template -void __global__ reduceKernel(T* output, const T* input) +void __global__ reduceKernel(T* output, + const T* input, + unsigned long long* extraMasks, + AggregationType* aggType) { - int tid = threadIdx.x; + int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; int laneId = tid % warpSize; cg::thread_block mygroup = cg::this_thread_block(); auto mytile = cg::tiled_partition(mygroup); @@ -612,16 +627,37 @@ void __global__ reduceKernel(T* output, const T* input) for (int i = 0; i < kNumReduces; i++) { int idx = warpSize * i + laneId; T& result = output[idx]; + unsigned long long mask = extraMasks[i]; - result = cg::reduce(mytile, input[idx], Functor()); + if ((1ull << laneId) & mask) { + switch (*aggType) { + case AggregationType::Reduce: + result = cg::reduce(mytile, input[idx], Functor()); + break; + case AggregationType::InclusiveScan: + result = cg::inclusive_scan(mytile, input[idx], Functor()); + break; + case AggregationType::ExclusiveScan: + result = cg::exclusive_scan(mytile, input[idx], Functor()); + break; + default: + assert(false && "Unsupported enumeration"); + } + + } else { + result = 0; + } } } // @extraMasks used to simulate divergence when using coalesced_threads template -void __global__ reduceKernelCoalesced(T* output, const T* input, unsigned long long* extraMasks) +void __global__ reduceKernelCoalesced(T* output, + const T* input, + unsigned long long* extraMasks, + AggregationType* aggType) { - int tid = threadIdx.x; + int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; int laneId = tid % warpSize; for (int i = 0; i < kNumReduces; i++) { @@ -632,7 +668,19 @@ void __global__ reduceKernelCoalesced(T* output, const T* input, unsigned long l if ((1ull << laneId) & mask) { auto coalesced = cg::coalesced_threads(); - result = cg::reduce(coalesced, input[idx], Functor()); + switch (*aggType) { + case AggregationType::Reduce: + result = cg::reduce(coalesced, input[idx], Functor()); + break; + case AggregationType::InclusiveScan: + result = cg::inclusive_scan(coalesced, input[idx], Functor()); + break; + case AggregationType::ExclusiveScan: + result = cg::exclusive_scan(coalesced, input[idx], Functor()); + break; + default: + assert(false && "Unsupported enumeration"); + } } else { result = 0; } @@ -641,7 +689,8 @@ void __global__ reduceKernelCoalesced(T* output, const T* input, unsigned long l // @TileSize the tile size or 0 when testing coalesced groups template -void reduceForTypeAndOp() +void aggregateForTypeAndOp(AggregationType aggType, + dim3 blockDim) { using distribution = typename DistributionType::type; int wavefrontSize = getWarpSize(); @@ -654,18 +703,20 @@ void reduceForTypeAndOp() kNumReduces * sizeof(unsigned long long)); LinearAllocGuard h_extraMasks(LinearAllocs::malloc, kNumReduces * sizeof(unsigned long long)); + LinearAllocGuard d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType)); std::mt19937_64 gen(Catch::rngSeed()); dim3 gridDim = { 1 }; - dim3 blockDim = { static_cast(wavefrontSize) }; hipError_t status; typename distribution::result_type a = std::is_same::value? std::numeric_limits::lowest() : (std::is_signed::value? -1023 : 0); typename distribution::result_type b = std::is_same::value? std::numeric_limits::max() : 1023; - distribution distInput(a, b); - int numReduce = 0; + distribution distInput {a, b}; + int numAggregation = 0; void* kernelPtr; + T expected[64]; + HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice)); genRandomBuffers(d_input, h_input, distInput, gen, kNumReduces * wavefrontSize); if (TileSize) { @@ -680,7 +731,7 @@ void reduceForTypeAndOp() kNumReduces); } - std::array devicePtrs = { d_result.ptr(), d_input.ptr(), d_extraMasks.ptr() }; + std::array devicePtrs = { d_result.ptr(), d_input.ptr(), d_extraMasks.ptr(), d_aggType.ptr() }; void* args[devicePtrs.size()]; for (int i = 0; i < devicePtrs.size(); i++) { @@ -705,96 +756,123 @@ void reduceForTypeAndOp() HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), h_result.size_bytes(), hipMemcpyDeviceToHost)); - while (numReduce < kNumReduces) { + while (numAggregation < kNumReduces) { for (int laneId = 0; laneId < wavefrontSize; laneId++) { + T result = h_result.host_ptr()[numAggregation * wavefrontSize + laneId]; + int lastLane; + const T* input = &h_input.host_ptr()[numAggregation * wavefrontSize]; unsigned long long mask = ~0ull; - T result = h_result.host_ptr()[numReduce * wavefrontSize + laneId], expected = 0; - const T* input = &h_input.host_ptr()[numReduce * wavefrontSize]; + Op op {}; + std::string opName = opToString(); + int resultLane; if constexpr (TileSize > 0) { mask >>= (64 - TileSize); mask <<= ((laneId % wavefrontSize) / TileSize) * TileSize; } - mask &= h_extraMasks.host_ptr()[numReduce]; + mask &= h_extraMasks.host_ptr()[numAggregation]; + lastLane = 64 - __builtin_clzll(mask) - 1; - if ((1ull << laneId) & mask) { - Op op {}; - expected = calculateExpected(input, op, mask); + if (laneId > lastLane) { + continue; } - if constexpr (std::is_integral::value) { - // for integral types the result should match exactly - if (result != expected) { - std::string opName = opToString(); - printMismatch(result, expected, input, mask); - INFO("Operator: " << opName); - REQUIRE(result == expected); - } + calculateExpected(expected, input, op, mask, aggType); + + if (aggType == AggregationType::Reduce) { + resultLane = lastLane; } else { - compareFloatingPoint(result, expected, mask, h_input.host_ptr()); + resultLane = laneId; + } + + if ((1ull << laneId) & mask) { + if constexpr (std::is_integral::value) { + // for integral types the result should match exactly + // for reduce, the result would be in the last lane whose first bit is on in the mask + // for scans, the associated result is different in each lane + if (result != expected[resultLane]) { + INFO("Aggregation type: " << aggregationTypeToStr(aggType)); + printMismatch(result, expected[resultLane], input, mask, laneId); + INFO("Operator: " << opName << " mask: 0x" << std::hex << mask); + REQUIRE(result == expected[resultLane]); + } + } else { + compareFloatingPoint(result, + expected[resultLane], + mask, + h_input.host_ptr(), + laneId); + } } } - numReduce++; + numAggregation++; } } template -void reduceCoopTiles(const std::index_sequence<>) +void aggregateCoopTiles(AggregationType, dim3, const std::index_sequence<>) { } template -void reduceCoopTiles(const std::index_sequence) +void aggregateCoopTiles(AggregationType aggType, dim3 blockDim, const std::index_sequence) { const std::index_sequence remainingTiles; - reduceForTypeAndOp(); - reduceCoopTiles(remainingTiles); + aggregateForTypeAndOp(aggType, blockDim); + aggregateCoopTiles(aggType, blockDim, remainingTiles); } -template -void runReduceRandomForType() +template +void runAggregationRandomForType(AggregationType aggType, dim3 blockDim) { if constexpr (Coalesced) { - reduceForTypeAndOp<0, Op, T>(); - } else if constexpr (WarpSize <= 32) { + aggregateForTypeAndOp<0, Op, T>(aggType, blockDim); + } else if constexpr (MaxTileSize <= 4) { + std::index_sequence<1, 2, 4> tileSizes; + aggregateCoopTiles(aggType, blockDim, tileSizes); + } else if constexpr (MaxTileSize <= 32) { std::index_sequence<1, 2, 4, 8, 16, 32> tileSizes; - reduceCoopTiles(tileSizes); + aggregateCoopTiles(aggType, blockDim, tileSizes); } else { std::index_sequence<1, 2, 4, 8, 16, 32, 64> tileSizes; - reduceCoopTiles(tileSizes); + aggregateCoopTiles(aggType, blockDim, tileSizes); } } template -void runReduceRandomForOps(const std::tuple<>) +void runAggregationRandomForOps(AggregationType aggType, const std::tuple<>) { } template -void runReduceRandomForOps(const std::tuple) +void runAggregationRandomForOps(AggregationType aggType, const std::tuple) { const std::tuple remainingOps; + int wavefrontSize = getWarpSize(); + dim3 blockDim = {static_cast(wavefrontSize)}; - runReduceRandomForType(); - runReduceRandomForOps(remainingOps); + runAggregationRandomForType(aggType, blockDim); + runAggregationRandomForOps(aggType, remainingOps); } // for all the tile sizes and all input types, using random input values, calculates the reduce() // values. Additionally, randomly make some threads not participate for the coalesced_threads case HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Random_arithmetic, int, unsigned int, long long, - unsigned long long, float, half, double) + unsigned long long, float, half, double) { std::tuple, cooperative_groups::less, cooperative_groups::greater> types; + int wavefrontSize = getWarpSize(); + dim3 blockDim = {static_cast(wavefrontSize)}; - if (getWarpSize() == 32) { - runReduceRandomForOps(types); + if (wavefrontSize == 32) { + runAggregationRandomForOps(AggregationType::Reduce, types); } else { - runReduceRandomForOps(types); + runAggregationRandomForOps(AggregationType::Reduce, types); } } @@ -806,22 +884,33 @@ HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Random_boolean, int, unsign cooperative_groups::bit_xor> types; if (getWarpSize() == 32) { - runReduceRandomForOps(types); + runAggregationRandomForOps(AggregationType::Reduce, types); } else { - runReduceRandomForOps(types); + runAggregationRandomForOps(AggregationType::Reduce, types); } } // passes a custom operator to cooperative_groups::reduce() HIP_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Custom_Op) { - if (getWarpSize() == 32) { - runReduceRandomForType, int, 32>(); + int wavefrontSize = getWarpSize(); + + dim3 blockDim = {static_cast(wavefrontSize)}; + if (wavefrontSize == 32) { + runAggregationRandomForType, int, 32>(AggregationType::Reduce, blockDim); } else { - runReduceRandomForType, int, 64>(); + runAggregationRandomForType, int, 64>(AggregationType::Reduce, blockDim); } } +HIP_TEST_CASE(Unit_Thread_Block_Tile_Scan_Custom_Op) +{ + dim3 blockDim = {static_cast(getWarpSize())}; + // only using 4 threads to avoid long long overflows + runAggregationRandomForType, long long, 4>( + AggregationType::InclusiveScan, blockDim); +} + struct Vector { int x; int y; @@ -838,7 +927,7 @@ struct MaxMagnitude { } }; -void __global__ maxMagnitude(Vector* result) +void __global__ maxMagnitude(Vector* result, AggregationType* aggregationType) { cg::thread_block mygroup = cg::this_thread_block(); auto mytile = cg::tiled_partition<4>(mygroup); @@ -848,27 +937,102 @@ void __global__ maxMagnitude(Vector* result) { 0, 7 }, { 4, 1} }; - *result = cg::reduce(mytile, input[threadIdx.x], op); + switch (*aggregationType) { + case AggregationType::Reduce: + result[threadIdx.x] = cg::reduce(mytile, input[threadIdx.x], op); + break; + case AggregationType::InclusiveScan: + result[threadIdx.x] = cg::inclusive_scan(mytile, input[threadIdx.x], op); + break; + case AggregationType::ExclusiveScan: + result[threadIdx.x] = cg::exclusive_scan(mytile, input[threadIdx.x], op); + break; + default: + assert(false && "Unexpected aggType"); + } } // tests that we can pass trivially copyable structs as values to reduce HIP_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Trivially_Copyable_Parameters) { - LinearAllocGuard h_result(LinearAllocs::malloc, sizeof(Vector)); - LinearAllocGuard d_result(LinearAllocs::hipMalloc, sizeof(Vector)); dim3 gridDim = { 1 }; dim3 blockDim = { 4 }; - void* devicePtr = d_result.ptr(); - void* args[] = { &devicePtr }; + LinearAllocGuard h_result(LinearAllocs::malloc, sizeof(Vector) * blockDim.x); + LinearAllocGuard d_result(LinearAllocs::hipMalloc, sizeof(Vector) * blockDim.x); + LinearAllocGuard d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType)); + std::array devicePtrs = { d_result.ptr(), d_aggType.ptr() }; + void* args[devicePtrs.size()]; Vector* result; + AggregationType aggType = AggregationType::Reduce; + for (int i = 0; i < devicePtrs.size(); i++) { + args[i] = &devicePtrs[i]; + } + + HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice)); HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(maxMagnitude), gridDim, blockDim, args, 0, nullptr)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), h_result.size_bytes(), hipMemcpyDeviceToHost)); - result = &h_result.host_ptr()[0]; - REQUIRE((result->x == 1 && result->y == 9)); + + for (unsigned int idx = 0; idx < blockDim.x; idx++) { + result = &h_result.host_ptr()[idx]; + REQUIRE((result->x == 1 && result->y == 9)); + } +} + +TEST_CASE(Unit_Thread_Block_Tile_Scan_Trivially_Copyable_Parameters) +{ + dim3 gridDim = { 1 }; + dim3 blockDim = { 4 }; + LinearAllocGuard h_result(LinearAllocs::malloc, sizeof(Vector) * blockDim.x); + LinearAllocGuard d_result(LinearAllocs::hipMalloc, sizeof(Vector) * blockDim.x); + LinearAllocGuard d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType)); + std::array devicePtrs = { d_result.ptr(), d_aggType.ptr() }; + void* args[devicePtrs.size()]; + Vector* result; + AggregationType aggType; + + for (int i = 0; i < devicePtrs.size(); i++) { + args[i] = &devicePtrs[i]; + } + + SECTION("inclusive") { + aggType = AggregationType::InclusiveScan; + HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice)); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(maxMagnitude), gridDim, blockDim, args, 0, nullptr)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), + h_result.size_bytes(), hipMemcpyDeviceToHost)); + result = &h_result.host_ptr()[0]; + REQUIRE((result->x == 2 && result->y == 3)); + result = &h_result.host_ptr()[1]; + REQUIRE((result->x == 1 && result->y == 9)); + result = &h_result.host_ptr()[2]; + REQUIRE((result->x == 1 && result->y == 9)); + result = &h_result.host_ptr()[3]; + REQUIRE((result->x == 1 && result->y == 9)); + } + + SECTION("exclusive") { + aggType = AggregationType::ExclusiveScan; + HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice)); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(maxMagnitude), gridDim, blockDim, args, 0, nullptr)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), + h_result.size_bytes(), hipMemcpyDeviceToHost)); + result = &h_result.host_ptr()[0]; + REQUIRE((result->x == 0 && result->y == 0)); + result = &h_result.host_ptr()[1]; + REQUIRE((result->x == 2 && result->y == 3)); + result = &h_result.host_ptr()[2]; + REQUIRE((result->x == 1 && result->y == 9)); + result = &h_result.host_ptr()[3]; + REQUIRE((result->x == 1 && result->y == 9)); + } } template @@ -897,9 +1061,7 @@ struct Max { ArrayContainer result; for (int i = 0; i < NumElems; i++) { - for (int i = 0; i < NumElems; i++) { result[i] = std::max(lhs[i], rhs[i]); - } } return result; @@ -916,13 +1078,50 @@ __global__ void applyFunctor(ArrayContainer* result) if (threadIdx.x < NumElems) { input[threadIdx.x] = threadIdx.x; - __syncwarp(); + } + + mytile.sync(); + + if (threadIdx.x < NumElems) { *result = cg::reduce(mytile, input, op); } } +template +__global__ void applyScanFunctor(ArrayContainer** result, + AggregationType* aggType) +{ + cg::thread_block mygroup = cg::this_thread_block(); + auto mytile = cg::tiled_partition(mygroup); + __shared__ ArrayContainer input; + Functor op; + unsigned int tid = threadIdx.x + + threadIdx.y * blockDim.x + + threadIdx.z * blockDim.x * blockDim.y; + + if (threadIdx.x < NumElems) { + input[threadIdx.x] = threadIdx.x; + } + + mytile.sync(); + + if (threadIdx.x < NumElems) { + switch (*aggType) { + case AggregationType::InclusiveScan: + *(result[tid]) = cg::inclusive_scan(mytile, input, op); + break; + case AggregationType::ExclusiveScan: + *(result[tid]) = cg::exclusive_scan(mytile, input, op); + break; + default: + assert(false && "AggregationType not supported"); + } + } +} + +// tests aggregations of arguments of different sizes (types <= 32 bytes are accepted) template class Functor> -void testReduceSizes() +void testArgsDifferentSizesReduce() { LinearAllocGuard> h_result(LinearAllocs::malloc, sizeof(ArrayContainer)); LinearAllocGuard> d_result(LinearAllocs::hipMalloc, sizeof(ArrayContainer)); @@ -932,7 +1131,8 @@ void testReduceSizes() void* args[] = { &devicePtr }; ArrayContainer* result; - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(applyFunctor>), gridDim, blockDim, args, 0, nullptr)); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(applyFunctor>), + gridDim, blockDim, args, 0, nullptr)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), @@ -953,7 +1153,101 @@ void testReduceSizes() } if constexpr (NumElems > 1) { - testReduceSizes(); + testArgsDifferentSizesReduce(); + } +} + +// in this case, as opposed to the reduction, were we were only saving one result per reduction, +// we save one result per lane. i.e. we save one ArrayContainer per lane, as each lane will return +// a different ArrayContainer value +template class Functor> +void testArgsDifferentSizesScan(AggregationType aggType) +{ + int wavefrontSize = getWarpSize(); + // one per lane + LinearAllocGuard> h_result[64]; + LinearAllocGuard> d_result[64]; + + // as we cannot pass d_result directly (because it is an array of + // LinearAllocGuards), we convert to an array of raw device pointers + ArrayContainer* h_devicePtrs[64]; + LinearAllocGuard*> d_devicePtrs(LinearAllocs::hipMalloc, sizeof(void*) * 64); + LinearAllocGuard d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType)); + dim3 gridDim = { 1 }; + dim3 blockDim = { 32 }; + ArrayContainer* result; + + for (int i = 0; i < wavefrontSize; i++) { + h_result[i] = LinearAllocGuard>(LinearAllocs::malloc, + sizeof(ArrayContainer)); + d_result[i] = LinearAllocGuard>(LinearAllocs::hipMalloc, + sizeof(ArrayContainer)); + } + + for (int i = 0; i < wavefrontSize; i++) { + h_devicePtrs[i] = d_result[i].ptr(); + } + + HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_devicePtrs.ptr(), + h_devicePtrs, + d_devicePtrs.size_bytes(), + hipMemcpyHostToDevice)); + + std::array devicePtrs = { d_devicePtrs.ptr(), d_aggType.ptr() }; + void* args[devicePtrs.size()]; + + for (int i = 0; i < devicePtrs.size(); i++) { + args[i] = &devicePtrs[i]; + } + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(applyScanFunctor>), + gridDim, blockDim, args, 0, nullptr)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipGetLastError()); + + for (int i = 0; i < wavefrontSize; i++) { + HIP_CHECK(hipMemcpy(h_result[i].host_ptr(), d_result[i].ptr(), + h_result[i].size_bytes(), hipMemcpyDeviceToHost)); + } + + INFO("T is of size: " << NumElems); + + for (int laneId = 0; laneId < NumElems; laneId++) { + result = &(h_result[laneId].host_ptr()[0]); + + if (aggType == AggregationType::InclusiveScan) { + INFO("Lane: " << laneId); + + if (std::is_same, Sum>::value) { + // the result can be calculated with an arithmetic series formula, modulo 256 + // (we do overflow unsigned char for some indices, but that is defined behaviour) + REQUIRE((*result)[laneId] == (((laneId + 1) * (laneId + laneId) / 2) % 256)); + } else { + REQUIRE((*result)[laneId] == laneId); + } + } else if (aggType == AggregationType::ExclusiveScan) { + if (std::is_same, Sum>::value) { + INFO("Lane: " << laneId); + + // the result can be calculated with an arithmetic series formula, modulo 256 + // (we do overflow unsigned char for some indices, but that is defined behaviour) + if (laneId == 0) { + REQUIRE((*result)[laneId] == 0); + } else { + REQUIRE((*result)[laneId] == (((laneId) * (laneId + laneId) / 2) % 256)); + } + } else { + if (laneId == 0) { + REQUIRE((*result)[laneId] == 0); + } else { + REQUIRE((*result)[laneId] == laneId); + } + } + } + } + + if constexpr (NumElems > 1) { + testArgsDifferentSizesScan(aggType); } } @@ -962,11 +1256,24 @@ void testReduceSizes() HIP_TEST_CASE(Unit_Thread_Block_Tile_Reduce_All_Parameter_Sizes) { SECTION("sum") { - testReduceSizes<32, Sum>(); + testArgsDifferentSizesReduce<32, Sum>(); + } + + SECTION("max") { + testArgsDifferentSizesReduce<32, Max>(); + } +} + +TEST_CASE(Unit_Thread_Block_Tile_Scan_All_Parameter_Sizes) +{ + SECTION("sum") { + testArgsDifferentSizesScan<32, Sum>(AggregationType::InclusiveScan); + testArgsDifferentSizesScan<32, Sum>(AggregationType::ExclusiveScan); } SECTION("max") { - testReduceSizes<32, Max>(); + testArgsDifferentSizesScan<32, Max>(AggregationType::InclusiveScan); + testArgsDifferentSizesScan<32, Max>(AggregationType::ExclusiveScan); } } @@ -1026,24 +1333,463 @@ HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Reduce_arithmetic, int, unsig cooperative_groups::greater> ops; if (getWarpSize() == 32) { - runReduceRandomForOps(ops); + runAggregationRandomForOps(AggregationType::Reduce, ops); } else { - runReduceRandomForOps(ops); + runAggregationRandomForOps(AggregationType::Reduce, ops); } } - HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Reduce_boolean, int, unsigned int, long long, unsigned long long) { std::tuple, cooperative_groups::bit_or, cooperative_groups::bit_xor> ops; if (getWarpSize() == 32) { - runReduceRandomForOps(ops); + runAggregationRandomForOps(AggregationType::Reduce, ops); } else { - runReduceRandomForOps(ops); + runAggregationRandomForOps(AggregationType::Reduce, ops); + } +} + +template +void __global__ simpleScan(T* result) +{ + T value = threadIdx.x; + cg::thread_block mygroup = cg::this_thread_block(); + auto mytile = cg::tiled_partition(mygroup); + Op op; + + if constexpr (AggType == AggregationType::InclusiveScan) { + result[threadIdx.x] = cg::inclusive_scan(mytile, value, op); + } else if constexpr (AggType == AggregationType::ExclusiveScan) { + result[threadIdx.x] = cg::exclusive_scan(mytile, value, op); + } else if constexpr (AggType == AggregationType::InclusiveScanDefault) { + result[threadIdx.x] = cg::inclusive_scan(mytile, value); + } else if constexpr (AggType == AggregationType::ExclusiveScanDefault) { + result[threadIdx.x] = cg::exclusive_scan(mytile, value); + } else { + assert(false && "Unexpected aggType"); + } +} + +/// @tparam Op either std::plus or std::less +template +void testScanForTileSize() +{ + LinearAllocGuard h_result(LinearAllocs::malloc, sizeof(T) * getWarpSize()); + LinearAllocGuard d_result(LinearAllocs::hipMalloc, h_result.size_bytes()); + dim3 gridDim = { 1 }; + dim3 blockDim = { static_cast(getWarpSize()) }; + T id = 0; + T accum = 0; + int pos = 0; + LinearAllocGuard d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType)); + std::array devicePtrs = { d_result.ptr(), d_aggType.ptr() }; + void* args[devicePtrs.size()]; + AggregationType aggType = AggType; + + if (!isInclusive(aggType)) { + scanIdentity(id); } -} + + accum = id; + + for (int i = 0; i < devicePtrs.size(); i++) { + args[i] = &devicePtrs[i]; + } + + HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(AggType), hipMemcpyHostToDevice)); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(simpleScan), + gridDim, + blockDim, + args, + 0, + nullptr)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), + h_result.size_bytes(), hipMemcpyDeviceToHost)); + + while (pos < getWarpSize()) { + for (int i = 0; i < TileSize; i++) { + UNSCOPED_INFO("Index: " << pos + i << " tile size: " << TileSize); + + if (isInclusive(aggType)) { + accum += pos + i; + INFO("Inclusive scan"); + REQUIRE(h_result.host_ptr()[pos + i] == accum); + } else { + using ComparisonType = typename std::conditional::value, float, T>::type; + ComparisonType result, expected; + + INFO("Exclusive scan"); + + if constexpr (std::is_same::value) { + // Catch2 cannot print fp16 if there is an error + result = __half2float(h_result.host_ptr()[pos + i]); + expected = __half2float(__half2float(accum)); + } else { + result = h_result.host_ptr()[pos + i]; + expected = accum; + } + + INFO("Lane: " << (pos + i)); + REQUIRE(result == expected); + + if (i == 0) { + accum = id; + } + + if constexpr (std::is_same>::value) { + accum = std::min(__half2float(accum), __half2float(pos + i)); + } else { + accum += pos + i; + } + } + } + + accum = id; + pos += TileSize; + } +} + +TEST_CASE(Unit_Thread_Block_Tile_Inclusive_Scan_Basic) +{ + using Op = cooperative_groups::plus; + static constexpr AggregationType AggType = AggregationType::InclusiveScan; + + testScanForTileSize<1, AggType, Op, int>(); + testScanForTileSize<2, AggType, Op, int>(); + testScanForTileSize<4, AggType, Op, int>(); + testScanForTileSize<8, AggType, Op, int>(); + testScanForTileSize<16, AggType, Op, int>(); + testScanForTileSize<32, AggType, Op, int>(); + testScanForTileSize<32, AggregationType::InclusiveScanDefault, Op, int>(); + + if (getWarpSize() == 64) { + testScanForTileSize<64, AggType, Op, int>(); + } +} + +TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Exclusive_Scan_Basic, int, half) +{ + static constexpr AggregationType AggType = AggregationType::ExclusiveScan; + + SECTION("plus") { + using Op = cooperative_groups::plus; + testScanForTileSize<1, AggType, Op, TestType>(); + testScanForTileSize<2, AggType, Op, TestType>(); + testScanForTileSize<4, AggType, Op, TestType>(); + testScanForTileSize<8, AggType, Op, TestType>(); + testScanForTileSize<16, AggType, Op, TestType>(); + testScanForTileSize<32, AggType, Op, TestType>(); + testScanForTileSize<32, AggregationType::ExclusiveScanDefault, Op, int>(); + + if (getWarpSize() == 64) { + testScanForTileSize<64, AggType, Op, TestType>(); + } + } + + SECTION("less") { + using Op = cooperative_groups::less; + testScanForTileSize<1, AggType, Op, TestType>(); + testScanForTileSize<2, AggType, Op, TestType>(); + testScanForTileSize<4, AggType, Op, TestType>(); + testScanForTileSize<8, AggType, Op, TestType>(); + testScanForTileSize<16, AggType, Op, TestType>(); + testScanForTileSize<32, AggType, Op, TestType>(); + + if (getWarpSize() == 64) { + testScanForTileSize<64, AggType, Op, TestType>(); + } + } +} + +// for all the tile sizes and all input types, using random input values, calculates the scan +// values. Additionally, randomly make some threads not participate for the coalesced_threads case +TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Scan_Random_arithmetic, int, unsigned int, long long, + unsigned long long, float, half, double) +{ + std::tuple, + cooperative_groups::less, + cooperative_groups::greater> types; + + SECTION("inclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::InclusiveScan, types); + } else { + runAggregationRandomForOps(AggregationType::InclusiveScan, types); + } + } + + SECTION("exclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::ExclusiveScan, types); + } else { + runAggregationRandomForOps(AggregationType::ExclusiveScan, types); + } + } +} + +TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Scan_Random_boolean, int, unsigned int, long long, + unsigned long long) +{ + std::tuple, + cooperative_groups::bit_or, + cooperative_groups::bit_xor> types; + + SECTION("inclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::InclusiveScan, types); + } else { + runAggregationRandomForOps(AggregationType::InclusiveScan, types); + } + } + + SECTION("exclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::ExclusiveScan, types); + } else { + runAggregationRandomForOps(AggregationType::ExclusiveScan, types); + } + } +} + +// make sures that tiled blocks that use the y or z dimension work correctly +TEST_CASE(Unit_Thread_Block_Tile_2D_3D_Blocks) +{ + int wavefrontSize = getWarpSize(); + + SECTION("2D") { + dim3 blockDim {1, 2, static_cast(wavefrontSize / 2)}; + + if (wavefrontSize == 32) { + runAggregationRandomForType, int, 32>(AggregationType::Reduce, blockDim); + runAggregationRandomForType, int, 32>(AggregationType::InclusiveScan, blockDim); + runAggregationRandomForType, int, 32>(AggregationType::ExclusiveScan, blockDim); + } else { + runAggregationRandomForType, int, 64>(AggregationType::Reduce, blockDim); + runAggregationRandomForType, int, 64>(AggregationType::InclusiveScan, blockDim); + runAggregationRandomForType, int, 64>(AggregationType::ExclusiveScan, blockDim); + } + } + + SECTION("3D") { + dim3 blockDim {static_cast(wavefrontSize) / 2, 2, 1}; + + if (wavefrontSize == 32) { + runAggregationRandomForType, int, 32>(AggregationType::Reduce, blockDim); + runAggregationRandomForType, int, 32>(AggregationType::InclusiveScan, blockDim); + runAggregationRandomForType, int, 32>(AggregationType::ExclusiveScan, blockDim); + } else { + runAggregationRandomForType, int, 64>(AggregationType::Reduce, blockDim); + runAggregationRandomForType, int, 64>(AggregationType::InclusiveScan, blockDim); + runAggregationRandomForType, int, 64>(AggregationType::ExclusiveScan, blockDim); + } + } +} + +TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Scan_arithmetic, int, unsigned int, long long, + unsigned long long, float, half, double) +{ + std::tuple, + cooperative_groups::less, + cooperative_groups::greater> ops; + + SECTION("inclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::InclusiveScan, ops); + } else { + runAggregationRandomForOps(AggregationType::InclusiveScan, ops); + } + } + + SECTION("exclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::ExclusiveScan, ops); + } else { + runAggregationRandomForOps(AggregationType::ExclusiveScan, ops); + } + } +} + +TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Scan_boolean, int, unsigned int, long long, + unsigned long long) +{ + std::tuple, + cooperative_groups::bit_or, + cooperative_groups::bit_xor> ops; + + SECTION("inclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::InclusiveScan, ops); + } else { + runAggregationRandomForOps(AggregationType::InclusiveScan, ops); + } + } + + SECTION("exclusive") { + if (getWarpSize() == 32) { + runAggregationRandomForOps(AggregationType::ExclusiveScan, ops); + } else { + runAggregationRandomForOps(AggregationType::ExclusiveScan, ops); + } + } +} + +void __global__ binaryPartitionCoalesced(int* out, int* ranks) +{ + if (threadIdx.x >= warpSize / 2) { + // this group will contain the upper part of the threads + auto coalesced = cg::coalesced_threads(); + + // this group is subsequently split in two: on even and odd indexes + auto partitioned = cg::binary_partition(coalesced, threadIdx.x % 2); + + ranks[threadIdx.x] = partitioned.thread_rank(); + out[threadIdx.x] = cg::inclusive_scan(partitioned, threadIdx.x); + } else { + ranks[threadIdx.x] = -1; + out[threadIdx.x] = -1; + } +} + +template +void __global__ binaryPartitionTiled(int* out, int* ranks) +{ + cg::thread_block mygroup = cg::this_thread_block(); + auto tile = cg::tiled_partition(mygroup); + + if (tile.meta_group_rank() == 1) { + auto partitioned = cg::binary_partition(tile, threadIdx.x % 2); + ranks[threadIdx.x] = partitioned.thread_rank(); + out[threadIdx.x] = cg::inclusive_scan(partitioned, threadIdx.x); + } else { + ranks[threadIdx.x] = -1; + out[threadIdx.x] = -1; + } +} + +TEST_CASE(Unit_Thread_Block_Scan_partition) +{ + int wavefrontSize = getWarpSize(); + LinearAllocGuard h_result(LinearAllocs::malloc, sizeof(int) * wavefrontSize); + LinearAllocGuard d_result(LinearAllocs::hipMalloc, h_result.size_bytes()); + LinearAllocGuard h_ranks(LinearAllocs::malloc, sizeof(int) * wavefrontSize); + LinearAllocGuard d_ranks(LinearAllocs::hipMalloc, h_ranks.size_bytes()); + dim3 gridDim = { 1 }; + dim3 blockDim = { static_cast(getWarpSize()) }; + void* resultsPtr = d_result.ptr(); + void* ranksPtr = d_ranks.ptr(); + void* args[] = { &resultsPtr, &ranksPtr }; + auto checkResults = [&]() { + int accumEven = 0; + int accumOdd = 0; + + for (int laneId = 0; laneId < getWarpSize(); laneId++) { + if (laneId >= wavefrontSize / 2) { + if (laneId % 2 == 0) { + accumEven += laneId; + } else { + accumOdd += laneId; + } + + INFO("laneId: " << laneId); + REQUIRE(h_ranks.host_ptr()[laneId] == (laneId - wavefrontSize / 2) / 2); + REQUIRE(h_result.host_ptr()[laneId] == (laneId % 2 ? accumOdd : accumEven)); + } else { + INFO("laneId: " << laneId); + REQUIRE(h_ranks.host_ptr()[laneId] == -1); + REQUIRE(h_result.host_ptr()[laneId] == -1); + } + } + }; + + // the result of both sections must be the same; in one we use coalesced_threads to sub-divide + // into higher-order and lower-order threads, in another we just use cg::tiled_partition to do the + // equivalent + SECTION("coalesced") { + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(binaryPartitionCoalesced), + gridDim, + blockDim, + args, + 0, + nullptr)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(h_ranks.host_ptr(), d_ranks.ptr(), + h_ranks.size_bytes(), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), + h_result.size_bytes(), hipMemcpyDeviceToHost)); + checkResults(); + } + + SECTION("tiled") { + void* kernelPtr = reinterpret_cast(wavefrontSize == 32? + binaryPartitionTiled<32> : binaryPartitionTiled<64>); + HIP_CHECK(hipLaunchCooperativeKernel(kernelPtr, + gridDim, + blockDim, + args, + 0, + nullptr)); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(h_ranks.host_ptr(), d_ranks.ptr(), + h_ranks.size_bytes(), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), + h_result.size_bytes(), hipMemcpyDeviceToHost)); + checkResults(); + } +} + +__global__ void multiDimReduce(int* output) +{ + cg::thread_block mygroup = cg::this_thread_block(); + auto mytile = cg::tiled_partition<16>(mygroup); + int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + + output[tid] = cooperative_groups::reduce(mytile, tid, cooperative_groups::plus()); +} + +// test tiles reduces on multidimensional blocks. Basically splits the 128 threads in the block +// into 16 threads tiles, each one contributing the tid to their reduces +HIP_TEST_CASE(Unit_Thread_Block_Tile_Multi_Dimensional_Reduce) +{ + LinearAllocGuard h_result(LinearAllocs::malloc, sizeof(int) * 128); + LinearAllocGuard d_result(LinearAllocs::hipMalloc, h_result.size_bytes()); + dim3 gridDim = { 1 }; + // use a block size bigger than the warp size + dim3 blockDim = { 16, 4, 2 }; + void* devicePtr = d_result.ptr(); + void* args[1] = { &devicePtr }; + hipError_t status; + + status = hipLaunchCooperativeKernel(multiDimReduce, + gridDim, + blockDim, + args, + 0, + nullptr); + HIP_CHECK(status); + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(), + h_result.size_bytes(), hipMemcpyDeviceToHost)); + + for (int row = 0; row < 8; row++) { + int rowResult = 0; + + for (int index = 0; index < 16; index++) { + rowResult += row * 16 + index; + } + + for (int index = 0; index < 16; index++) { + INFO("row: " << row << " index: " << index); + REQUIRE(h_result.host_ptr()[row * 16 + index] == rowResult); + } + } +} /** * End doxygen group DeviceLanguageTest. diff --git a/projects/hip-tests/catch/unit/rtc/rtc_coop.cc b/projects/hip-tests/catch/unit/rtc/rtc_coop.cc index 364ed259714..5c55a057676 100644 --- a/projects/hip-tests/catch/unit/rtc/rtc_coop.cc +++ b/projects/hip-tests/catch/unit/rtc/rtc_coop.cc @@ -34,7 +34,7 @@ void compileProgram(hiprtcProgram& prog, const std::tuple&) { std::string expression; std::tuple remainingTypes; - expression = std::string("reduceCoopKernel<") + + expression = std::string("aggregationCoopKernel<") + functorToString() + ", " + typeToString() + @@ -64,7 +64,7 @@ void compileProgram(hiprtcProgram& prog, const std::tuple<>&) { } template class Op> -void runReduce(hiprtcProgram& prog) { +void runAggregation(hiprtcProgram& prog, AggregationType aggType) { using distribution = typename DistributionType::type; static constexpr std::array tileSizes = {1, 2, 4, 8, 16, 32, 64}; @@ -76,6 +76,7 @@ void runReduce(hiprtcProgram& prog) { LinearAllocGuard input(LinearAllocs::malloc, d_input.size_bytes()); LinearAllocGuard d_output(LinearAllocs::hipMalloc, wavefrontSize * sizeof(T) * tileSizes.size()); LinearAllocGuard output(LinearAllocs::malloc, d_output.size_bytes()); + LinearAllocGuard d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType)); std::mt19937_64 gen(Catch::rngSeed()); // for float16, we generate any random unsigned short, but cap the exponent later on // to keep it in the range (-8.0..8.0) (just to avoid overflows) @@ -84,15 +85,16 @@ void runReduce(hiprtcProgram& prog) { T b = std::is_same::value? std::numeric_limits::max() : 1023; distribution dist(a, b); + HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice)); genRandomBuffers(d_input, input, dist, gen, wavefrontSize); - std::vector args = { d_output.ptr(), d_input.ptr() }; + std::vector args = { d_output.ptr(), d_input.ptr(), d_aggType.ptr() }; std::size_t sizeBytes = args.size() * sizeof(void*); void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, args.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE, &sizeBytes, HIP_LAUNCH_PARAM_END}; std::vector code; size_t codeSize; std::string expression = - std::string("reduceCoopKernel<") + + std::string("aggregationCoopKernel<") + functorToString() + ", " + typeToString() + @@ -116,21 +118,13 @@ void runReduce(hiprtcProgram& prog) { INFO("Type: " << typeToString()); for (auto tileSize : tileSizes) { - UNSCOPED_INFO("Tile size: " << tileSize); - for (unsigned int laneId = 0; laneId < wavefrontSize; laneId++) { - unsigned long long mask = ~0ull >> (64 - tileSize); - - mask <<= (((laneId % wavefrontSize) / tileSize) * tileSize); - if (tileSize <= wavefrontSize) { std::string inputStr; if constexpr (!std::is_same::value) { inputStr = std::string(" input: ") + std::to_string(input.host_ptr()[laneId]); } - - UNSCOPED_INFO("laneId: " << laneId << " mask: " << mask << inputStr); } } @@ -139,11 +133,24 @@ void runReduce(hiprtcProgram& prog) { mask <<= ((laneId % wavefrontSize) / tileSize) * tileSize; - if (tileSize <= wavefrontSize) { - T expected; - Op op; - expected = calculateExpected(input.host_ptr(), op, mask); - REQUIRE(output.host_ptr()[numTile * wavefrontSize + laneId] == expected); + if ((1ull << laneId) & mask) { + if (tileSize <= wavefrontSize) { + T expectedByLane[64]; + Op op; + T result = output.host_ptr()[numTile * wavefrontSize + laneId]; + + calculateExpected(expectedByLane, input.host_ptr(), op, mask, aggType); + + if constexpr (std::is_integral::value) { + REQUIRE(result == expectedByLane[laneId]); + } else { + compareFloatingPoint>(result, + expectedByLane[laneId], + mask, + input.host_ptr(), + laneId); + } + } } } @@ -152,61 +159,75 @@ void runReduce(hiprtcProgram& prog) { } template