diff --git a/bolt/common/base/SimdUtil-inl.h b/bolt/common/base/SimdUtil-inl.h index b723b8c0..d5757be1 100644 --- a/bolt/common/base/SimdUtil-inl.h +++ b/bolt/common/base/SimdUtil-inl.h @@ -28,6 +28,7 @@ * -------------------------------------------------------------------------- */ +#include #include #if XSIMD_WITH_NEON @@ -38,6 +39,14 @@ XSIMD_DECLARE_SIMD_REGISTER( detail::neon_vector_type); } // namespace xsimd::types #endif + +#if XSIMD_WITH_SVE +#include +namespace xsimd::types { +XSIMD_DECLARE_SIMD_REGISTER(bool, sve, detail::sve_vector_type); +} +#endif + namespace bytedance::bolt::simd { namespace detail { @@ -106,6 +115,10 @@ struct BitMask { return (vaddv_u8(vget_high_u8(vmask)) << 8) | vaddv_u8(vget_low_u8(vmask)); } #endif + + static int toBitMask(xsimd::batch_bool mask, const xsimd::generic&) { + return genericToBitMask(mask); + } }; template @@ -146,6 +159,16 @@ struct BitMask { } #endif +#if XSIMD_WITH_SVE + static int toBitMask(xsimd::batch_bool mask, const xsimd::sve&) { + svuint16_t onc = svdup_u16(1); + svuint16_t inv = svindex_u16(0, 1); + svuint16_t pow = svlsl_m(svptrue_b16(), onc, inv); + uint32_t nullsres = svaddv(mask, pow); + return nullsres; + } +#endif + static int toBitMask(xsimd::batch_bool mask, const xsimd::generic&) { return genericToBitMask(mask); } @@ -185,6 +208,16 @@ struct BitMask { } #endif +#if XSIMD_WITH_SVE + static int toBitMask(xsimd::batch_bool mask, const xsimd::sve&) { + svuint32_t onc = svdup_u32(1); + svuint32_t inv = svindex_u32(0, 1); + svuint32_t pow = svlsl_m(svptrue_b32(), onc, inv); + uint8_t nullsres = svaddv(mask, pow); + return nullsres; + } +#endif + static int toBitMask(xsimd::batch_bool mask, const xsimd::generic&) { return genericToBitMask(mask); } @@ -224,6 +257,16 @@ struct BitMask { } #endif +#if XSIMD_WITH_SVE + static int toBitMask(xsimd::batch_bool mask, const xsimd::sve&) { + svuint64_t onc = svdup_u64(1); + svuint64_t inv = svindex_u64(0, 1); + svuint64_t pow = svlsl_m(svptrue_b64(), onc, inv); + uint32_t nullsres = svaddv(mask, pow); + return nullsres; + } +#endif + static int toBitMask(xsimd::batch_bool mask, const xsimd::generic&) { return genericToBitMask(mask); } @@ -352,9 +395,21 @@ template <> inline xsimd::batch_bool leadingMask( int i, const xsimd::default_arch&) { + /* + With GCC builds, compiler throws an error "invalid cast" on reintepreting to + the same data type, in SVE 256's case, svbool_t + __attribute__((arm_sve_vector_bits(256))). + So this is a workaround for now. Can be updated once the bug in GCC is + resolved in future GCC versions. + */ + +#if XSIMD_WITH_SVE && defined(__GNUC__) && !defined(__clang__) + return xsimd::batch_bool(leadingMask32[i].data); +#else return reinterpret_cast< xsimd::batch_bool::register_type>( leadingMask32[i].data); +#endif } template <> @@ -368,9 +423,21 @@ template <> inline xsimd::batch_bool leadingMask( int i, const xsimd::default_arch&) { + /* + With GCC builds, compiler throws an error "invalid cast" on reintepreting to + the same data type, in SVE 256's case, svbool_t + __attribute__((arm_sve_vector_bits(256))). + So this is a workaround for now. Can be updated once the bug in GCC is + resolved in future GCC versions. 
+ */ + +#if XSIMD_WITH_SVE && defined(__GNUC__) && !defined(__clang__) + return xsimd::batch_bool(leadingMask64[i].data); +#else return reinterpret_cast< xsimd::batch_bool::register_type>( leadingMask64[i].data); +#endif } } // namespace detail @@ -402,8 +469,8 @@ struct CopyWord, A> { // Copies one element of T and advances 'to', 'from', and 'bytes' by // sizeof(T). Returns false if 'bytes' went to 0. template -inline bool copyNextWord(void*& to, const void*& from, int32_t& bytes) { - if (bytes >= sizeof(T)) { +inline bool copyNextWord(void*& to, const void*& from, int64_t& bytes) { + if (bytes >= static_cast(sizeof(T))) { CopyWord::apply(to, from); bytes -= sizeof(T); if (!bytes) { @@ -418,13 +485,13 @@ inline bool copyNextWord(void*& to, const void*& from, int32_t& bytes) { } // namespace detail template -inline void memcpy(void* to, const void* from, int32_t bytes, const A& arch) { +inline void memcpy(void* to, const void* from, int64_t bytes, const A& arch) { while (bytes >= batchByteSize(arch)) { if (!detail::copyNextWord, A>(to, from, bytes)) { return; } } - while (bytes >= sizeof(int64_t)) { + while (bytes >= static_cast(sizeof(int64_t))) { if (!detail::copyNextWord(to, from, bytes)) { return; } @@ -479,7 +546,7 @@ void memset(void* to, char data, int32_t bytes, const A& arch) { } } int64_t data64 = *reinterpret_cast(&v); - while (bytes >= sizeof(int64_t)) { + while (bytes >= static_cast(sizeof(int64_t))) { if (!detail::setNextWord(to, data64, bytes, arch)) { return; } @@ -553,6 +620,17 @@ struct Gather { return apply(base, loadIndices(indices, arch), arch); } +#if XSIMD_WITH_SVE + template + static xsimd::batch + apply(const T* base, const int32_t* indices, const xsimd::sve& arch) { + svint32_t hashes_vec = svld1_s32(svptrue_b32(), indices); + return reinterpret_cast::register_type>( + svld1_gather_s32index_s32( + svptrue_b32(), reinterpret_cast(base), hashes_vec)); + } +#endif + template static xsimd::batch apply(const T* base, const int32_t* indices, const xsimd::generic&) { @@ -569,6 +647,17 @@ struct Gather { } #endif +#if XSIMD_WITH_SVE + template + static xsimd::batch + apply(const T* base, VIndexType vindex, const xsimd::sve&) { + alignas(A::alignment()) int32_t indices[vindex.size]; + vindex.store_aligned(indices); + svint32_t hashes_vec = svld1_s32(svptrue_b32(), indices); + return genericGather(base, indices); + } +#endif + template static xsimd::batch apply(const T* base, VIndexType vindex, const xsimd::generic&) { @@ -587,6 +676,21 @@ struct Gather { return maskApply(src, mask, base, loadIndices(indices, arch), arch); } +#if XSIMD_WITH_SVE + template + static xsimd::batch maskApply( + xsimd::batch src, + xsimd::batch_bool mask, + const T* base, + const int32_t* indices, + const xsimd::sve& arch) { + svint32_t result = svld1_gather_s32index_s32( + mask, reinterpret_cast(base), svld1_s32(mask, indices)); + return reinterpret_cast::register_type>( + svsel_s32(mask, result, src)); + } +#endif + template static xsimd::batch maskApply( xsimd::batch src, @@ -650,6 +754,23 @@ struct Gather { return Batch64::load_unaligned(indices); } +#if (XSIMD_WITH_SVE && SVE_BITS == 128) + + static Batch64 loadIndices( + const int32_t* indices, + const xsimd::sve&) { + return Batch64::load_unaligned(indices); + } +#endif + +#if (XSIMD_WITH_SVE && SVE_BITS == 256) + static Batch128 loadIndices( + const int32_t* indices, + const xsimd::sve&) { + return Batch128::load_unaligned(indices); + } +#endif + #if XSIMD_WITH_AVX2 template static xsimd::batch @@ -664,6 +785,46 @@ struct Gather { return 
genericGather(base, indices); } +#if XSIMD_WITH_SVE + template + static xsimd::batch + apply(const T* base, const int32_t* indices, const xsimd::sve& arch) { + svint32_t hashes_vec = svld1_s32(svptrue_b32(), indices); + svint64_t idx64_lo = svunpklo_s64(hashes_vec); + return reinterpret_cast::register_type>( + svld1_gather_s64index_s64( + svptrue_b64(), reinterpret_cast(base), idx64_lo)); + } +#endif + +#if (XSIMD_WITH_SVE && SVE_BITS == 256) + template + static xsimd::batch + apply(const T* base, Batch128 vindex, const xsimd::sve&) { + alignas(A::alignment()) int32_t indices[vindex.size]; + vindex.store_unaligned(indices); + svint32_t hashes_vec = svld1_s32(svptrue_b32(), indices); + svint64_t idx64_lo = svunpklo_s64(hashes_vec); + return reinterpret_cast::register_type>( + svld1_gather_s64index_s64( + svptrue_b64(), reinterpret_cast(base), idx64_lo)); + } +#endif + +#if (XSIMD_WITH_SVE && SVE_BITS == 128) + template + static xsimd::batch + apply(const T* base, Batch64 vindex, const xsimd::sve&) { + constexpr int N = xsimd::batch::size; + alignas(A::alignment()) T dst[N]; + auto bytes = reinterpret_cast(base); + for (int i = 0; i < N; ++i) { + dst[i] = *reinterpret_cast(bytes + vindex.data[i] * kScale); + } + return xsimd::load_aligned(dst); + } +#endif + #if XSIMD_WITH_AVX2 template static xsimd::batch apply( @@ -686,6 +847,41 @@ struct Gather { return genericMaskGather(src, mask, base, indices); } +#if (XSIMD_WITH_SVE && SVE_BITS == 128) + template + static xsimd::batch maskApply( + xsimd::batch src, + xsimd::batch_bool mask, + const T* base, + Batch64 vindex, + const xsimd::sve& arch) { + constexpr int N = Batch64::size; + alignas(A::alignment()) int32_t indices[N]; + vindex.store_unaligned(indices); + return maskApply(src, mask, base, indices, arch); + } +#endif + +#if (XSIMD_WITH_SVE && SVE_BITS == 256) + template + static xsimd::batch maskApply( + xsimd::batch src, + xsimd::batch_bool mask, + const T* base, + Batch128 vindex, + const xsimd::sve& arch) { + constexpr int N = Batch128::size; + alignas(A::alignment()) int32_t indices[N]; + vindex.store_unaligned(indices); + svint32_t hashes_vec = svld1_s32(svptrue_b32(), indices); + svint64_t idx64_lo = svunpklo_s64(hashes_vec); + svint64_t result = svld1_gather_s64index_s64( + mask, reinterpret_cast(base), idx64_lo); + return reinterpret_cast::register_type>( + svsel_s64(mask, result, src)); + } +#endif + #if XSIMD_WITH_AVX2 template static xsimd::batch maskApply( @@ -743,6 +939,17 @@ struct Gather { } #endif +#if XSIMD_WITH_SVE + template + static xsimd::batch + apply(const T* base, const int64_t* indices, const xsimd::sve& arch) { + svint64_t hashes_vec = svld1_s64(svptrue_b64(), indices); + return reinterpret_cast::register_type>( + svld1_gather_s64index_s64( + svptrue_b64(), reinterpret_cast(base), hashes_vec)); + } +#endif + template static xsimd::batch apply(const T* base, const int64_t* indices, const xsimd::generic&) { @@ -759,6 +966,21 @@ struct Gather { return maskApply(src, mask, base, loadIndices(indices, arch), arch); } +#if XSIMD_WITH_SVE + template + static xsimd::batch maskApply( + xsimd::batch src, + xsimd::batch_bool mask, + const T* base, + const int64_t* indices, + const xsimd::sve& arch) { + svint64_t result = svld1_gather_s64index_s64( + mask, reinterpret_cast(base), svld1_s64(mask, indices)); + return reinterpret_cast::register_type>( + svsel_s64(mask, result, src)); + } +#endif + #if XSIMD_WITH_AVX2 template static xsimd::batch maskApply( @@ -777,6 +999,23 @@ struct Gather { } #endif +#if XSIMD_WITH_SVE + 
template + static xsimd::batch maskApply( + xsimd::batch src, + xsimd::batch_bool mask, + const T* base, + VIndexType vindex, + const xsimd::sve&) { + alignas(A::alignment()) int64_t indices[vindex.size]; + vindex.store_aligned(indices); + svint64_t result = svld1_gather_s64index_s64( + mask, reinterpret_cast(base), svld1_s64(mask, indices)); + return reinterpret_cast::register_type>( + svsel_s64(mask, result, src)); + } +#endif + template static xsimd::batch maskApply( xsimd::batch src, @@ -818,6 +1057,36 @@ xsimd::batch pack32( } #endif +#if XSIMD_WITH_SVE +template +xsimd::batch pack32( + xsimd::batch x, + xsimd::batch y, + const xsimd::sve&) { + return svuzp1_s16(svreinterpret_s16_s32(x), svreinterpret_s16_s32(y)); +} +#endif + +template +xsimd::batch pack32( + xsimd::batch x, + xsimd::batch y, + const xsimd::generic&) { + constexpr std::size_t size = xsimd::batch::size; + alignas(A) int32_t xArr[size]; + alignas(A) int32_t yArr[size]; + alignas(A) int16_t resultArr[2 * size]; + + x.store_unaligned(xArr); + y.store_unaligned(yArr); + + for (std::size_t i = 0; i < size; ++i) { + resultArr[i] = static_cast(xArr[i]); + resultArr[i + size] = static_cast(yArr[i]); + } + return xsimd::batch::load_unaligned(resultArr); +} + #if XSIMD_WITH_AVX2 template xsimd::batch pack32( @@ -857,6 +1126,16 @@ template Batch64 genericPermute(Batch64 data, Batch64 idx) { static_assert(data.size >= idx.size); Batch64 ans; + for (size_t i = 0; i < idx.size; ++i) { + ans.data[i] = data.data[idx.data[i]]; + } + return ans; +} + +template +Batch128 genericPermute(Batch128 data, Batch128 idx) { + static_assert(data.size >= idx.size); + Batch128 ans; for (int i = 0; i < idx.size; ++i) { ans.data[i] = data.data[idx.data[i]]; } @@ -892,6 +1171,21 @@ struct Permute { } #endif +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + xsimd::batch idx, + const xsimd::sve&) { + if constexpr (std::is_same_v) { + svfloat32_t result = svtbl_f32(data.data, svreinterpret_u32(idx.data)); + return reinterpret_cast(result); + } else { + svint32_t result = svtbl_s32(data.data, svreinterpret_u32(idx.data)); + return reinterpret_cast(result); + } + } +#endif + #if XSIMD_WITH_AVX static HalfBatch apply(HalfBatch data, HalfBatch idx, const xsimd::avx&) { @@ -927,7 +1221,8 @@ xsimd::batch gather( } else { second = xsimd::batch::broadcast(0); } - return detail::pack32(first, second, arch); + auto packed = detail::pack32(first, second, arch); + return packed; } namespace detail { @@ -1050,6 +1345,25 @@ struct GetHalf { } #endif + template + static xsimd::batch apply( + xsimd::batch data, + const xsimd::generic&) { + constexpr std::size_t input_size = xsimd::batch::size; + constexpr std::size_t half_size = input_size / 2; + + alignas(A::alignment()) std::array input_buffer; + data.store_aligned(input_buffer.data()); + + alignas(A::alignment()) std::array output_buffer; + for (std::size_t i = 0; i < half_size; ++i) { + output_buffer[i] = static_cast( + kSecond ? 
input_buffer[i + half_size] : input_buffer[i]); + } + + return xsimd::load_aligned(output_buffer.data()); + } + #if XSIMD_WITH_NEON template static xsimd::batch apply( @@ -1101,6 +1415,23 @@ struct GetHalf { return vmovl_u32(vreinterpret_u32_s32(half)); } #endif + + template + static xsimd::batch apply( + xsimd::batch data, + const xsimd::generic&) { + constexpr std::size_t input_size = xsimd::batch::size; + constexpr std::size_t half_size = input_size / 2; + alignas(A::alignment()) std::array input_buffer; + data.store_aligned(input_buffer.data()); + alignas(A::alignment()) std::array output_buffer; + for (std::size_t i = 0; i < half_size; ++i) { + output_buffer[i] = static_cast( + kSecond ? static_cast(input_buffer[i + half_size]) + : static_cast(input_buffer[i])); + } + return xsimd::load_aligned(output_buffer.data()); + } }; } // namespace detail @@ -1144,6 +1475,21 @@ struct Filter { return ans; } #endif + +#if XSIMD_WITH_SVE + static xsimd::batch + apply(xsimd::batch data, int mask, const xsimd::sve& arch) { + int lane_count = svcntb() / sizeof(T); + T compressed[lane_count]; + int idx = 0; + for (int i = 0; i < lane_count; i++) { + if (mask & (1 << i)) { + compressed[idx++] = data.get(i); + } + } + return xsimd::load_unaligned(compressed); + } +#endif }; template @@ -1177,6 +1523,20 @@ struct Filter { reinterpret_cast<__m256i>(data.data), vindex)); } #endif + +#if XSIMD_WITH_SVE + static xsimd::batch + apply(xsimd::batch data, int mask, const xsimd::sve&) { + auto vindex = xsimd::batch::load_aligned(byteSetBits[mask]); + if constexpr (std::is_same_v) { + svfloat64_t result = svtbl_f64(data.data, svreinterpret_u64(vindex.data)); + return reinterpret_cast(result); + } else { + svint64_t result = svtbl_s64(data.data, svreinterpret_u64(vindex.data)); + return reinterpret_cast(result); + } + } +#endif }; template @@ -1194,6 +1554,15 @@ struct Crc32 { } #endif +#if XSIMD_WITH_SVE + static uint32_t apply(uint32_t checksum, uint64_t value, const xsimd::sve&) { + __asm__("crc32cx %w[c], %w[c], %x[v]" + : [c] "+r"(checksum) + : [v] "r"(value)); + return checksum; + } +#endif + #if XSIMD_WITH_NEON static uint32_t apply(uint32_t checksum, uint64_t value, const xsimd::neon&) { __asm__("crc32cx %w[c], %w[c], %x[v]" @@ -1219,6 +1588,20 @@ xsimd::batch iota(const A&) { namespace detail { +#if (XSIMD_WITH_SVE && SVE_BITS == 256) +template +struct HalfBatchImpl>> { + using Type = Batch128; +}; +#endif + +#if (XSIMD_WITH_SVE && SVE_BITS == 128) +template +struct HalfBatchImpl>> { + using Type = Batch64; +}; +#endif + template struct HalfBatchImpl>> { using Type = xsimd::batch; @@ -1256,10 +1639,18 @@ struct ReinterpretBatch { } }; -#if XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if XSIMD_WITH_NEON || XSIMD_WITH_NEON64 || XSIMD_WITH_SVE template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_u8_s8(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1279,6 +1670,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_s8_u8(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1298,6 +1697,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_u16_s16(data.data); + } +#endif + #if 
XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1317,6 +1724,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_s16_u16(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1336,6 +1751,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_u32_s32(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1355,6 +1778,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_s32_u32(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1374,6 +1805,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_u64_u32(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1393,6 +1832,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_u64_s64(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1412,6 +1859,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_u32_s64(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1431,6 +1886,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_s64_u64(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, @@ -1450,6 +1913,14 @@ struct ReinterpretBatch { template struct ReinterpretBatch { +#if XSIMD_WITH_SVE + static xsimd::batch apply( + xsimd::batch data, + const xsimd::sve&) { + return svreinterpret_u32_u64(data.data); + } +#endif + #if XSIMD_WITH_NEON static xsimd::batch apply( xsimd::batch data, diff --git a/bolt/common/base/SimdUtil.h b/bolt/common/base/SimdUtil.h index 83ac5c58..3a747e08 100644 --- a/bolt/common/base/SimdUtil.h +++ b/bolt/common/base/SimdUtil.h @@ -144,6 +144,50 @@ struct Batch64 { } }; +template +struct Batch128 { + static constexpr size_t size = [] { + static_assert(16 % sizeof(T) == 0); + return 16 / sizeof(T); + }(); + + T data[size]; + + static Batch128 from(std::initializer_list values) { + BOLT_DCHECK_EQ(values.size(), size); + Batch128 ans; + for (int i = 0; i < size; ++i) { + ans.data[i] = *(values.begin() + i); + } + return ans; + } + + void store_unaligned(T* out) const { + std::copy(std::begin(data), std::end(data), out); + } + + static Batch128 load_aligned(const T* mem) { + return load_unaligned(mem); + } + + static Batch128 load_unaligned(const T* mem) { + Batch128 ans; + std::copy(mem, mem + size, ans.data); + return ans; + } + + friend Batch128 operator+(Batch128 x, T y) { + for (int i = 0; i < size; ++i) { + x.data[i] += y; + } + return x; + } + + friend Batch128 operator-(Batch128 x, T y) { + return x + (-y); + } +}; + namespace 
detail { template struct Gather; @@ -191,6 +235,17 @@ gather(const T* base, Batch64 vindex, const A& arch = {}) { return Impl::template apply(base, vindex.data, arch); } +template < + typename T, + typename IndexType, + int kScale = sizeof(T), + typename A = xsimd::default_arch> +xsimd::batch +gather(const T* base, Batch128 vindex, const A& arch = {}) { + using Impl = detail::Gather; + return Impl::template apply(base, vindex.data, arch); +} + // Same as 'gather' above except the indices are read from memory. template < typename T, @@ -236,6 +291,21 @@ xsimd::batch maskGather( return Impl::template maskApply(src, mask, base, vindex.data, arch); } +template < + typename T, + typename IndexType, + int kScale = sizeof(T), + typename A = xsimd::default_arch> +xsimd::batch maskGather( + xsimd::batch src, + xsimd::batch_bool mask, + const T* base, + Batch128 vindex, + const A& arch = {}) { + using Impl = detail::Gather; + return Impl::template maskApply(src, mask, base, vindex.data, arch); +} + // Same as 'maskGather' above but read indices from memory. template < typename T, @@ -465,7 +535,7 @@ inline T* addBytes(T* pointer, int32_t bytes) { // 'memcpy' implementation that copies at maximum width and unrolls // when 'bytes' is constant. template -inline void memcpy(void* to, const void* from, int32_t bytes, const A& = {}); +inline void memcpy(void* to, const void* from, int64_t bytes, const A& = {}); // memset implementation that writes at maximum width and unrolls for // constant values of 'bytes'. diff --git a/bolt/dwio/common/ColumnVisitors.h b/bolt/dwio/common/ColumnVisitors.h index 41199c0a..ac449e05 100644 --- a/bolt/dwio/common/ColumnVisitors.h +++ b/bolt/dwio/common/ColumnVisitors.h @@ -702,7 +702,15 @@ inline xsimd::batch cvtU32toI64( xsimd::batch values) { return _mm256_cvtepu32_epi64(values); } -#elif XSIMD_WITH_SSE2 || XSIMD_WITH_NEON +#elif (XSIMD_WITH_SVE && SVE_BITS == 256) +inline xsimd::batch cvtU32toI64(simd::Batch128 values) { + int64_t element_1 = static_cast(values.data[0]); + int64_t element_2 = static_cast(values.data[1]); + int64_t element_3 = static_cast(values.data[2]); + int64_t element_4 = static_cast(values.data[3]); + return xsimd::batch(element_1, element_2, element_3, element_4); +} +#elif XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || (XSIMD_WITH_SVE && SVE_BITS == 128) inline xsimd::batch cvtU32toI64(simd::Batch64 values) { int64_t lo = static_cast(values.data[0]); int64_t hi = static_cast(values.data[1]); @@ -878,10 +886,19 @@ class DictionaryColumnVisitor dictMask, reinterpret_cast(filterCache() - 3), indices); +#ifdef SVE_BITS + auto unknowns = simd::toBitMask( + simd::reinterpretBatch((cache & (kUnknown << 24)) << 1) != + xsimd::batch(0)); + auto passed = simd::toBitMask( + (cache & xsimd::batch(kSuccess << 24)) != + xsimd::batch(0)); +#else auto unknowns = simd::toBitMask(xsimd::batch_bool( simd::reinterpretBatch((cache & (kUnknown << 24)) << 1))); auto passed = simd::toBitMask( xsimd::batch_bool(simd::reinterpretBatch(cache))); +#endif if (UNLIKELY(unknowns)) { uint16_t bits = unknowns; // Ranges only over inputs that are in dictionary, the not in dictionary @@ -1241,10 +1258,19 @@ class StringDictionaryColumnVisitor } else { cache = simd::gather(base, indices); } +#ifdef SVE_BITS + auto unknowns = simd::toBitMask( + simd::reinterpretBatch((cache & (kUnknown << 24)) << 1) != + xsimd::batch(0)); + auto passed = simd::toBitMask( + (cache & xsimd::batch(kSuccess << 24)) != + xsimd::batch(0)); +#else auto unknowns = simd::toBitMask(xsimd::batch_bool( 
          simd::reinterpretBatch((cache & (kUnknown << 24)) << 1)));
      auto passed = simd::toBitMask(
          xsimd::batch_bool(simd::reinterpretBatch(cache)));
+#endif
     if (UNLIKELY(unknowns)) {
       uint16_t bits = unknowns;
       while (bits) {
diff --git a/conanfile.py b/conanfile.py
index a33d52ad..468f637b 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -398,14 +398,38 @@ def generate(self):
             tc.cache_variables["CMAKE_CXX_FLAGS"] = flags
             tc.cache_variables["CMAKE_C_FLAGS"] = flags
 
+        # Check SVE
+        import subprocess
+        sve_supported = False
+        sve2_supported = False
+        result = subprocess.run(['lscpu'], capture_output=True, text=True, timeout=3)
+        if result.returncode == 0:
+            cpu_info = result.stdout.lower()
+            if 'sve' in cpu_info:
+                sve_supported = True
+            if 'sve2' in cpu_info:
+                sve2_supported = True
+
         if str(self.settings.arch) in ["armv8", "arm"]:
-            # Support CRC & NEON on ARMv8
-            flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv8.3-a"
+            if sve2_supported:
+                # Support CRC & NEON & SVE2 on ARMv8
+                flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv8.3-a+sve2-bitperm -msve-vector-bits=256 -DSVE_BITS=256"
+            elif sve_supported:
+                # Support CRC & NEON & SVE on ARMv8
+                flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv8.3-a+sve -msve-vector-bits=256 -DSVE_BITS=256"
+            else:
+                # Support CRC & NEON on ARMv8
+                flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv8.3-a"
             tc.cache_variables["CMAKE_CXX_FLAGS"] = flags
             tc.cache_variables["CMAKE_C_FLAGS"] = flags
         elif str(self.settings.arch) in ["armv9"]:
             # gcc 12+ https://www.phoronix.com/news/GCC-12-ARMv9-march-armv9-a
-            flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv9-a"
+            if sve2_supported:
+                flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv9-a+sve2-bitperm -msve-vector-bits=256 -DSVE_BITS=256"
+            elif sve_supported:
+                flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv9-a+sve -msve-vector-bits=256 -DSVE_BITS=256"
+            else:
+                flags = f"{self.BOLT_GLOABL_FLAGS} -march=armv9-a"
             tc.variables["CMAKE_C_FLAGS"] = flags
             tc.variables["CMAKE_CXX_FLAGS"] = flags
         if (
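
Note: the SVE toBitMask overloads added in SimdUtil-inl.h all follow the same pattern: build a vector of per-lane powers of two (svdup + svindex + svlsl), then do a predicated horizontal add (svaddv) so that only active predicate lanes contribute their 2^i term. A minimal standalone sketch of that idea for 32-bit lanes follows; the function name is illustrative (not part of the patch), and it assumes a fixed vector length of at most 32 lanes, which holds under the -msve-vector-bits=256 flags set in conanfile.py above.

#include <arm_sve.h>
#include <cstdint>

// Convert an SVE predicate over 32-bit lanes to an integer bitmask:
// active lane i contributes bit (1 << i) to the result.
// Assumes at most 32 lanes (true when built with -msve-vector-bits=256).
inline uint32_t svePredToBitMask32(svbool_t mask) {
  svuint32_t ones = svdup_u32(1);                        // 1 in every lane
  svuint32_t lane = svindex_u32(0, 1);                   // 0, 1, 2, ... per lane
  svuint32_t pow2 = svlsl_m(svptrue_b32(), ones, lane);  // 1 << lane
  return svaddv(mask, pow2);  // sum of (1 << i) over lanes where mask is set
}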