Skip to content

Add new Algorithms using explicit batch type #496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -151,6 +151,46 @@ void mean(const vector_type& a, const vector_type& b, vector_type& res)
}
```

Algorithms like `xsimd::reduce` and `xsimd::transform` are available also in the batch explicit modality:

```cpp
template <class C, class T = typename std::decay<decltype(*C().begin())>::type>
T nansum(const C& v)
{
return xsimd::reduce_batch(v.begin(), v.end(), 0.0,
[](auto x, auto y) {
return (std::isnan(x) ? 0.0 : x) + (std::isnan(y) ? 0.0 : y);
},
[](auto x, auto y) {
static decltype(x) zero(0.0);
auto xnan = xsimd::isnan(x);
auto ynan = xsimd::isnan(y);
auto xs = xsimd::select(xnan, zero, x);
auto ys = xsimd::select(ynan, zero, y);
return xs + ys;
});
}
```

To switch from `std::count_if` to `xsimd::count_if`:

```cpp
// v is an aligned vector of int type
auto count_expected = std::count_if(v.begin(), v.end(),
[](auto x) {
return x >= 50 && x <= 70 ? 1 : 0;
});
auto count = xsimd::count_if(v.begin(), v.end(),
[](auto x) {
return x >= 50 && x <= 70 ? 1 : 0;
},
[](auto b) {
static decltype(b) zero(0);
static decltype(b) one(1);
return xsimd::hadd(xsimd::select(b >= 50 && b <= 70, one, zero));
});
assert(count_expected == count);
```

## Building and Running the Tests

323 changes: 231 additions & 92 deletions include/xsimd/stl/algorithms.hpp
Original file line number Diff line number Diff line change
@@ -11,125 +11,175 @@
#ifndef XSIMD_ALGORITHMS_HPP
#define XSIMD_ALGORITHMS_HPP

#include <type_traits>
#include "../memory/xsimd_load_store.hpp"

namespace xsimd
{
template <class I1, class I2, class O1, class UF>
void transform(I1 first, I2 last, O1 out_first, UF&& f)
namespace detail
{
using value_type = typename std::decay<decltype(*first)>::type;
using traits = simd_traits<value_type>;
using batch_type = typename traits::type;
template <typename... >
using void_t_v = void;

std::size_t size = static_cast<std::size_t>(std::distance(first, last));
std::size_t simd_size = traits::size;
template <class T, class = void>
struct has_increment : std::false_type {};

const auto* ptr_begin = &(*first);
auto* ptr_out = &(*out_first);
template <class T>
struct has_increment<T, void_t_v<decltype(++std::declval<T>())>> : std::true_type {};

std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size);
std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size);
std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1));
template <class... Args>
struct have_increment : all_true<has_increment<Args>::value...> {};

template <class... Args>
struct not_have_increment : all_true<!has_increment<Args>::value...> {};

template <class I1, class I2, class O1, class UF, class UFB>
using enable_unary_algorithm_t = typename std::enable_if<have_increment<I1, I2, O1>::value && not_have_increment<UF, UFB>::value, int>::type;

if (align_begin == out_align)
template <class I1, class I2, class I3, class O1, class UF>
using enable_binary_algorithm_t = typename std::enable_if<have_increment<I1, I2, I3,O1>::value && not_have_increment<UF>::value, int>::type;

template <class I1, class I2, class O1, class UF, class UFB>
void transform(I1 first, I2 last, O1 out_first, UF&& f, UFB&& fb)
{
for (std::size_t i = 0; i < align_begin; ++i)
{
out_first[i] = f(first[i]);
}
using value_type = typename std::decay<decltype(*first)>::type;
using traits = simd_traits<value_type>;
using batch_type = typename traits::type;

batch_type batch;
for (std::size_t i = align_begin; i < align_end; i += simd_size)
std::size_t size = static_cast<std::size_t>(std::distance(first, last));
std::size_t simd_size = traits::size;

const auto* ptr_begin = &(*first);
auto* ptr_out = &(*out_first);

std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size);
std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size);
std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1));

if (align_begin == out_align)
{
xsimd::load_aligned(&first[i], batch);
xsimd::store_aligned(&out_first[i], f(batch));
for (std::size_t i = 0; i < align_begin; ++i)
{
out_first[i] = f(first[i]);
}

batch_type batch;
for (std::size_t i = align_begin; i < align_end; i += simd_size)
{
xsimd::load_aligned(&first[i], batch);
xsimd::store_aligned(&out_first[i], fb(batch));
}

for (std::size_t i = align_end; i < size; ++i)
{
out_first[i] = f(first[i]);
}
}

for (std::size_t i = align_end; i < size; ++i)
else
{
out_first[i] = f(first[i]);
for (std::size_t i = 0; i < align_begin; ++i)
{
out_first[i] = f(first[i]);
}

batch_type batch;
for (std::size_t i = align_begin; i < align_end; i += simd_size)
{
xsimd::load_aligned(&first[i], batch);
xsimd::store_unaligned(&out_first[i], fb(batch));
}

for (std::size_t i = align_end; i < size; ++i)
{
out_first[i] = f(first[i]);
}
}
}
else

template <class I1, class I2, class I3, class O1, class UF, class UFB>
void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f, UFB&& fb)
{
for (std::size_t i = 0; i < align_begin; ++i)
using value_type = typename std::decay<decltype(*first_1)>::type;
using traits = simd_traits<value_type>;
using batch_type = typename traits::type;

std::size_t size = static_cast<std::size_t>(std::distance(first_1, last_1));
std::size_t simd_size = traits::size;

const auto* ptr_begin_1 = &(*first_1);
const auto* ptr_begin_2 = &(*first_2);
auto* ptr_out = &(*out_first);

std::size_t align_begin_1 = xsimd::get_alignment_offset(ptr_begin_1, size, simd_size);
std::size_t align_begin_2 = xsimd::get_alignment_offset(ptr_begin_2, size, simd_size);
std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size);
std::size_t align_end = align_begin_1 + ((size - align_begin_1) & ~(simd_size - 1));

#define XSIMD_LOOP_MACRO(A1, A2, A3) \
for (std::size_t i = 0; i < align_begin_1; ++i) \
{ \
out_first[i] = f(first_1[i], first_2[i]); \
} \
\
batch_type batch_1, batch_2; \
for (std::size_t i = align_begin_1; i < align_end; i += simd_size) \
{ \
xsimd::A1(&first_1[i], batch_1); \
xsimd::A2(&first_2[i], batch_2); \
xsimd::A3(&out_first[i], fb(batch_1, batch_2)); \
} \
\
for (std::size_t i = align_end; i < size; ++i) \
{ \
out_first[i] = f(first_1[i], first_2[i]); \
} \

if (align_begin_1 == out_align && align_begin_1 == align_begin_2)
{
out_first[i] = f(first[i]);
XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_aligned);
}

batch_type batch;
for (std::size_t i = align_begin; i < align_end; i += simd_size)
else if (align_begin_1 == out_align && align_begin_1 != align_begin_2)
{
xsimd::load_aligned(&first[i], batch);
xsimd::store_unaligned(&out_first[i], f(batch));
XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_aligned);
}

for (std::size_t i = align_end; i < size; ++i)
else if (align_begin_1 != out_align && align_begin_1 == align_begin_2)
{
out_first[i] = f(first[i]);
XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_unaligned);
}
else if (align_begin_1 != out_align && align_begin_1 != align_begin_2)
{
XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_unaligned);
}

#undef XSIMD_LOOP_MACRO
}
}

template <class I1, class I2, class I3, class O1, class UF>
void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f)
template <class I1, class I2, class O1, class UF, class UFB,
detail::enable_unary_algorithm_t<I1, I2, O1, UF, UFB> = 0>
void transform(I1 first, I2 last, O1 out_first, UF&& f, UFB&& fb)
{
using value_type = typename std::decay<decltype(*first_1)>::type;
using traits = simd_traits<value_type>;
using batch_type = typename traits::type;
detail::transform(first, last, out_first, f, fb);
}

std::size_t size = static_cast<std::size_t>(std::distance(first_1, last_1));
std::size_t simd_size = traits::size;

const auto* ptr_begin_1 = &(*first_1);
const auto* ptr_begin_2 = &(*first_2);
auto* ptr_out = &(*out_first);

std::size_t align_begin_1 = xsimd::get_alignment_offset(ptr_begin_1, size, simd_size);
std::size_t align_begin_2 = xsimd::get_alignment_offset(ptr_begin_2, size, simd_size);
std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size);
std::size_t align_end = align_begin_1 + ((size - align_begin_1) & ~(simd_size - 1));

#define XSIMD_LOOP_MACRO(A1, A2, A3) \
for (std::size_t i = 0; i < align_begin_1; ++i) \
{ \
out_first[i] = f(first_1[i], first_2[i]); \
} \
\
batch_type batch_1, batch_2; \
for (std::size_t i = align_begin_1; i < align_end; i += simd_size) \
{ \
xsimd::A1(&first_1[i], batch_1); \
xsimd::A2(&first_2[i], batch_2); \
xsimd::A3(&out_first[i], f(batch_1, batch_2)); \
} \
\
for (std::size_t i = align_end; i < size; ++i) \
{ \
out_first[i] = f(first_1[i], first_2[i]); \
} \

if (align_begin_1 == out_align && align_begin_1 == align_begin_2)
{
XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_aligned);
}
else if (align_begin_1 == out_align && align_begin_1 != align_begin_2)
{
XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_aligned);
}
else if (align_begin_1 != out_align && align_begin_1 == align_begin_2)
{
XSIMD_LOOP_MACRO(load_aligned, load_aligned, store_unaligned);
}
else if (align_begin_1 != out_align && align_begin_1 != align_begin_2)
{
XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_unaligned);
}
template <class I1, class I2, class O1, class UF>
void transform(I1 first, I2 last, O1 out_first, UF&& f)
{
detail::transform(first, last, out_first, f, f);
}

#undef XSIMD_LOOP_MACRO
template <class I1, class I2, class I3, class O1, class UF, class UFB>
void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f, UFB&& fb)
{
detail::transform(first_1, last_1, first_2, out_first, f, fb);
}

template <class I1, class I2, class I3, class O1, class UF,
detail::enable_binary_algorithm_t<I1, I2, I3, O1, UF> = 0>
void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f)
{
detail::transform(first_1, last_1, first_2, out_first, f, f);
}

// TODO: Remove this once we drop C++11 support
namespace detail
@@ -141,9 +191,8 @@ namespace xsimd
};
}


template <class Iterator1, class Iterator2, class Init, class BinaryFunction = detail::plus>
Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun = detail::plus{})
template <class Iterator1, class Iterator2, class Init, class BinaryFunction, class BinaryFunctionBatch>
Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun, BinaryFunctionBatch&& binfun_batch)
{
using value_type = typename std::decay<decltype(*first)>::type;
using traits = simd_traits<value_type>;
@@ -180,7 +229,7 @@ namespace xsimd
for (auto const end = ptr_begin + align_end; ptr < end; ptr += simd_size)
{
xsimd::load_aligned(ptr, batch);
batch_init = binfun(batch_init, batch);
batch_init = binfun_batch(batch_init, batch);
}

// reduce across batch
@@ -197,6 +246,96 @@ namespace xsimd
return init;
}

template <class Iterator1, class Iterator2, class Init, class BinaryFunction = detail::plus>
Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun = detail::plus{})
{
return reduce(first, last, init, binfun, binfun);
}

namespace detail
{
template <class T>
struct count_batch
{
count_batch(T value)
: value(value)
{}

count_batch(const count_batch<T>&) = default;
count_batch(count_batch<T>&&) = default;

template <class B>
std::size_t operator()(const B& b)
{
static auto zero = B(T(0));
static auto one = B(T(1));
return static_cast<std::size_t>(xsimd::hadd(xsimd::select(b == value, one, zero)));
}

private:
T value;
};
}

template <class Iterator1, class Iterator2, class UnaryPredicate, class UnaryPredicateBatch>
std::size_t count_if(Iterator1 first, Iterator2 last, UnaryPredicate&& predicate, UnaryPredicateBatch&& predicate_batch)
{
using value_type = typename std::decay<decltype(*first)>::type;
using traits = simd_traits<value_type>;
using batch_type = typename traits::type;

std::size_t size = static_cast<std::size_t>(std::distance(first, last));
constexpr std::size_t simd_size = traits::size;

std::size_t counter(0);
if(size < simd_size)
{
while(first != last)
{
counter += predicate(*first++);
}
return counter;
}

const auto* const ptr_begin = &(*first);

std::size_t align_begin = xsimd::get_alignment_offset(ptr_begin, size, simd_size);
std::size_t align_end = align_begin + ((size - align_begin) & ~(simd_size - 1));

// reduce initial unaligned part
for (std::size_t i = 0; i < align_begin; ++i)
{
counter += predicate(first[i]);
}

// reduce aligned part
batch_type batch;
auto ptr = ptr_begin + align_begin;
for (auto const end = ptr_begin + align_end; ptr < end; ptr += simd_size)
{
xsimd::load_aligned(ptr, batch);
counter += predicate_batch(batch);
}

// reduce final unaligned part
for (std::size_t i = align_end; i < size; ++i)
{
counter += predicate(first[i]);
}

return counter;
}

template <class Iterator1, class Iterator2, class T>
std::size_t count(Iterator1 first, Iterator2 last, const T& value)
{
return count_if(first,
last,
[&value](const T& x) { return value == x; },
detail::count_batch<T>{value}
);
}

}

#endif
159 changes: 158 additions & 1 deletion test/test_algorithms.cpp
Original file line number Diff line number Diff line change
@@ -10,6 +10,10 @@

#include <numeric>
#include "test_utils.hpp"
#include <xsimd/xsimd.hpp>
#include <xsimd/types/xsimd_fallback.hpp>
#include <xsimd/types/xsimd_traits.hpp>
#include <xsimd/types/xsimd_base.hpp>

struct binary_functor
{
@@ -37,6 +41,64 @@ template <class T>
using test_allocator_type = std::allocator<T>;
#endif

template <class C>
struct types {
using value_type = typename std::decay<decltype(*C().begin())>::type;
using traits = xsimd::simd_traits<value_type>;
using batch_type = typename traits::type;
};

#if XSIMD_X86_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE || XSIMD_ARM_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE
TEST(algorithms, unary_transform_batch)
{
using vector_type = std::vector<int, test_allocator_type<int>>;
using batch_type = types<vector_type>::batch_type;
auto flip_flop = vector_type(42, 0);
std::iota(flip_flop.begin(), flip_flop.end(), 1);
auto square_pair = [](int x) {
return !(x % 2) ? x : x*x;
};
auto flip_flop_axpected = flip_flop;
std::transform(flip_flop_axpected.begin(), flip_flop_axpected.end(), flip_flop_axpected.begin(), square_pair);

xsimd::transform(flip_flop.begin(), flip_flop.end(), flip_flop.begin(),
// NOTE: since c++14 a simple `[](auto x)` reduce code complexity
[](int x) {
return !(x % 2) ? x : x*x;
},
// NOTE: since c++14 a simple `[](auto b)` reduce code complexity
[](batch_type b) {
return xsimd::select(!(b % 2), b, b*b);
});
EXPECT_TRUE(std::equal(flip_flop_axpected.begin(), flip_flop_axpected.end(), flip_flop.begin()) && flip_flop_axpected.size() == flip_flop.size());
}

TEST(algorithms, binary_transform_batch)
{
using vector_type = std::vector<int, test_allocator_type<int>>;
using batch_type = types<vector_type>::batch_type;
auto flip_flop_a = vector_type(42, 0);
auto flip_flop_b = vector_type(42, 0);
std::iota(flip_flop_a.begin(), flip_flop_a.end(), 1);
std::iota(flip_flop_b.begin(), flip_flop_b.end(), 3);
auto square_pair = [](int x, int y) {
return !((x + y) % 2) ? x + y : x*x + y*y;
};
auto flip_flop_axpected = flip_flop_a;
std::transform(flip_flop_a.begin(), flip_flop_a.end(), flip_flop_b.begin(), flip_flop_axpected.begin(), square_pair);

auto flip_flop_result = vector_type(flip_flop_axpected.size());
xsimd::transform(flip_flop_a.begin(), flip_flop_a.end(), flip_flop_b.begin(), flip_flop_result.begin(),
[](int x, int y) {
return !((x +y) % 2) ? x + y : x*x + y*y;
},
[](batch_type bx, batch_type by) {
return xsimd::select(!((bx + by) % 2), bx + by, bx*bx + by+by);
});
EXPECT_TRUE(std::equal(flip_flop_axpected.begin(), flip_flop_axpected.end(), flip_flop_result.begin()) && flip_flop_axpected.size() == flip_flop_result.size());
}
#endif

TEST(algorithms, binary_transform)
{
std::vector<double> expected(93);
@@ -83,7 +145,6 @@ TEST(algorithms, binary_transform)
std::fill(ca.begin(), ca.end(), -1); // erase
}


TEST(algorithms, unary_transform)
{
std::vector<double> expected(93);
@@ -216,6 +277,102 @@ TEST_F(xsimd_reduce, using_custom_binary_function)
}
}

TEST(algorithms, reduce_batch)
{
const float nan = std::numeric_limits<float>::quiet_NaN();
using vector_type = std::vector<float, test_allocator_type<float>>;
using batch_type = types<vector_type>::batch_type;
auto vector_with_nan = vector_type(1000, 0);
std::iota(vector_with_nan.begin(), vector_with_nan.end(), 3.5);
auto i = 0;
auto add_nan = [&i, &nan](const float x) {
return i % 2 ? nan : x;
};
std::transform(vector_with_nan.begin(), vector_with_nan.end(), vector_with_nan.begin(), add_nan);

auto nansum_expected = std::accumulate(vector_with_nan.begin(), vector_with_nan.end(), 0.0,
[](float x, float y) {
return (std::isnan(x) ? 0.0 : x) + (std::isnan(y) ? 0.0 : y);
});

auto nansum = xsimd::reduce(vector_with_nan.begin(), vector_with_nan.end(), 0.0,
[](float x, float y) {
return (std::isnan(x) ? 0.0 : x) + (std::isnan(y) ? 0.0 : y);
},
[](batch_type x, batch_type y) {
static batch_type zero(0.0);
auto xnan = xsimd::isnan(x);
auto ynan = xsimd::isnan(y);
auto xs = xsimd::select(xnan, zero, x);
auto ys = xsimd::select(ynan, zero, y);
return xs + ys;
});

EXPECT_NEAR(nansum_expected, nansum, 1e-6);

auto count_nan_expected = std::count_if(vector_with_nan.begin(), vector_with_nan.end(),
[](float x){
return static_cast<std::size_t>(std::isnan(x));
});

auto count_nan = xsimd::count_if(vector_with_nan.begin(), vector_with_nan.end(),
[](float x){
return static_cast<std::size_t>(std::isnan(x));
},
[](batch_type b) {
static decltype(b) zero(0.0);
static decltype(b) one(1.0);
auto bnan = xsimd::isnan(b);
return static_cast<std::size_t>(xsimd::hadd(xsimd::select(bnan, one, zero)));
});

EXPECT_EQ(count_nan_expected, count_nan);

auto count_not_nan_expected = vector_with_nan.size() - count_nan_expected;
auto count_not_nan = vector_with_nan.size() - count_nan;

auto nanmean_expected = count_not_nan_expected ? nansum_expected / (float)count_not_nan_expected : 0;
auto nanmean = count_not_nan ? nansum / (float)count_not_nan : 0;

EXPECT_NEAR(nanmean_expected, nanmean, 1e-6);
}

TEST(algorithms, count)
{
using vector_type = std::vector<float, test_allocator_type<float>>;
auto v = vector_type(100, 0);
std::iota(v.begin(), v.end(), 3.14);
v[12] = 123.321;
v[42] = 123.321;
v[93] = 123.321;

EXPECT_EQ(3, xsimd::count(v.begin(), v.end(), 123.321));
}

TEST(algorithms, count_if)
{
using vector_type = std::vector<int, test_allocator_type<int>>;
using batch_type = types<vector_type>::batch_type;
auto v = vector_type(100, 0);
std::iota(v.begin(), v.end(), 1);

auto count_expected = std::count_if(v.begin(), v.end(),
[](int x) {
return x >= 50 && x <= 70 ? 1 : 0;
});

auto count = xsimd::count_if(v.begin(), v.end(),
[](int x) {
return x >= 50 && x <= 70 ? 1 : 0;
},
[](batch_type b) {
static batch_type zero(0);
static batch_type one(1);
return xsimd::hadd(xsimd::select(b >= 50 && b <= 70, one, zero));
});
EXPECT_EQ(count_expected, count);
}

#if XSIMD_X86_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE || XSIMD_ARM_INSTR_SET > XSIMD_VERSION_NUMBER_NOT_AVAILABLE
TEST(algorithms, iterator)
{