Skip to content

Commit 66fd323

Browse files
committed
1. Adding stream API for non-temporal data transfers
2. Adding xsimd::fence as a wrapper around std::atomic_thread_fence for memory ordering of non-temporal accesses 3. Adding tests
1 parent d6150b8 commit 66fd323

File tree

10 files changed

+351
-0
lines changed

10 files changed

+351
-0
lines changed

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,12 @@ namespace xsimd
298298
return load_unaligned(mem, b, A {});
299299
}
300300

301+
template <class A, class T>
302+
XSIMD_INLINE batch_bool<T, A> load_stream(bool const* mem, batch_bool<T, A> b, requires_arch<common>) noexcept
303+
{
304+
return load_aligned(mem, b, A {});
305+
}
306+
301307
// load_aligned
302308
namespace detail
303309
{
@@ -348,6 +354,12 @@ namespace xsimd
348354
return detail::load_unaligned<A>(mem, cvt, common {}, detail::conversion_type<A, T_in, T_out> {});
349355
}
350356

357+
template <class A, class T_in, class T_out>
358+
XSIMD_INLINE batch<T_out, A> load_stream(T_in const* mem, convert<T_out> cvt, requires_arch<common>) noexcept
359+
{
360+
return load_aligned<A>(mem, cvt, A {});
361+
}
362+
351363
// rotate_right
352364
template <size_t N, class A, class T>
353365
XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -589,6 +601,12 @@ namespace xsimd
589601
mem[i] = bool(buffer[i]);
590602
}
591603

604+
template <class A, class T>
605+
XSIMD_INLINE void store_stream(batch_bool<T, A> const& self, bool* mem, requires_arch<common>) noexcept
606+
{
607+
store(self, mem, A {});
608+
}
609+
592610
// store_aligned
593611
template <class A, class T_in, class T_out>
594612
XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
@@ -607,6 +625,12 @@ namespace xsimd
607625
return store_aligned<A>(mem, self, common {});
608626
}
609627

628+
template <class A, class T_in, class T_out>
629+
XSIMD_INLINE void store_stream(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
630+
{
631+
store_aligned<A>(mem, self, A {});
632+
}
633+
610634
// swizzle
611635
template <class A, class T, class ITy, ITy... Vs>
612636
XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<common>) noexcept
@@ -688,6 +712,12 @@ namespace xsimd
688712
return detail::load_complex(hi, lo, A {});
689713
}
690714

715+
template <class A, class T_out, class T_in>
716+
XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_stream(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<common>) noexcept
717+
{
718+
return load_complex_aligned<A>(mem, kernel::convert<std::complex<T_out>> {}, A {});
719+
}
720+
691721
// store_complex_aligned
692722
template <class A, class T_out, class T_in>
693723
XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
@@ -712,6 +742,12 @@ namespace xsimd
712742
hi.store_unaligned(buffer + real_batch::size);
713743
}
714744

745+
template <class A, class T_out, class T_in>
746+
XSIMD_INLINE void store_complex_stream(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
747+
{
748+
store_complex_aligned<A>(dst, src, A {});
749+
}
750+
715751
// transpose
716752
template <class A, class T>
717753
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<common>) noexcept

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1404,6 +1404,23 @@ namespace xsimd
14041404
return _mm256_storeu_pd(mem, self);
14051405
}
14061406

1407+
// store_stream
1408+
template <class A>
1409+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
1410+
{
1411+
_mm256_stream_ps(mem, self);
1412+
}
1413+
template <class A>
1414+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
1415+
{
1416+
_mm256_stream_pd(mem, self);
1417+
}
1418+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1419+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
1420+
{
1421+
_mm256_stream_si256((__m256i*)mem, self);
1422+
}
1423+
14071424
// sub
14081425
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
14091426
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,23 @@ namespace xsimd
116116
}
117117
}
118118

119+
// load_stream
120+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
121+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
122+
{
123+
return _mm256_stream_load_si256((__m256i const*)mem);
124+
}
125+
template <class A>
126+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx2>) noexcept
127+
{
128+
return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem));
129+
}
130+
template <class A>
131+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx2>) noexcept
132+
{
133+
return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem));
134+
}
135+
119136
// bitwise_and
120137
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
121138
XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,6 +1391,23 @@ namespace xsimd
13911391
return _mm512_loadu_pd(mem);
13921392
}
13931393

1394+
// load_stream
1395+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1396+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1397+
{
1398+
return _mm512_stream_load_si512((__m512i*)mem);
1399+
}
1400+
template <class A>
1401+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1402+
{
1403+
return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem));
1404+
}
1405+
template <class A>
1406+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1407+
{
1408+
return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem));
1409+
}
1410+
13941411
// lt
13951412
template <class A>
13961413
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
@@ -2171,6 +2188,23 @@ namespace xsimd
21712188
return _mm512_storeu_pd(mem, self);
21722189
}
21732190

2191+
// store_stream
2192+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2193+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
2194+
{
2195+
_mm512_stream_si512((__m512i*)mem, self);
2196+
}
2197+
template <class A>
2198+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
2199+
{
2200+
_mm512_stream_ps(mem, self);
2201+
}
2202+
template <class A>
2203+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
2204+
{
2205+
_mm512_stream_pd(mem, self);
2206+
}
2207+
21742208
// sub
21752209
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
21762210
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1741,6 +1741,23 @@ namespace xsimd
17411741
return _mm_storeu_pd(mem, self);
17421742
}
17431743

1744+
// store_stream
1745+
template <class A>
1746+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
1747+
{
1748+
_mm_stream_ps(mem, self);
1749+
}
1750+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1751+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
1752+
{
1753+
_mm_stream_si128((__m128i*)mem, self);
1754+
}
1755+
template <class A>
1756+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
1757+
{
1758+
_mm_stream_pd(mem, self);
1759+
}
1760+
17441761
// sub
17451762
template <class A>
17461763
XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,23 @@ namespace xsimd
166166
}
167167
}
168168

169+
// load_stream
170+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
171+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<sse4_1>) noexcept
172+
{
173+
return _mm_stream_load_si128((__m128i*)mem);
174+
}
175+
template <class A>
176+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<sse4_1>) noexcept
177+
{
178+
return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem));
179+
}
180+
template <class A>
181+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<sse4_1>) noexcept
182+
{
183+
return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem));
184+
}
185+
169186
// min
170187
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
171188
XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept

include/xsimd/memory/xsimd_alignment.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,17 @@ namespace xsimd
3333
{
3434
};
3535

36+
/**
 * @struct stream_mode
 * @brief tag for load and store of aligned non-temporal (cache-bypassing) memory.
 *
 * Streaming accesses expect aligned pointers. When no architecture-specific
 * implementation is available, they fall back to aligned semantics.
 */
struct stream_mode
{
};
46+
3647
/***********************
3748
* Allocator alignment *
3849
***********************/

include/xsimd/types/xsimd_api.hpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#ifndef XSIMD_API_HPP
1313
#define XSIMD_API_HPP
1414

15+
#include <atomic>
1516
#include <complex>
1617
#include <cstddef>
1718
#include <limits>
@@ -1334,6 +1335,30 @@ namespace xsimd
13341335
return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
13351336
}
13361337

1338+
template <class To, class A = default_arch, class From>
1339+
XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, stream_mode) noexcept
1340+
{
1341+
using batch_value_type = typename simd_return_type<From, To, A>::value_type;
1342+
detail::static_check_supported_config<From, A>();
1343+
detail::static_check_supported_config<To, A>();
1344+
return kernel::load_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
1345+
}
1346+
1347+
template <class To, class A = default_arch>
1348+
XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, stream_mode) noexcept
1349+
{
1350+
detail::static_check_supported_config<To, A>();
1351+
return simd_return_type<bool, To, A>::load_stream(ptr);
1352+
}
1353+
1354+
template <class To, class A = default_arch, class From>
1355+
XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, stream_mode) noexcept
1356+
{
1357+
detail::static_check_supported_config<To, A>();
1358+
using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
1359+
return kernel::load_complex_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
1360+
}
1361+
13371362
#ifdef XSIMD_ENABLE_XTL_COMPLEX
13381363
template <class To, class A = default_arch, class From, bool i3ec>
13391364
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
@@ -1342,6 +1367,14 @@ namespace xsimd
13421367
detail::static_check_supported_config<From, A>();
13431368
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
13441369
}
1370+
1371+
template <class To, class A = default_arch, class From, bool i3ec>
1372+
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, stream_mode) noexcept
1373+
{
1374+
detail::static_check_supported_config<To, A>();
1375+
detail::static_check_supported_config<From, A>();
1376+
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), stream_mode());
1377+
}
13451378
#endif
13461379

13471380
/**
@@ -1416,6 +1449,13 @@ namespace xsimd
14161449
return load_as<From, A>(ptr, unaligned_mode {});
14171450
}
14181451

1452+
template <class A = default_arch, class From>
1453+
XSIMD_INLINE batch<From, A> load(From const* ptr, stream_mode) noexcept
1454+
{
1455+
detail::static_check_supported_config<From, A>();
1456+
return load_as<From, A>(ptr, stream_mode {});
1457+
}
1458+
14191459
/**
14201460
* @ingroup batch_data_transfer
14211461
*
@@ -2339,12 +2379,40 @@ namespace xsimd
23392379
kernel::store_complex_aligned<A>(dst, src, A {});
23402380
}
23412381

2382+
template <class To, class A = default_arch, class From>
2383+
XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, stream_mode) noexcept
2384+
{
2385+
detail::static_check_supported_config<From, A>();
2386+
kernel::store_stream<A>(dst, src, A {});
2387+
}
2388+
2389+
template <class A = default_arch, class From>
2390+
XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, stream_mode) noexcept
2391+
{
2392+
detail::static_check_supported_config<From, A>();
2393+
kernel::store_stream<A>(src, dst, A {});
2394+
}
2395+
2396+
template <class To, class A = default_arch, class From>
2397+
XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
2398+
{
2399+
detail::static_check_supported_config<std::complex<From>, A>();
2400+
kernel::store_complex_stream<A>(dst, src, A {});
2401+
}
2402+
23422403
#ifdef XSIMD_ENABLE_XTL_COMPLEX
23432404
template <class To, class A = default_arch, class From, bool i3ec>
23442405
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
23452406
{
23462407
store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
23472408
}
2409+
2410+
template <class To, class A = default_arch, class From, bool i3ec>
2411+
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
2412+
{
2413+
detail::static_check_supported_config<std::complex<From>, A>();
2414+
store_as(reinterpret_cast<std::complex<To>*>(dst), src, stream_mode());
2415+
}
23482416
#endif
23492417

23502418
/**
@@ -2413,6 +2481,22 @@ namespace xsimd
24132481
store_as<T, A>(mem, val, unaligned_mode {});
24142482
}
24152483

2484+
template <class A, class T>
2485+
XSIMD_INLINE void store(T* mem, batch<T, A> const& val, stream_mode) noexcept
2486+
{
2487+
store_as<T, A>(mem, val, stream_mode {});
2488+
}
2489+
2490+
/**
 * @ingroup batch_data_transfer
 *
 * Issues a sequentially consistent memory fence.
 *
 * NOTE(review): on some compilers a seq_cst std::atomic_thread_fence may be
 * lowered to a locked RMW instruction rather than a full fence, which is not
 * guaranteed to order non-temporal (streaming) stores — confirm whether an
 * architecture-specific store fence (e.g. _mm_sfence on x86) is required
 * after store_stream before the data is published to other threads.
 */
XSIMD_INLINE void fence() noexcept
{
    std::atomic_thread_fence(std::memory_order_seq_cst);
}
2499+
24162500
/**
24172501
* @ingroup batch_data_transfer
24182502
*

0 commit comments

Comments
 (0)