@@ -1640,6 +1640,44 @@ namespace xsimd
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
        }

+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
+        {
+            // permute within each lane
+            constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+            constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
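+            // _mm_shufflelo_epi16 permutes the four low words and copies the
+            // high 64 bits through; _mm_shufflehi_epi16 does the converse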
+            __m128i lo = _mm_shufflelo_epi16(self, mask_lo);
+            __m128i hi = _mm_shufflehi_epi16(self, mask_hi);
+
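+            // broadcast the permuted low half of lo and the permuted high
+            // half of hi to both 64-bit halves of their registers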
+            __m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0)));
+            __m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1)));
+
+            // mask to choose the right lane
+            batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;
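+            // (Vi < 4) means element i picks from the low half of the source,
+            // i.e. from lo_lo; elements with Vi >= 4 pick from hi_hi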
+
+            // blend the two permutes
+            return select(blend_mask, batch<uint16_t, A>(lo_lo), batch<uint16_t, A>(hi_hi));
+        }
+
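+        // the signed variant reuses the unsigned implementation via bitwise_cast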
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
+        {
+            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, sse2 {}));
+        }
+
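+        // usage sketch (values are illustrative): reversing a batch through
+        // the public xsimd::swizzle entry point, which dispatches here on sse2
+        //   batch<uint16_t, sse2> v{0, 1, 2, 3, 4, 5, 6, 7};
+        //   auto r = xsimd::swizzle(v, batch_constant<uint16_t, sse2, 7, 6, 5, 4, 3, 2, 1, 0> {});
+        //   // r is {7, 6, 5, 4, 3, 2, 1, 0}
+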
        // transpose
        template <class A>
        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept