 #include <emmintrin.h>
 #define ZEND_HAVE_VECTOR_128
 
-typedef __m128i zend_vec_8x16_t;
-typedef __m128i zend_vec_16x8_t;
-typedef __m128i zend_vec_32x4_t;
-typedef __m128i zend_vec_64x2_t;
-
-#define zend_vec_setzero_8x16() _mm_setzero_si128()
-#define zend_vec_set_8x16(x) _mm_set1_epi8(x)
-#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7)
-#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) _mm_set_epi32(x0, x1, x2, x3)
-#define zend_vec_set_8x16_from_64x2(x0, x1) _mm_set_epi64(x0, x1)
-#define zend_vec_load_8x16(x) _mm_load_si128((const __m128i *) (x))
-#define zend_vec_loadu_8x16(x) _mm_loadu_si128((const __m128i *) (x))
-#define zend_vec_store_8x16(to, x) _mm_store_si128((__m128i *) (to), x)
-#define zend_vec_storeu_8x16(to, x) _mm_storeu_si128((__m128i *) (to), x)
-
-#define zend_vec_or_8x16(a, b) _mm_or_si128(a, b)
-#define zend_vec_xor_8x16(a, b) _mm_xor_si128(a, b)
-#define zend_vec_and_8x16(a, b) _mm_and_si128(a, b)
-#define zend_vec_rshift_128_from_8x16(x, bytes) _mm_srli_si128(x, bytes)
-#define zend_vec_lshift_128_from_8x16(x, bytes) _mm_slli_si128(x, bytes)
-
-#define zend_vec_add_8x16(a, b) _mm_add_epi8(a, b)
-
-#define zend_vec_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b)
-#define zend_vec_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b)
-#define zend_vec_cmpgt_8x16(a, b) _mm_cmpgt_epi8(a, b)
-
-#define zend_vec_movemask_8x16(x) _mm_movemask_epi8(x)
-
 
 #elif defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
 #define ZEND_HAVE_VECTOR_128
 
-typedef int8x16_t zend_vec_8x16_t;
-typedef int16x8_t zend_vec_16x8_t;
-typedef int32x4_t zend_vec_32x4_t;
-typedef int64x2_t zend_vec_64x2_t;
+typedef int8x16_t __m128i;
 
-#define zend_vec_setzero_8x16() vdupq_n_s8(0)
-#define zend_vec_set_8x16(x) vdupq_n_s8(x)
-#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) \
+#define _mm_setzero_si128() vdupq_n_s8(0)
+#define _mm_set1_epi8(x) vdupq_n_s8(x)
+#define _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \
 	vreinterpretq_s8_s16((int16x8_t) { \
 		(int16_t) (x7), (int16_t) (x6), (int16_t) (x5), (int16_t) (x4), \
 		(int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) })
-#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) \
+#define _mm_set_epi32(x0, x1, x2, x3) \
 	vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) })
-#define zend_vec_set_8x16_from_64x2(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) })
-#define zend_vec_load_8x16(x) vld1q_s8((const int8_t *) (x))
-#define zend_vec_loadu_8x16(x) zend_vec_load_8x16(x)
-#define zend_vec_store_8x16(to, x) vst1q_s8((int8_t *) (to), x)
-#define zend_vec_storeu_8x16(to, x) zend_vec_store_8x16(to, x)
+#define _mm_set_epi64(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) })
+#define _mm_load_si128(x) vld1q_s8((const int8_t *) (x))
+#define _mm_loadu_si128(x) _mm_load_si128(x)
+#define _mm_store_si128(to, x) vst1q_s8((int8_t *) (to), x)
+#define _mm_storeu_si128(to, x) _mm_store_si128(to, x)
 
-#define zend_vec_or_8x16(a, b) vorrq_s8(a, b)
-#define zend_vec_xor_8x16(a, b) veorq_s8(a, b)
-#define zend_vec_and_8x16(a, b) vandq_s8(a, b)
-#define zend_vec_rshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes))
-#define zend_vec_lshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes))
+#define _mm_or_si128(a, b) vorrq_s8(a, b)
+#define _mm_xor_si128(a, b) veorq_s8(a, b)
+#define _mm_and_si128(a, b) vandq_s8(a, b)
+#define _mm_srli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes))
+#define _mm_slli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes))
 
-#define zend_vec_add_8x16(a, b) vaddq_s8(a, b)
+#define _mm_add_epi8(a, b) vaddq_s8(a, b)
 
-#define zend_vec_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b)))
-#define zend_vec_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))
-#define zend_vec_cmpgt_8x16(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b)))
+#define _mm_cmpeq_epi8(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b)))
+#define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))
+#define _mm_cmpgt_epi8(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b)))
 
-static zend_always_inline int zend_vec_movemask_8x16(int8x16_t x)
+static zend_always_inline int _mm_movemask_epi8(int8x16_t x)
 {
 	/**
 	 * based on code from
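For context, the `+` side gives NEON builds the same SSE2-style surface that x86 builds get natively from <emmintrin.h>, so call sites can be written once against the `_mm_*` names. The sketch below is illustrative only and is not part of this change: the function name `find_byte_128` and the `__builtin_ctz` bit scan are assumptions chosen for the example. It shows how a caller might use the loads, byte compare, and movemask defined above to scan a buffer 16 bytes at a time for a single byte.

/* Hypothetical usage sketch (not from this diff): locate the first
 * occurrence of byte c in a buffer, 16 bytes per iteration. Assumes the
 * header shown above is included and a GCC/Clang-style __builtin_ctz. */
#include <stddef.h>
#include <string.h>

static const char *find_byte_128(const char *p, size_t len, char c)
{
	__m128i needle = _mm_set1_epi8(c);                  /* broadcast c into all 16 lanes */
	while (len >= 16) {
		__m128i chunk = _mm_loadu_si128((const __m128i *) p); /* unaligned 16-byte load */
		int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, needle)); /* one bit per matching lane */
		if (mask) {
			return p + __builtin_ctz(mask);             /* index of the first set bit = first match */
		}
		p += 16;
		len -= 16;
	}
	return len ? memchr(p, c, len) : NULL;              /* scalar tail for the remaining bytes */
}

The same source compiles on both back ends because the NEON section typedefs `__m128i` to `int8x16_t` and maps each `_mm_*` name onto the corresponding NEON intrinsic.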