Commit b471a90 (1 parent: b8972bc)

Use zend_simd.h as a wrapper for neon

File tree: 3 files changed, +60 -92 lines

  Zend/zend_simd.h
  ext/opcache/ZendAccelerator.c
  ext/standard/url.c
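The commit turns Zend/zend_simd.h from a zend_vec_* abstraction into a thin compatibility layer: on x86-64 it keeps including <emmintrin.h>, while on AArch64 it maps the SSE2 names (__m128i, _mm_*) onto NEON typedefs, macros and inline functions. A hypothetical call site, not part of this commit and assuming the header contents shown in the Zend/zend_simd.h diff below, illustrates the effect: the same SSE2-style code builds on both architectures.

/* Hypothetical example (not from the commit): bit i of the result is set
 * when p[i] == ' '. Compiles against real SSE2 intrinsics on x86-64 and
 * against the NEON wrappers on AArch64. */
#include "zend_simd.h"

#ifdef ZEND_HAVE_VECTOR_128
static inline int spaces_mask16(const char *p)
{
	__m128i chunk  = _mm_loadu_si128((const __m128i *) p);
	__m128i spaces = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(' '));
	return _mm_movemask_epi8(spaces);
}
#endif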

Zend/zend_simd.h (+20 -52)
@@ -23,72 +23,40 @@
 #include <emmintrin.h>
 #define ZEND_HAVE_VECTOR_128
 
-typedef __m128i zend_vec_8x16_t;
-typedef __m128i zend_vec_16x8_t;
-typedef __m128i zend_vec_32x4_t;
-typedef __m128i zend_vec_64x2_t;
-
-#define zend_vec_setzero_8x16() _mm_setzero_si128()
-#define zend_vec_set_8x16(x) _mm_set1_epi8(x)
-#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7)
-#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) _mm_set_epi32(x0, x1, x2, x3)
-#define zend_vec_set_8x16_from_64x2(x0, x1) _mm_set_epi64(x0, x1)
-#define zend_vec_load_8x16(x) _mm_load_si128((const __m128i *) (x))
-#define zend_vec_loadu_8x16(x) _mm_loadu_si128((const __m128i *) (x))
-#define zend_vec_store_8x16(to, x) _mm_store_si128((__m128i *) (to), x)
-#define zend_vec_storeu_8x16(to, x) _mm_storeu_si128((__m128i *) (to), x)
-
-#define zend_vec_or_8x16(a, b) _mm_or_si128(a, b)
-#define zend_vec_xor_8x16(a, b) _mm_xor_si128(a, b)
-#define zend_vec_and_8x16(a, b) _mm_and_si128(a, b)
-#define zend_vec_rshift_128_from_8x16(x, bytes) _mm_srli_si128(x, bytes)
-#define zend_vec_lshift_128_from_8x16(x, bytes) _mm_slli_si128(x, bytes)
-
-#define zend_vec_add_8x16(a, b) _mm_add_epi8(a, b)
-
-#define zend_vec_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b)
-#define zend_vec_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b)
-#define zend_vec_cmpgt_8x16(a, b) _mm_cmpgt_epi8(a, b)
-
-#define zend_vec_movemask_8x16(x) _mm_movemask_epi8(x)
-
 
 #elif defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
 #define ZEND_HAVE_VECTOR_128
 
-typedef int8x16_t zend_vec_8x16_t;
-typedef int16x8_t zend_vec_16x8_t;
-typedef int32x4_t zend_vec_32x4_t;
-typedef int64x2_t zend_vec_64x2_t;
+typedef int8x16_t __m128i;
 
-#define zend_vec_setzero_8x16() vdupq_n_s8(0)
-#define zend_vec_set_8x16(x) vdupq_n_s8(x)
-#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) \
+#define _mm_setzero_si128() vdupq_n_s8(0)
+#define _mm_set1_epi8(x) vdupq_n_s8(x)
+#define _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \
 	vreinterpretq_s8_s16((int16x8_t) { \
 		(int16_t) (x7), (int16_t) (x6), (int16_t) (x5), (int16_t) (x4), \
 		(int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) })
-#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) \
+#define _mm_set_epi32(x0, x1, x2, x3) \
 	vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) })
-#define zend_vec_set_8x16_from_64x2(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) })
-#define zend_vec_load_8x16(x) vld1q_s8((const int8_t *) (x))
-#define zend_vec_loadu_8x16(x) zend_vec_load_8x16(x)
-#define zend_vec_store_8x16(to, x) vst1q_s8((int8_t *) (to), x)
-#define zend_vec_storeu_8x16(to, x) zend_vec_store_8x16(to, x)
+#define _mm_set_epi64(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) })
+#define _mm_load_si128(x) vld1q_s8((const int8_t *) (x))
+#define _mm_loadu_si128(x) _mm_load_si128(x)
+#define _mm_store_si128(to, x) vst1q_s8((int8_t *) (to), x)
+#define _mm_storeu_si128(to, x) _mm_store_si128(to, x)
 
-#define zend_vec_or_8x16(a, b) vorrq_s8(a, b)
-#define zend_vec_xor_8x16(a, b) veorq_s8(a, b)
-#define zend_vec_and_8x16(a, b) vandq_s8(a, b)
-#define zend_vec_rshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes))
-#define zend_vec_lshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes))
+#define _mm_or_si128(a, b) vorrq_s8(a, b)
+#define _mm_xor_si128(a, b) veorq_s8(a, b)
+#define _mm_and_si128(a, b) vandq_s8(a, b)
+#define _mm_srli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes))
+#define _mm_slli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes))
 
-#define zend_vec_add_8x16(a, b) vaddq_s8(a, b)
+#define _mm_add_epi8(a, b) vaddq_s8(a, b)
 
-#define zend_vec_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b)))
-#define zend_vec_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))
-#define zend_vec_cmpgt_8x16(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b)))
+#define _mm_cmpeq_epi8(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b)))
+#define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))
+#define _mm_cmpgt_epi8(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b)))
 
-static zend_always_inline int zend_vec_movemask_8x16(int8x16_t x)
+static zend_always_inline int _mm_movemask_epi8(int8x16_t x)
 {
 	/**
 	 * based on code from
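The hunk ends inside _mm_movemask_epi8: only the signature is renamed here, and the body lies in the unchanged context beyond the diff. For illustration only, a widely used NEON emulation of the SSE2 byte movemask (similar to the one in sse2neon, and not necessarily the exact body this header uses) folds the per-byte sign bits into a 16-bit integer like this; the function name below is made up:

#include <arm_neon.h>

static inline int neon_movemask_epi8(int8x16_t x)
{
	uint8x16_t input = vreinterpretq_u8_s8(x);
	/* Keep only the sign bit of each byte (bit 7 moves down to bit 0). */
	uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
	/* Each accumulate-shift step packs neighbouring bits together, doubling
	 * the number of mask bits held in the low byte of every lane. */
	uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
	uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
	uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
	/* Byte 0 now holds mask bits 0-7 and byte 8 holds mask bits 8-15. */
	return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
}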

ext/opcache/ZendAccelerator.c (+5 -5)
@@ -176,13 +176,13 @@ static void bzero_aligned(void *mem, size_t size)
 #elif defined(ZEND_HAVE_VECTOR_128)
 	char *p = (char*)mem;
 	char *end = p + size;
-	zend_vec_8x16_t xmm0 = zend_vec_setzero_8x16();
+	__m128i xmm0 = _mm_setzero_si128();
 
 	while (p < end) {
-		zend_vec_store_8x16(p, xmm0);
-		zend_vec_store_8x16((p+16), xmm0);
-		zend_vec_store_8x16((p+32), xmm0);
-		zend_vec_store_8x16((p+48), xmm0);
+		_mm_store_si128((__m128i*)p, xmm0);
+		_mm_store_si128((__m128i*)(p+16), xmm0);
+		_mm_store_si128((__m128i*)(p+32), xmm0);
+		_mm_store_si128((__m128i*)(p+48), xmm0);
 		p += 64;
 	}
 #else

ext/standard/url.c (+35 -35)
@@ -456,51 +456,51 @@ static zend_always_inline zend_string *php_url_encode_impl(const char *s, size_t
 
 #ifdef ZEND_HAVE_VECTOR_128
 	while (from + 16 < end) {
-		zend_vec_8x16_t mask;
+		__m128i mask;
 		uint32_t bits;
-		const zend_vec_8x16_t _A = zend_vec_set_8x16('A' - 1);
-		const zend_vec_8x16_t Z_ = zend_vec_set_8x16('Z' + 1);
-		const zend_vec_8x16_t _a = zend_vec_set_8x16('a' - 1);
-		const zend_vec_8x16_t z_ = zend_vec_set_8x16('z' + 1);
-		const zend_vec_8x16_t _zero = zend_vec_set_8x16('0' - 1);
-		const zend_vec_8x16_t nine_ = zend_vec_set_8x16('9' + 1);
-		const zend_vec_8x16_t dot = zend_vec_set_8x16('.');
-		const zend_vec_8x16_t minus = zend_vec_set_8x16('-');
-		const zend_vec_8x16_t under = zend_vec_set_8x16('_');
-
-		zend_vec_8x16_t in = zend_vec_loadu_8x16(from);
-
-		zend_vec_8x16_t gt = zend_vec_cmpgt_8x16(in, _A);
-		zend_vec_8x16_t lt = zend_vec_cmplt_8x16(in, Z_);
-		mask = zend_vec_and_8x16(lt, gt); /* upper */
-		gt = zend_vec_cmpgt_8x16(in, _a);
-		lt = zend_vec_cmplt_8x16(in, z_);
-		mask = zend_vec_or_8x16(mask, zend_vec_and_8x16(lt, gt)); /* lower */
-		gt = zend_vec_cmpgt_8x16(in, _zero);
-		lt = zend_vec_cmplt_8x16(in, nine_);
-		mask = zend_vec_or_8x16(mask, zend_vec_and_8x16(lt, gt)); /* number */
-		mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, dot));
-		mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, minus));
-		mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, under));
+		const __m128i _A = _mm_set1_epi8('A' - 1);
+		const __m128i Z_ = _mm_set1_epi8('Z' + 1);
+		const __m128i _a = _mm_set1_epi8('a' - 1);
+		const __m128i z_ = _mm_set1_epi8('z' + 1);
+		const __m128i _zero = _mm_set1_epi8('0' - 1);
+		const __m128i nine_ = _mm_set1_epi8('9' + 1);
+		const __m128i dot = _mm_set1_epi8('.');
+		const __m128i minus = _mm_set1_epi8('-');
+		const __m128i under = _mm_set1_epi8('_');
+
+		__m128i in = _mm_loadu_si128((__m128i *)from);
+
+		__m128i gt = _mm_cmpgt_epi8(in, _A);
+		__m128i lt = _mm_cmplt_epi8(in, Z_);
+		mask = _mm_and_si128(lt, gt); /* upper */
+		gt = _mm_cmpgt_epi8(in, _a);
+		lt = _mm_cmplt_epi8(in, z_);
+		mask = _mm_or_si128(mask, _mm_and_si128(lt, gt)); /* lower */
+		gt = _mm_cmpgt_epi8(in, _zero);
+		lt = _mm_cmplt_epi8(in, nine_);
+		mask = _mm_or_si128(mask, _mm_and_si128(lt, gt)); /* number */
+		mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, dot));
+		mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, minus));
+		mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, under));
 
 		if (!raw) {
-			const zend_vec_8x16_t blank = zend_vec_set_8x16(' ');
-			zend_vec_8x16_t eq = zend_vec_cmpeq_8x16(in, blank);
-			if (zend_vec_movemask_8x16(eq)) {
-				in = zend_vec_add_8x16(in, zend_vec_and_8x16(eq, zend_vec_set_8x16('+' - ' ')));
-				mask = zend_vec_or_8x16(mask, eq);
+			const __m128i blank = _mm_set1_epi8(' ');
+			__m128i eq = _mm_cmpeq_epi8(in, blank);
+			if (_mm_movemask_epi8(eq)) {
+				in = _mm_add_epi8(in, _mm_and_si128(eq, _mm_set1_epi8('+' - ' ')));
+				mask = _mm_or_si128(mask, eq);
 			}
 		}
 		if (raw) {
-			const zend_vec_8x16_t wavy = zend_vec_set_8x16('~');
-			mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, wavy));
+			const __m128i wavy = _mm_set1_epi8('~');
+			mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, wavy));
 		}
-		if (((bits = zend_vec_movemask_8x16(mask)) & 0xffff) == 0xffff) {
-			zend_vec_storeu_8x16(to, in);
+		if (((bits = _mm_movemask_epi8(mask)) & 0xffff) == 0xffff) {
+			_mm_storeu_si128((__m128i*)to, in);
 			to += 16;
 		} else {
 			unsigned char xmm[16];
-			zend_vec_storeu_8x16(xmm, in);
+			_mm_storeu_si128((__m128i*)xmm, in);
 			for (size_t i = 0; i < sizeof(xmm); i++) {
 				if ((bits & (0x1 << i))) {
 					*to++ = xmm[i];
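For reference, a scalar sketch of what the vector classification above computes (illustrative only; the helper names are invented, and the percent-encoding of rejected bytes happens in the scalar tail beyond this hunk):

#include <stdbool.h>
#include <stdint.h>

/* Mirrors the comparisons in the vector loop: alphanumerics, '.', '-', '_',
 * plus ' ' (emitted as '+') when !raw and '~' when raw, are copied verbatim. */
static bool url_byte_is_safe(unsigned char c, bool raw)
{
	if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
		return true;
	if (c == '.' || c == '-' || c == '_')
		return true;
	if (!raw && c == ' ')
		return true;
	if (raw && c == '~')
		return true;
	return false;
}

/* Builds the same 16-bit mask as _mm_movemask_epi8(mask) above:
 * bits == 0xffff means all 16 bytes can be stored as-is. */
static uint16_t classify16(const unsigned char *p, bool raw)
{
	uint16_t bits = 0;
	for (int i = 0; i < 16; i++) {
		if (url_byte_is_safe(p[i], raw)) {
			bits |= (uint16_t) (1u << i);
		}
	}
	return bits;
}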
