Skip to content

Simd json encode #120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 46 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
a3dd262
wip
nielsdos Feb 3, 2025
144b0e1
shift opt
nielsdos Feb 3, 2025
7d485a9
Get rid of acc
nielsdos Feb 3, 2025
58f30ff
SSE2 guard
nielsdos Feb 3, 2025
6a01058
use ascii
nielsdos Feb 3, 2025
55a0b0e
dynamic mask
nielsdos Feb 3, 2025
2a2008e
comment
nielsdos Feb 3, 2025
7c966a6
wip
nielsdos Feb 3, 2025
65f3b7e
wip
nielsdos Feb 3, 2025
124396a
potential solution
nielsdos Feb 3, 2025
1826161
remove some debug
nielsdos Feb 3, 2025
5a2c034
correct ifdefs, without resolver support
nielsdos Feb 3, 2025
326b982
Attempt to use standard bitset stuff
nielsdos Feb 3, 2025
db54e3f
preliminary resolver support (needs more work)
nielsdos Feb 3, 2025
8bcd6bb
fix native build
nielsdos Feb 3, 2025
d7f2562
let ci run without max_shift trick to compare perf
nielsdos Feb 4, 2025
2b11554
Revert "let ci run without max_shift trick to compare perf"
nielsdos Feb 4, 2025
ef72f33
Reduce overhead of worst case to 1.5x
nielsdos Feb 4, 2025
e3baa23
wip1
nielsdos Feb 4, 2025
3c8b68e
cheaper pos compute
nielsdos Feb 4, 2025
4d16463
no magic nrs
nielsdos Feb 4, 2025
27a89e0
simple heuristic
nielsdos Feb 4, 2025
b071dba
various small improvements
nielsdos Feb 5, 2025
2ae769e
save ci resources
nielsdos Feb 5, 2025
10bd63a
test with always inline
nielsdos Feb 5, 2025
5df25a4
tweak
nielsdos Feb 5, 2025
d5c5b9f
code layout trick (vtune dsb improvement)
nielsdos Feb 5, 2025
ceb8443
skip extra check
nielsdos Feb 5, 2025
81efe6b
tweak
nielsdos Feb 5, 2025
1d7109d
abstract away
nielsdos Feb 6, 2025
45e91f5
mark branch
nielsdos Feb 6, 2025
dfd6de0
split off
nielsdos Feb 6, 2025
ff4ef5b
cs
nielsdos Feb 6, 2025
57efb3a
fix mask on sse2 builds
nielsdos Feb 6, 2025
df0117e
test
nielsdos Feb 6, 2025
246b413
tweaks
nielsdos Feb 6, 2025
847497f
tweak
nielsdos Feb 6, 2025
901a957
flag
nielsdos Feb 6, 2025
8947f09
tighter code layout
nielsdos Feb 6, 2025
4c41ad3
Remove check
nielsdos Feb 6, 2025
40cd08f
Tweak
nielsdos Feb 6, 2025
dfb690d
Code layout and comment tweak
nielsdos Feb 6, 2025
bd6e462
test with indirect function ptr
nielsdos Feb 6, 2025
8d5a381
Revert "test with indirect function ptr"
nielsdos Feb 6, 2025
d4297de
code layout
nielsdos Feb 6, 2025
bc48fb8
wip
nielsdos Feb 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
@@ -137,7 +137,7 @@ jobs:
if: ${{ !matrix.asan }}
uses: ./.github/actions/verify-generated-files
LINUX_X32:
if: github.repository == 'php/php-src' || github.event_name == 'pull_request'
if: false
name: LINUX_X32_DEBUG_ZTS
runs-on: ubuntu-latest
timeout-minutes: 50
@@ -193,7 +193,7 @@ jobs:
-d zend_extension=opcache.so
-d opcache.enable_cli=1
MACOS_DEBUG_NTS:
if: github.repository == 'php/php-src' || github.event_name == 'pull_request'
if: false
strategy:
fail-fast: false
matrix:
@@ -234,7 +234,7 @@ jobs:
- name: Verify generated files are up to date
uses: ./.github/actions/verify-generated-files
WINDOWS:
if: github.repository == 'php/php-src' || github.event_name == 'pull_request'
if: false
name: WINDOWS_X64_ZTS
runs-on: windows-2022
timeout-minutes: 50
404 changes: 315 additions & 89 deletions ext/json/json_encoder.c
Original file line number Diff line number Diff line change
@@ -29,6 +29,18 @@
#include "zend_enum.h"
#include "zend_property_hooks.h"
#include "zend_lazy_objects.h"
#include "zend_bitset.h"

#if defined(ZEND_INTRIN_SSE4_2_NATIVE) || defined(ZEND_INTRIN_SSE4_2_FUNC_PROTO)
# include <nmmintrin.h>
#endif
#ifdef ZEND_INTRIN_SSE4_2_FUNC_PROTO
# include "zend_cpuinfo.h"
#endif

#ifdef __SSE2__
# define JSON_USE_SIMD
#endif

static const char digits[] = "0123456789abcdef";

@@ -366,6 +378,242 @@ static zend_result php_json_encode_array(smart_str *buf, zval *val, int options,
}
/* }}} */

/* Specialization of smart_str_appendl() to avoid performance loss due to code bloat.
 *
 * Appends the `len` bytes starting at `src` to `dest`.
 * Precondition: `dest` already owns an allocated string (dest->s != NULL);
 * this is asserted here instead of being handled. */
static zend_always_inline void php_json_append(smart_str *dest, const char *src, size_t len)
{
/* dest has a minimum size of the input length,
* this avoids generating initial allocation code */
ZEND_ASSERT(dest->s);

smart_str_appendl(dest, src, len);
}

/* Writes the JSON representation of one printable ASCII byte that is a
 * candidate for escaping: " \ / < > & '
 *
 * buf     - output buffer; must already be allocated (asserted).
 * us      - the byte to emit.
 * options - PHP_JSON_* flags choosing between the escaped and the verbatim form.
 *
 * Returns true when `us` was one of the bytes above and something was
 * appended to `buf` (escaped or verbatim, depending on `options`).
 * Returns false, appending nothing, for every other byte. */
static zend_always_inline bool php_json_printable_ascii_escape(smart_str *buf, unsigned char us, int options)
{
	ZEND_ASSERT(buf->s);

	switch (us) {
		/* Always escaped; only the form of the quote escape is configurable. */
		case '\\':
			php_json_append(buf, "\\\\", 2);
			return true;

		case '"':
			if (!(options & PHP_JSON_HEX_QUOT)) {
				php_json_append(buf, "\\\"", 2);
				return true;
			}
			php_json_append(buf, "\\u0022", 6);
			return true;

		/* Escaped unless the caller asked for unescaped slashes. */
		case '/':
			if (!(options & PHP_JSON_UNESCAPED_SLASHES)) {
				php_json_append(buf, "\\/", 2);
				return true;
			}
			smart_str_appendc(buf, '/');
			return true;

		/* Emitted verbatim unless the matching PHP_JSON_HEX_* flag is set. */
		case '<':
			if (!(options & PHP_JSON_HEX_TAG)) {
				smart_str_appendc(buf, '<');
				return true;
			}
			php_json_append(buf, "\\u003C", 6);
			return true;

		case '>':
			if (!(options & PHP_JSON_HEX_TAG)) {
				smart_str_appendc(buf, '>');
				return true;
			}
			php_json_append(buf, "\\u003E", 6);
			return true;

		case '&':
			if (!(options & PHP_JSON_HEX_AMP)) {
				smart_str_appendc(buf, '&');
				return true;
			}
			php_json_append(buf, "\\u0026", 6);
			return true;

		case '\'':
			if (!(options & PHP_JSON_HEX_APOS)) {
				smart_str_appendc(buf, '\'');
				return true;
			}
			php_json_append(buf, "\\u0027", 6);
			return true;

		default:
			/* Not a byte this helper knows about; the caller deals with it. */
			return false;
	}
}

#ifdef JSON_USE_SIMD
/* SSE2 fallback: returns a 16-bit mask with bit i set when byte i of `input`
 * is one of  " & ' / < > \  .
 *
 * `mask` is accepted only so that the signature matches the SSE4.2 variant;
 * it is ignored, so all seven bytes are always flagged regardless of the
 * encoder options. */
static zend_always_inline int php_json_sse2_compute_escape_intersection(const __m128i mask, const __m128i input)
{
	(void) mask;

	/* One byte-wise equality compare per candidate character. */
	const __m128i is_quote     = _mm_cmpeq_epi8(input, _mm_set1_epi8('"'));
	const __m128i is_amp       = _mm_cmpeq_epi8(input, _mm_set1_epi8('&'));
	const __m128i is_apos      = _mm_cmpeq_epi8(input, _mm_set1_epi8('\''));
	const __m128i is_slash     = _mm_cmpeq_epi8(input, _mm_set1_epi8('/'));
	const __m128i is_lt        = _mm_cmpeq_epi8(input, _mm_set1_epi8('<'));
	const __m128i is_gt        = _mm_cmpeq_epi8(input, _mm_set1_epi8('>'));
	const __m128i is_backslash = _mm_cmpeq_epi8(input, _mm_set1_epi8('\\'));

	/* Combine pairwise (a tree rather than a chain) into two halves... */
	const __m128i first_four = _mm_or_si128(
		_mm_or_si128(is_quote, is_amp),
		_mm_or_si128(is_apos, is_slash));
	const __m128i last_three = _mm_or_si128(
		_mm_or_si128(is_lt, is_gt),
		is_backslash);

	/* ...then collapse the per-byte 0x00/0xFF lanes into one bit per byte. */
	return _mm_movemask_epi8(_mm_or_si128(first_four, last_three));
}
#endif

#if defined(ZEND_INTRIN_SSE4_2_NATIVE) || defined(ZEND_INTRIN_SSE4_2_FUNC_PROTO)
/* Lookup table of the printable ASCII bytes that have to be escaped for a
 * given combination of encoder options. Each innermost row is a 16-byte
 * character set, padded with 0, that php_json_create_sse_escape_mask() loads
 * into a vector and that is then used as the set operand of _mm_cmpistrm().
 *
 * First index:  0 when PHP_JSON_UNESCAPED_SLASHES is not set (rows contain '/'),
 *               1 when it is set (rows do not contain '/').
 * Second index: options & (PHP_JSON_HEX_AMP|PHP_JSON_HEX_APOS|PHP_JSON_HEX_TAG).
 *               NOTE(review): the dimension of 8 assumes these three flags OR
 *               together to at most 7 - confirm against their definitions.
 *
 * '"' and '\\' appear in every row; '&', '\'' and '<' '>' appear only when
 * the corresponding HEX flag is part of the row's index. */
static const char php_json_escape_noslashes_lut[2][8][16] = {
/* !PHP_JSON_UNESCAPED_SLASHES */
{
[0] = {'"', '\\', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP] = {'"', '\\', '&', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_APOS] = {'"', '\\', '\'', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP|PHP_JSON_HEX_APOS] = {'"', '\\', '&', '\'', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_TAG] = {'"', '\\', '<', '>', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP|PHP_JSON_HEX_TAG] = {'"', '\\', '&', '<', '>', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_APOS|PHP_JSON_HEX_TAG] = {'"', '\\', '\'', '<', '>', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP|PHP_JSON_HEX_APOS|PHP_JSON_HEX_TAG] = {'"', '\\', '&', '\'', '<', '>', '/', 0, 0, 0, 0, 0, 0, 0, 0, 0}
},

/* PHP_JSON_UNESCAPED_SLASHES */
{
[0] = {'"', '\\', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP] = {'"', '\\', '&', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_APOS] = {'"', '\\', '\'', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP|PHP_JSON_HEX_APOS] = {'"', '\\', '&', '\'', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_TAG] = {'"', '\\', '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP|PHP_JSON_HEX_TAG] = {'"', '\\', '&', '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_APOS|PHP_JSON_HEX_TAG] = {'"', '\\', '\'', '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
[PHP_JSON_HEX_AMP|PHP_JSON_HEX_APOS|PHP_JSON_HEX_TAG] = {'"', '\\', '&', '\'', '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
}
};

/* Builds the 16-byte character set used by the SSE4.2 escape scan.
 *
 * options - PHP_JSON_* encoder flags. Only PHP_JSON_UNESCAPED_SLASHES,
 *           PHP_JSON_HEX_AMP, PHP_JSON_HEX_APOS and PHP_JSON_HEX_TAG are
 *           looked at; they select one row of php_json_escape_noslashes_lut.
 *
 * Returns that row loaded into a vector. */
static zend_always_inline __m128i php_json_create_sse_escape_mask(int options)
{
	const int slashes = (options & PHP_JSON_UNESCAPED_SLASHES) ? 1 : 0;
	const int masked = options & (PHP_JSON_HEX_AMP|PHP_JSON_HEX_APOS|PHP_JSON_HEX_TAG);

	/* The table is a plain char array, so nothing guarantees that its rows
	 * are 16-byte aligned. Dereferencing a `const __m128i *` would be an
	 * aligned load (undefined behaviour / possible fault on a misaligned
	 * row), hence the explicit unaligned load. */
	return _mm_loadu_si128((const __m128i *) php_json_escape_noslashes_lut[slashes][masked]);
}

/* SSE4.2 variant: returns a bit mask with bit i set when byte i of `input`
 * equals any byte of the character set in `mask`.
 *
 * _mm_cmpistrm() treats both operands as implicit-length (NUL-terminated)
 * strings, so the zero padding of `mask` ends the set, and bytes of `input`
 * at or after a NUL byte are not reported. */
ZEND_INTRIN_SSE4_2_FUNC_DECL(int php_json_sse42_compute_escape_intersection_real(const __m128i mask, const __m128i input));
zend_always_inline int php_json_sse42_compute_escape_intersection_real(const __m128i mask, const __m128i input)
{
	/* "equal any" comparison producing one result bit per input byte in the
	 * low bits of the returned vector, which are then moved to an int. */
	return _mm_cvtsi128_si32(
		_mm_cmpistrm(mask, input, _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK));
}
#endif

#if defined(JSON_USE_SIMD) && defined(ZEND_INTRIN_SSE4_2_FUNC_PROTO)
/* Common signature of the SSE2 and SSE4.2 escape-scan implementations. */
typedef int (*php_json_compute_escape_intersection_t)(const __m128i mask, const __m128i input);

/* Indirect function: calls to this symbol are bound to whichever
 * implementation php_json_resolve_escape_intersection() returns. */
static int php_json_sse42_compute_escape_intersection(const __m128i mask, const __m128i input) __attribute__((ifunc("php_json_resolve_escape_intersection")));

/* ifunc resolver: picks the SSE4.2 implementation when the CPU reports
 * support for it and the SSE2 implementation otherwise. */
ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
static php_json_compute_escape_intersection_t php_json_resolve_escape_intersection(void) {
	return zend_cpu_supports_sse42()
		? php_json_sse42_compute_escape_intersection_real
		: php_json_sse2_compute_escape_intersection;
}
#endif

/* Outcome of php_json_process_simd_block(); tells the caller how to go on. */
typedef enum php_json_simd_result {
/* The SIMD loop consumed all remaining input (*len reached 0). */
PHP_JSON_STOP,
/* Input remains and the byte at the current position has to be handled by
 * the caller's byte-at-a-time code (fewer than 16 bytes left, or a byte
 * that the SIMD loop does not handle was found inside the block). */
PHP_JSON_SLOW,
/* The very first byte of the current 16-byte block compares below 32 as a
 * signed byte, i.e. it is a control character or a byte >= 0x80. */
PHP_JSON_NON_ASCII,
} php_json_simd_result;

#ifdef JSON_USE_SIMD
/* Scans the input 16 bytes at a time and copies/escapes as much printable
 * ASCII as possible, stopping at the first byte it cannot handle itself.
 *
 * buf             - output buffer; must already be allocated.
 * sse_escape_mask - character set for the SSE4.2 scan; the SSE2 fallback
 *                   is called with a zero vector instead and ignores it.
 * s               - in/out: start of the run of bytes not yet appended to buf.
 * pos             - in/out: number of bytes at *s already known to need no
 *                   escaping but not yet appended (the "pending run").
 * len             - in/out: number of input bytes left, counted from *s + *pos.
 * options         - PHP_JSON_* flags, forwarded to the per-byte escaper.
 *
 * Returns PHP_JSON_STOP when the input is exhausted, PHP_JSON_NON_ASCII when
 * the first byte of a block is a control character or >= 0x80, and
 * PHP_JSON_SLOW otherwise; in the latter two cases the byte at (*s)[*pos]
 * is the next one the caller has to handle. */
static zend_always_inline php_json_simd_result php_json_process_simd_block(
smart_str *buf,
const __m128i sse_escape_mask,
const char **restrict s,
size_t *restrict pos,
size_t *restrict len,
int options
)
{
/* Only full 16-byte blocks are processed; any tail is left to the caller. */
while (*len >= sizeof(__m128i)) {
const __m128i input = _mm_loadu_si128((const __m128i *) (*s + *pos));
/* signed compare, so checks for unsigned bytes >= 0x80 as well */
const __m128i input_range = _mm_cmplt_epi8(input, _mm_set1_epi8(32));

/* Number of leading bytes of this block that are printable ASCII;
 * stays at the full block size unless an out-of-range byte is found. */
int max_shift = sizeof(__m128i);

int input_range_mask = _mm_movemask_epi8(input_range);
if (input_range_mask != 0) {
if (UNEXPECTED(input_range_mask & 1)) {
/* not worth it */
return PHP_JSON_NON_ASCII;
}
/* Index of the first control / non-ASCII byte inside the block. */
max_shift = zend_ulong_ntz(input_range_mask);
}

/* One bit per input byte that is an escape candidate. Which implementation
 * runs is decided at compile time (native SSE4.2 or plain SSE2) or through
 * the ifunc-resolved symbol. */
#ifdef ZEND_INTRIN_SSE4_2_NATIVE
int mask = php_json_sse42_compute_escape_intersection_real(sse_escape_mask, input);
#elif defined(ZEND_INTRIN_SSE4_2_FUNC_PROTO)
int mask = php_json_sse42_compute_escape_intersection(sse_escape_mask, input);
#else
int mask = php_json_sse2_compute_escape_intersection(_mm_setzero_si128(), input);
#endif
if (mask != 0) {
if (UNEXPECTED(max_shift < sizeof(__m128i))) {
/* The block contains both an escape candidate and an out-of-range
 * byte: extend the pending run up to whichever comes first and let
 * the caller continue from there. */
int shift = zend_ulong_ntz(mask); /* first offending character */
*pos += MIN(max_shift, shift);
*len -= MIN(max_shift, shift);
return PHP_JSON_SLOW;
}

/* Flush the pending run that precedes this block; *s then points at the
 * first byte of the block. */
php_json_append(buf, *s, *pos);
*s += *pos;
const char *s_backup = *s;

/* It's more important to keep this loop tight than to optimize this with
* a trailing zero count. */
/* Walks the block byte by byte up to and including the last flagged byte. */
for (; mask; mask >>= 1, *s += 1) {
if (UNEXPECTED(mask & 1)) {
bool handled = php_json_printable_ascii_escape(buf, (*s)[0], options);
ZEND_ASSERT(handled);
} else {
ZEND_ASSERT(buf->s);
smart_str_appendc(buf, (*s)[0]);
}
}

/* The bytes of the block after the last flagged one need no escaping;
 * they become the new pending run. */
*pos = sizeof(__m128i) - (*s - s_backup);
} else {
if (max_shift < sizeof(__m128i)) {
/* No escape candidates, but an out-of-range byte: keep the clean
 * prefix pending and hand the offending byte to the caller. */
*pos += max_shift;
*len -= max_shift;
return PHP_JSON_SLOW;
}
/* Entire block is clean: just grow the pending run. */
*pos += sizeof(__m128i);
}

*len -= sizeof(__m128i);
}

return UNEXPECTED(!*len) ? PHP_JSON_STOP : PHP_JSON_SLOW;
}
#endif

zend_result php_json_escape_string(
smart_str *buf, const char *s, size_t len,
int options, php_json_encoder *encoder) /* {{{ */
@@ -395,54 +643,71 @@ zend_result php_json_escape_string(
}

}
checkpoint = buf->s ? ZSTR_LEN(buf->s) : 0;

/* pre-allocate for string length plus 2 quotes */
smart_str_alloc(buf, len+2, 0);
checkpoint = ZSTR_LEN(buf->s);
smart_str_appendc(buf, '"');

pos = 0;

#ifdef JSON_USE_SIMD
# if defined(ZEND_INTRIN_SSE4_2_NATIVE) || defined(ZEND_INTRIN_SSE4_2_FUNC_PROTO)
const __m128i sse_escape_mask = php_json_create_sse_escape_mask(options);
# else
const __m128i sse_escape_mask = _mm_setzero_si128();
# endif
#endif

do {
static const uint32_t charmap[8] = {
0xffffffff, 0x500080c4, 0x10000000, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};

php_json_simd_result result = PHP_JSON_SLOW;
#ifdef JSON_USE_SIMD
// TODO: html.c change (incl UNEXPECTED) & maybe manually inline this again?
result = php_json_process_simd_block(buf, sse_escape_mask, &s, &pos, &len, options);
if (UNEXPECTED(result == PHP_JSON_STOP)) {
break;
}
#endif

us = (unsigned char)s[pos];
if (EXPECTED(!ZEND_BIT_TEST(charmap, us))) {
if (EXPECTED(result != PHP_JSON_NON_ASCII && !ZEND_BIT_TEST(charmap, us))) {
pos++;
len--;
if (len == 0) {
smart_str_appendl(buf, s, pos);
break;
}
} else {
if (pos) {
smart_str_appendl(buf, s, pos);
s += pos;
pos = 0;
}
us = (unsigned char)s[0];
if (UNEXPECTED(us >= 0x80)) {
zend_result status;
us = php_next_utf8_char((unsigned char *)s, len, &pos, &status);
size_t pos_old = pos;
const char *cur = s + pos;
pos = 0;
us = php_next_utf8_char_ex((unsigned char *)cur, us, len, &pos);
len -= pos;
pos += pos_old;

/* check whether UTF8 character is correct */
if (UNEXPECTED(status != SUCCESS)) {
if (UNEXPECTED(!us)) {
if (pos_old && (options & (PHP_JSON_INVALID_UTF8_IGNORE|PHP_JSON_INVALID_UTF8_SUBSTITUTE))) {
php_json_append(buf, s, pos_old);
}
s += pos;
pos = 0;

if (options & PHP_JSON_INVALID_UTF8_IGNORE) {
/* ignore invalid UTF8 character */
} else if (options & PHP_JSON_INVALID_UTF8_SUBSTITUTE) {
/* Use Unicode character 'REPLACEMENT CHARACTER' (U+FFFD) */
if (options & PHP_JSON_UNESCAPED_UNICODE) {
smart_str_appendl(buf, "\xef\xbf\xbd", 3);
php_json_append(buf, "\xef\xbf\xbd", 3);
} else {
smart_str_appendl(buf, "\\ufffd", 6);
php_json_append(buf, "\\ufffd", 6);
}
} else {
ZSTR_LEN(buf->s) = checkpoint;
encoder->error_code = PHP_JSON_ERROR_UTF8;
if (options & PHP_JSON_PARTIAL_OUTPUT_ON_ERROR) {
smart_str_appendl(buf, "null", 4);
php_json_append(buf, "null", 4);
}
return FAILURE;
}
@@ -453,126 +718,87 @@ zend_result php_json_escape_string(
} else if ((options & PHP_JSON_UNESCAPED_UNICODE)
&& ((options & PHP_JSON_UNESCAPED_LINE_TERMINATORS)
|| us < 0x2028 || us > 0x2029)) {
smart_str_appendl(buf, s, pos);
/* No need to emit any bytes, just move the cursor. */
} else {
php_json_append(buf, s, pos_old);
s += pos;
pos = 0;

ZEND_ASSERT(buf->s);

/* From http://en.wikipedia.org/wiki/UTF16 */
dst = smart_str_extend(buf, 6 + ((us >= 0x10000) ? 6 : 0));
if (us >= 0x10000) {
unsigned int next_us;

us -= 0x10000;
next_us = (unsigned short)((us & 0x3ff) | 0xdc00);
us = (unsigned short)((us >> 10) | 0xd800);
dst = smart_str_extend(buf, 6);
dst[0] = '\\';
dst[1] = 'u';
dst[2] = digits[(us >> 12) & 0xf];
dst[3] = digits[(us >> 8) & 0xf];
dst[4] = digits[(us >> 4) & 0xf];
dst[5] = digits[us & 0xf];
us = next_us;
dst += 6;
}
dst = smart_str_extend(buf, 6);
dst[0] = '\\';
dst[1] = 'u';
dst[2] = digits[(us >> 12) & 0xf];
dst[3] = digits[(us >> 8) & 0xf];
dst[4] = digits[(us >> 4) & 0xf];
dst[5] = digits[us & 0xf];
}
s += pos;
len -= pos;
pos = 0;
} else {
if (pos) {
php_json_append(buf, s, pos);
s += pos;
pos = 0;
}
s++;
switch (us) {
case '"':
if (options & PHP_JSON_HEX_QUOT) {
smart_str_appendl(buf, "\\u0022", 6);
} else {
smart_str_appendl(buf, "\\\"", 2);
}
break;

case '\\':
smart_str_appendl(buf, "\\\\", 2);
break;

case '/':
if (options & PHP_JSON_UNESCAPED_SLASHES) {
smart_str_appendc(buf, '/');
} else {
smart_str_appendl(buf, "\\/", 2);
}
break;

case '\b':
smart_str_appendl(buf, "\\b", 2);
php_json_append(buf, "\\b", 2);
break;

case '\f':
smart_str_appendl(buf, "\\f", 2);
php_json_append(buf, "\\f", 2);
break;

case '\n':
smart_str_appendl(buf, "\\n", 2);
php_json_append(buf, "\\n", 2);
break;

case '\r':
smart_str_appendl(buf, "\\r", 2);
php_json_append(buf, "\\r", 2);
break;

case '\t':
smart_str_appendl(buf, "\\t", 2);
break;

case '<':
if (options & PHP_JSON_HEX_TAG) {
smart_str_appendl(buf, "\\u003C", 6);
} else {
smart_str_appendc(buf, '<');
}
break;

case '>':
if (options & PHP_JSON_HEX_TAG) {
smart_str_appendl(buf, "\\u003E", 6);
} else {
smart_str_appendc(buf, '>');
}
break;

case '&':
if (options & PHP_JSON_HEX_AMP) {
smart_str_appendl(buf, "\\u0026", 6);
} else {
smart_str_appendc(buf, '&');
}
break;

case '\'':
if (options & PHP_JSON_HEX_APOS) {
smart_str_appendl(buf, "\\u0027", 6);
} else {
smart_str_appendc(buf, '\'');
}
php_json_append(buf, "\\t", 2);
break;

default:
ZEND_ASSERT(us < ' ');
dst = smart_str_extend(buf, 6);
dst[0] = '\\';
dst[1] = 'u';
dst[2] = '0';
dst[3] = '0';
dst[4] = digits[(us >> 4) & 0xf];
dst[5] = digits[us & 0xf];
if (!php_json_printable_ascii_escape(buf, us, options)) {
ZEND_ASSERT(us < ' ');
dst = smart_str_extend(buf, 6);
dst[0] = '\\';
dst[1] = 'u';
dst[2] = '0';
dst[3] = '0';
dst[4] = digits[(us >> 4) & 0xf];
dst[5] = digits[us & 0xf];
}
break;
}
len--;
}
}
} while (len);

php_json_append(buf, s, pos);

ZEND_ASSERT(buf->s);
smart_str_appendc(buf, '"');

return SUCCESS;
152 changes: 90 additions & 62 deletions ext/standard/html.c
Original file line number Diff line number Diff line change
@@ -53,12 +53,16 @@
(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
} while (0)

#define MB_FAILURE(pos, advance) do { \
#define MB_FAILURE_NO_STATUS(pos, advance) do { \
*cursor = pos + (advance); \
*status = FAILURE; \
return 0; \
} while (0)

#define MB_FAILURE(pos, advance) do { \
*status = FAILURE; \
MB_FAILURE_NO_STATUS(pos, advance); \
} while (0)

#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))

/* valid as single byte character or leading byte */
@@ -85,6 +89,85 @@ static char *get_default_charset(void) {
}
/* }}} */

/* Decodes one multi-byte UTF-8 sequence whose lead byte has already been read.
 *
 * str     - input bytes; only the bytes after position *cursor are read here.
 * c       - the lead byte of the sequence; must be >= 0x80 (asserted).
 * str_len - total length of `str`.
 * cursor  - in/out: position of the lead byte on entry; on return it is
 *           advanced past the decoded sequence, or past the bytes reported
 *           as invalid.
 *
 * Returns the decoded code point, or 0 when the sequence is invalid
 * (truncated, bad continuation byte, non-shortest form, surrogate, or out
 * of range). */
PHPAPI unsigned int php_next_utf8_char_ex(
		const unsigned char *str,
		unsigned char c,
		size_t str_len,
		size_t *cursor)
{
	const size_t start = *cursor;
	const size_t remaining = str_len - start;
	unsigned int code_point;

	/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
	 * "In a reported illegal byte sequence, do not include any
	 * non-initial byte that encodes a valid character or is a leading
	 * byte for a valid sequence." */

	ZEND_ASSERT(c >= 0x80);

	/* Bytes below 0xc2 and above 0xf4 can never start a sequence. */
	if (UNEXPECTED(c < 0xc2 || c >= 0xf5)) {
		MB_FAILURE_NO_STATUS(start, 1);
	}

	/* Two-byte sequence: lead byte 0xc2..0xdf. */
	if (c < 0xe0) {
		if (UNEXPECTED(remaining < 2)) {
			MB_FAILURE_NO_STATUS(start, 1);
		}
		if (UNEXPECTED(!utf8_trail(str[start + 1]))) {
			MB_FAILURE_NO_STATUS(start, utf8_lead(str[start + 1]) ? 1 : 2);
		}

		code_point = ((c & 0x1f) << 6) | (str[start + 1] & 0x3f);
		if (UNEXPECTED(code_point < 0x80)) { /* non-shortest form */
			MB_FAILURE_NO_STATUS(start, 2);
		}

		*cursor = start + 2;
		return code_point;
	}

	/* Three-byte sequence: lead byte 0xe0..0xef. */
	if (c < 0xf0) {
		if (UNEXPECTED(remaining < 3
				|| !utf8_trail(str[start + 1]) || !utf8_trail(str[start + 2]))) {
			/* Skip only up to the first byte that could start a new character. */
			if (remaining < 2 || utf8_lead(str[start + 1])) {
				MB_FAILURE_NO_STATUS(start, 1);
			}
			if (remaining < 3 || utf8_lead(str[start + 2])) {
				MB_FAILURE_NO_STATUS(start, 2);
			}
			MB_FAILURE_NO_STATUS(start, 3);
		}

		code_point = ((c & 0x0f) << 12) | ((str[start + 1] & 0x3f) << 6) | (str[start + 2] & 0x3f);
		if (UNEXPECTED(code_point < 0x800)) { /* non-shortest form */
			MB_FAILURE_NO_STATUS(start, 3);
		}
		if (UNEXPECTED(code_point >= 0xd800 && code_point <= 0xdfff)) { /* surrogate */
			MB_FAILURE_NO_STATUS(start, 3);
		}

		*cursor = start + 3;
		return code_point;
	}

	/* Four-byte sequence: lead byte 0xf0..0xf4. */
	if (UNEXPECTED(remaining < 4
			|| !utf8_trail(str[start + 1]) || !utf8_trail(str[start + 2])
			|| !utf8_trail(str[start + 3]))) {
		if (remaining < 2 || utf8_lead(str[start + 1])) {
			MB_FAILURE_NO_STATUS(start, 1);
		}
		if (remaining < 3 || utf8_lead(str[start + 2])) {
			MB_FAILURE_NO_STATUS(start, 2);
		}
		if (remaining < 4 || utf8_lead(str[start + 3])) {
			MB_FAILURE_NO_STATUS(start, 3);
		}
		MB_FAILURE_NO_STATUS(start, 4);
	}

	code_point = ((c & 0x07) << 18) | ((str[start + 1] & 0x3f) << 12) | ((str[start + 2] & 0x3f) << 6) | (str[start + 3] & 0x3f);
	if (UNEXPECTED(code_point < 0x10000 || code_point > 0x10FFFF)) { /* non-shortest form or outside range */
		MB_FAILURE_NO_STATUS(start, 4);
	}

	*cursor = start + 4;
	return code_point;
}

/* {{{ get_next_char */
static inline unsigned int get_next_char(
enum entity_charset charset,
@@ -105,72 +188,17 @@ static inline unsigned int get_next_char(
switch (charset) {
case cs_utf_8:
{
/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
* "In a reported illegal byte sequence, do not include any
* non-initial byte that encodes a valid character or is a leading
* byte for a valid sequence." */
unsigned char c;
c = str[pos];
if (c < 0x80) {
this_char = c;
pos++;
} else if (c < 0xc2) {
MB_FAILURE(pos, 1);
} else if (c < 0xe0) {
if (!CHECK_LEN(pos, 2))
MB_FAILURE(pos, 1);

if (!utf8_trail(str[pos + 1])) {
MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
}
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
if (this_char < 0x80) { /* non-shortest form */
MB_FAILURE(pos, 2);
}
pos += 2;
} else if (c < 0xf0) {
size_t avail = str_len - pos;

if (avail < 3 ||
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
if (avail < 2 || utf8_lead(str[pos + 1]))
MB_FAILURE(pos, 1);
else if (avail < 3 || utf8_lead(str[pos + 2]))
MB_FAILURE(pos, 2);
else
MB_FAILURE(pos, 3);
}

this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
if (this_char < 0x800) { /* non-shortest form */
MB_FAILURE(pos, 3);
} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
MB_FAILURE(pos, 3);
}
pos += 3;
} else if (c < 0xf5) {
size_t avail = str_len - pos;

if (avail < 4 ||
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
!utf8_trail(str[pos + 3])) {
if (avail < 2 || utf8_lead(str[pos + 1]))
MB_FAILURE(pos, 1);
else if (avail < 3 || utf8_lead(str[pos + 2]))
MB_FAILURE(pos, 2);
else if (avail < 4 || utf8_lead(str[pos + 3]))
MB_FAILURE(pos, 3);
else
MB_FAILURE(pos, 4);
}

this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
MB_FAILURE(pos, 4);
}
pos += 4;
} else {
MB_FAILURE(pos, 1);
this_char = php_next_utf8_char_ex(str, c, str_len, cursor);
if (UNEXPECTED(this_char == 0)) {
*status = FAILURE;
}
return this_char;
}
}
break;
1 change: 1 addition & 0 deletions ext/standard/html.h
Original file line number Diff line number Diff line change
@@ -48,5 +48,6 @@ PHPAPI zend_string *php_escape_html_entities(const unsigned char *old, size_t ol
PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t oldlen, int all, int flags, const char *hint_charset, bool double_encode, bool quiet);
PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int flags, const char *hint_charset);
PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, zend_result *status);
/* Decodes one multi-byte UTF-8 sequence. `c` is the already-read lead byte
 * (must be >= 0x80) located at str[*cursor]; the function itself only reads
 * the bytes that follow it. Returns the code point and advances *cursor past
 * the sequence, or returns 0 and advances *cursor past the invalid bytes. */
PHPAPI unsigned int php_next_utf8_char_ex(const unsigned char *str, unsigned char c, size_t str_len, size_t *cursor);

#endif /* HTML_H */