Skip to content

Commit 3182b2e

Browse files
committed
Refactor string format parsing
1 parent 50a41d4 commit 3182b2e

File tree

9 files changed

+200
-165
lines changed

9 files changed

+200
-165
lines changed

ext/standard/scanf.c

Lines changed: 148 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@
7575
#define SCAN_NOSKIP 0x1 /* Don't skip blanks. */
7676
#define SCAN_SUPPRESS 0x2 /* Suppress assignment. */
7777
#define SCAN_UNSIGNED 0x4 /* Read an unsigned value. */
78-
#define SCAN_WIDTH 0x8 /* A width value was supplied. */
7978

8079
#define SCAN_SIGNOK 0x10 /* A +/- character is allowed. */
8180
#define SCAN_NODIGITS 0x20 /* No digits have been scanned. */
@@ -299,21 +298,18 @@ static void ReleaseCharSet(CharSet *cset)
299298
*
300299
*----------------------------------------------------------------------
301300
*/
302-
static int ValidateFormat(const zend_string *zstr_format, uint32_t format_arg_num, uint32_t numVars, uint32_t *totalSubs)
301+
static bool isFormatStringValid(const zend_string *zstr_format, uint32_t format_arg_num, uint32_t numVars, uint32_t *totalSubs)
303302
{
304303
#define STATIC_LIST_SIZE 16
305-
int flags;
306304
bool gotXpg = false;
307305
bool gotSequential = false;
308-
const char *ch = NULL;
309306
uint32_t staticAssign[STATIC_LIST_SIZE];
310307
uint32_t *nassign = staticAssign;
311308
uint32_t objIndex = 0;
312309
uint32_t xpgSize = 0;
313310
uint32_t nspace = STATIC_LIST_SIZE;
314-
zend_ulong value;
311+
bool bindToVariables = numVars;
315312

316-
bool assignToVariables = numVars;
317313
/*
318314
* Initialize an array that records the number of times a variable
319315
* is assigned to by the format string. We use this to detect if
@@ -325,100 +321,129 @@ static int ValidateFormat(const zend_string *zstr_format, uint32_t format_arg_nu
325321
}
326322
memset(nassign, 0, sizeof(uint32_t)*numVars);
327323

328-
const char *format = ZSTR_VAL(zstr_format);
329-
while (*format != '\0') {
330-
ch = format++;
331-
flags = 0;
332-
333-
if (*ch != '%') {
324+
const char *end_ptr = ZSTR_VAL(zstr_format) + ZSTR_LEN(zstr_format);
325+
for (const char *ptr = ZSTR_VAL(zstr_format); ptr < end_ptr; ptr++) {
326+
bool isCurrentSpecifierBound = true;
327+
/* Look for specifier start */
328+
if (*ptr != '%') {
334329
continue;
335330
}
336-
ch = format++;
337-
if (*ch == '%') {
338-
continue;
331+
ptr++;
332+
333+
if (UNEXPECTED(ptr == end_ptr)) {
334+
zend_argument_value_error(format_arg_num, "unterminated format specifier");
335+
goto error;
339336
}
340-
if (*ch == '*') {
341-
flags |= SCAN_SUPPRESS;
342-
ch = format++;
343-
goto xpgCheckDone;
337+
338+
/* Literal % so continue */
339+
if (*ptr == '%') {
340+
continue;
341+
} else if (*ptr == '*') {
342+
/* Consumed specifier but not assigned to variable */
343+
isCurrentSpecifierBound = false;
344+
ptr++;
345+
if (UNEXPECTED(ptr == end_ptr)) {
346+
zend_argument_value_error(format_arg_num, "unterminated format specifier");
347+
goto error;
348+
}
344349
}
345350

346-
if ( isdigit( (int)*ch ) ) {
347-
/*
348-
* Check for an XPG3-style %n$ specification. Note: there
349-
* must not be a mixture of XPG3 specs and non-XPG3 specs
351+
if (isdigit(*ptr)) {
352+
/* We might either have an XPG3-style %n$ specification,
353+
* or we are parsing the _maximum field width_ of the specifier.
354+
*
355+
* Note: there must not be a mixture of XPG3 specs and non-XPG3 specs
350356
* in the same format string.
351357
*/
352-
char *end = NULL;
353-
value = ZEND_STRTOUL(format-1, &end, 10);
354-
if (*end != '$') {
355-
goto notXpg;
356-
}
357-
format = end+1;
358-
ch = format++;
359-
gotXpg = true;
360-
if (gotSequential) {
361-
goto mixedXPG;
358+
359+
char *width_end = NULL;
360+
zend_ulong width = ZEND_STRTOUL(ptr, &width_end, 10);
361+
if (UNEXPECTED(width_end == end_ptr)) {
362+
zend_argument_value_error(format_arg_num, "unterminated format specifier");
363+
goto error;
362364
}
363-
if ((value < 1) || (assignToVariables && (value > numVars))) {
364-
goto badIndex;
365-
} else if (!assignToVariables) {
366-
/*
367-
* In the case where no vars are specified, the user can
368-
* specify %9999$ legally, so we have to consider special
369-
* rules for growing the assign array. 'value' is
370-
* guaranteed to be > 0.
371-
*/
365+
ptr = width_end;
372366

373-
/* set a lower artificial limit on this
374-
* in the interest of security and resource friendliness
375-
* 255 arguments should be more than enough. - cc
376-
*/
377-
if (value > SCAN_MAX_ARGS) {
378-
goto badIndex;
367+
if (*ptr != '$' && isCurrentSpecifierBound) {
368+
gotSequential = true;
369+
} else {
370+
/* We indeed have an XPG3 style spec */
371+
ptr++;
372+
if (UNEXPECTED(ptr == end_ptr)) {
373+
zend_argument_value_error(format_arg_num, "unterminated format specifier");
374+
goto error;
375+
}
376+
if (UNEXPECTED(gotSequential)) {
377+
zend_argument_value_error(format_arg_num, "cannot mix \"%%\" and \"%%n$\" conversion specifiers");
378+
goto error;
379379
}
380380

381-
xpgSize = (xpgSize > value) ? xpgSize : value;
382-
}
383-
objIndex = value - 1;
384-
goto xpgCheckDone;
385-
}
381+
zend_ulong argument_index = width;
382+
gotXpg = true;
386383

387-
notXpg:
388-
gotSequential = true;
389-
if (gotXpg) {
390-
mixedXPG:
391-
zend_value_error("%s", "cannot mix \"%\" and \"%n$\" conversion specifiers");
392-
goto error;
393-
}
384+
if (UNEXPECTED(argument_index < 1 || (bindToVariables && argument_index > numVars))) {
385+
zend_argument_value_error(format_arg_num, "argument index %%%" ZEND_ULONG_FMT_SPEC "$ is out of range", argument_index);
386+
goto error;
387+
} else if (!bindToVariables) {
388+
/*
389+
* In the case where no vars are specified, the user can
390+
* specify %9999$ legally, so we have to consider special
391+
* rules for growing the assign array. 'value' is
392+
* guaranteed to be > 0.
393+
*/
394394

395-
xpgCheckDone:
396-
/*
397-
* Parse any width specifier.
398-
*/
399-
if (isdigit(UCHAR(*ch))) {
400-
char *end = NULL;
401-
value = ZEND_STRTOUL(format-1, &end, 10);
402-
flags |= SCAN_WIDTH;
403-
format = end;
404-
ch = format++;
395+
/* set a lower artificial limit on this
396+
* in the interest of security and resource friendliness
397+
* 255 arguments should be more than enough. - cc
398+
*/
399+
if (argument_index > SCAN_MAX_ARGS) {
400+
// TODO Specify limit?
401+
zend_argument_value_error(format_arg_num, "argument index %%%" ZEND_ULONG_FMT_SPEC "$ is out of range", argument_index);
402+
goto error;
403+
}
404+
405+
xpgSize = MAX(argument_index, xpgSize);
406+
}
407+
objIndex = argument_index - 1;
408+
409+
/* Check if we have a width argument with the XPG3 specifier */
410+
if (!isdigit(*ptr)) {
411+
goto length_modifier;
412+
}
413+
/* Grab the width to be able to continue */
414+
width = ZEND_STRTOUL(ptr, &width_end, 10);
415+
if (UNEXPECTED(width_end == end_ptr)) {
416+
zend_argument_value_error(format_arg_num, "unterminated format specifier");
417+
goto error;
418+
}
419+
ptr = width_end;
420+
}
421+
} else if (isCurrentSpecifierBound) {
422+
if (UNEXPECTED(gotXpg)) {
423+
zend_argument_value_error(format_arg_num, "cannot mix \"%%\" and \"%%n$\" conversion specifiers");
424+
goto error;
425+
}
426+
gotSequential = true;
405427
}
406428

407-
/*
408-
* Ignore size specifier.
409-
*/
410-
if ((*ch == 'l') || (*ch == 'L') || (*ch == 'h')) {
411-
ch = format++;
429+
length_modifier:
430+
/* Ignore length modifier */
431+
if (*ptr == 'l' || *ptr == 'L' || *ptr == 'h') {
432+
ptr++;
433+
if (UNEXPECTED(ptr == end_ptr)) {
434+
zend_argument_value_error(format_arg_num, "unterminated format specifier");
435+
goto error;
436+
}
412437
}
413438

414-
if (!(flags & SCAN_SUPPRESS) && assignToVariables && (objIndex >= numVars)) {
415-
goto badIndex;
439+
if (isCurrentSpecifierBound && bindToVariables && (objIndex >= numVars)) {
440+
zend_argument_value_error(format_arg_num, "Different numbers of variable names and field specifiers");
441+
goto error;
416442
}
417443

418-
/*
419-
* Handle the various field types.
420-
*/
421-
switch (*ch) {
444+
/* Handle specifiers */
445+
ZEND_ASSERT(ptr != end_ptr);
446+
switch (*ptr) {
422447
case 'n':
423448
case 'd':
424449
case 'D':
@@ -440,48 +465,44 @@ static int ValidateFormat(const zend_string *zstr_format, uint32_t format_arg_nu
440465
/* ANSI. since Zend auto allocates space for vars, this is no */
441466
/* problem - cc */
442467
/*
443-
if (flags & SCAN_WIDTH) {
444-
php_error_docref(NULL, E_WARNING, "Field width may not be specified in %c conversion");
468+
if (hasFieldWidth) {
469+
php_error_docref(NULL, E_WARNING, "Field width may not be specified in %%c conversion");
445470
goto error;
446471
}
447472
*/
448473
break;
449474

475+
/* Range specifier */
450476
case '[':
451-
if (*format == '\0') {
452-
goto badSet;
453-
}
454-
ch = format++;
455-
if (*ch == '^') {
456-
if (*format == '\0') {
457-
goto badSet;
477+
ptr++;
478+
/* Not match flag */
479+
if (*ptr == '^') {
480+
ptr++;
481+
if (UNEXPECTED(ptr == end_ptr)) {
482+
zend_argument_value_error(format_arg_num, "unterminated [ format specifier");
483+
goto error;
458484
}
459-
ch = format++;
460485
}
461-
if (*ch == ']') {
462-
if (*format == '\0') {
463-
goto badSet;
464-
}
465-
ch = format++;
486+
/* If ] is the first character of the range it means it should be *included*
487+
* in the range and not mark the end of it (as it would be empty otherwise) */
488+
if (*ptr == ']') {
489+
ptr++;
466490
}
467-
while (*ch != ']') {
468-
if (*format == '\0') {
469-
goto badSet;
491+
while (*ptr != ']') {
492+
if (UNEXPECTED(ptr == end_ptr)) {
493+
zend_argument_value_error(format_arg_num, "unterminated [ format specifier");
494+
goto error;
470495
}
471-
ch = format++;
496+
ptr++;
472497
}
473498
break;
474-
badSet:
475-
zend_value_error("Unmatched [ in format string");
476-
goto error;
477499

478-
default: {
479-
zend_value_error("Bad scan conversion character \"%c\"", *ch);
500+
default:
501+
zend_argument_value_error(format_arg_num, "unknown format specifier \"%c\"", *ptr);
480502
goto error;
481-
}
482503
}
483504

484-
if (!(flags & SCAN_SUPPRESS)) {
505+
if (isCurrentSpecifierBound) {
485506
if (objIndex >= nspace) {
486507
/*
487508
* Expand the nassign buffer. If we are using XPG specifiers,
@@ -509,12 +530,12 @@ static int ValidateFormat(const zend_string *zstr_format, uint32_t format_arg_nu
509530
nassign[objIndex]++;
510531
objIndex++;
511532
}
512-
} /* while (*format != '\0') */
533+
}
513534

514535
/*
515536
* Verify that all of the variable were assigned exactly once.
516537
*/
517-
if (!assignToVariables) {
538+
if (!bindToVariables) {
518539
if (xpgSize) {
519540
numVars = xpgSize;
520541
} else {
@@ -526,35 +547,38 @@ static int ValidateFormat(const zend_string *zstr_format, uint32_t format_arg_nu
526547

527548
for (uint32_t i = 0; i < numVars; i++) {
528549
if (nassign[i] > 1) {
529-
zend_value_error("%s", "Variable is assigned by multiple \"%n$\" conversion specifiers");
550+
if (bindToVariables) {
551+
/* +1 as arguments are 1-indexed not 0-indexed */
552+
zend_argument_value_error(i + 1 + format_arg_num, "is assigned by multiple \"%%n$\" conversion specifiers");
553+
} else {
554+
zend_argument_value_error(format_arg_num, "argument %" PRIu32 " is assigned by multiple \"%%n$\" conversion specifiers", i+1);
555+
}
530556
goto error;
531557
} else if (!xpgSize && (nassign[i] == 0)) {
532558
/*
533559
* If the space is empty, and xpgSize is 0 (means XPG wasn't
534560
* used, and/or numVars != 0), then too many vars were given
535561
*/
536-
zend_value_error("Variable is not assigned by any conversion specifiers");
562+
if (bindToVariables) {
563+
/* +1 as arguments are 1-indexed not 0-indexed */
564+
zend_argument_value_error(i + 1 + format_arg_num, "is not assigned by any conversion specifiers");
565+
} else {
566+
zend_argument_value_error(format_arg_num, "argument %" PRIu32 " is not assigned by any conversion specifiers", i+1);
567+
}
537568
goto error;
538569
}
539570
}
540571

541572
if (nassign != staticAssign) {
542-
efree((char *)nassign);
543-
}
544-
return SCAN_SUCCESS;
545-
546-
badIndex:
547-
if (gotXpg) {
548-
zend_value_error("%s", "\"%n$\" argument index out of range");
549-
} else {
550-
zend_value_error("Different numbers of variable names and field specifiers");
573+
efree(nassign);
551574
}
575+
return true;
552576

553577
error:
554578
if (nassign != staticAssign) {
555-
efree((char *)nassign);
579+
efree(nassign);
556580
}
557-
return SCAN_ERROR_INVALID_FORMAT;
581+
return false;
558582
#undef STATIC_LIST_SIZE
559583
}
560584
/* }}} */
@@ -602,8 +626,8 @@ PHPAPI int php_sscanf_internal(const char *string, const zend_string *zstr_forma
602626
* Check for errors in the format string.
603627
*/
604628
uint32_t totalVars = 0;
605-
if (ValidateFormat(zstr_format, format_arg_num, numVars, &totalVars) != SCAN_SUCCESS) {
606-
scan_set_error_return( assignToVariables, return_value );
629+
/* Throws when format is invalid */
630+
if (!isFormatStringValid(zstr_format, format_arg_num, numVars, &totalVars)) {
607631
return SCAN_ERROR_INVALID_FORMAT;
608632
}
609633

0 commit comments

Comments
 (0)