Skip to content

Commit 4c81bf7

Browse files
alexandre-daubois authored and nicolas-grekas committed
[JsonPath] Handle special whitespaces in filters
1 parent be082d5 commit 4c81bf7

File tree

6 files changed

+553
-425
lines changed

6 files changed

+553
-425
lines changed

JsonCrawler.php

Lines changed: 136 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -133,7 +133,11 @@ private function evaluateBracket(string $expr, mixed $value): array
133133
return [];
134134
}
135135

136-
if ('*' === $expr) {
136+
if (str_contains($expr, ',') && (str_starts_with($trimmed = trim($expr), ',') || str_ends_with($trimmed, ','))) {
137+
throw new JsonCrawlerException($expr, 'Expression cannot have leading or trailing commas');
138+
}
139+
140+
if ('*' === $expr = JsonPathUtils::normalizeWhitespace($expr)) {
137141
return array_values($value);
138142
}
139143

@@ -168,8 +172,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168172
return $result;
169173
}
170174

171-
// start, end and step
172-
if (preg_match('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/', $expr, $matches)) {
175+
if (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $expr, $matches)) {
173176
if (!array_is_list($value)) {
174177
return [];
175178
}
@@ -217,14 +220,12 @@ private function evaluateBracket(string $expr, mixed $value): array
217220

218221
// filter expressions
219222
if (preg_match('/^\?(.*)$/', $expr, $matches)) {
220-
$filterExpr = $matches[1];
221-
222-
if (preg_match('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/', $filterExpr)) {
223+
if (preg_match('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/', $filterExpr = trim($matches[1]))) {
223224
$filterExpr = "($filterExpr)";
224225
}
225226

226227
if (!str_starts_with($filterExpr, '(')) {
227-
throw new JsonCrawlerException($expr, 'Invalid filter expression');
228+
$filterExpr = "($filterExpr)";
228229
}
229230

230231
// remove outer filter parentheses
@@ -235,30 +236,30 @@ private function evaluateBracket(string $expr, mixed $value): array
235236

236237
// comma-separated values, e.g. `['key1', 'key2', 123]` or `[0, 1, 'key']`
237238
if (str_contains($expr, ',')) {
238-
$parts = $this->parseCommaSeparatedValues($expr);
239+
$parts = JsonPathUtils::parseCommaSeparatedValues($expr);
239240

240241
$result = [];
241-
$keysIndices = array_keys($value);
242-
$isList = array_is_list($value);
243242

244243
foreach ($parts as $part) {
245244
$part = trim($part);
246245

247-
if (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
246+
if ('*' === $part) {
247+
$result = array_merge($result, array_values($value));
248+
} elseif (preg_match('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/', $part, $matches)) {
249+
// slice notation
250+
$sliceResult = $this->evaluateBracket($part, $value);
251+
$result = array_merge($result, $sliceResult);
252+
} elseif (preg_match('/^([\'"])(.*)\1$/', $part, $matches)) {
248253
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
249254

250-
if ($isList) {
255+
if (array_is_list($value)) {
256+
// for arrays, find ALL objects that contain this key
251257
foreach ($value as $item) {
252258
if (\is_array($item) && \array_key_exists($key, $item)) {
253259
$result[] = $item;
254-
break;
255260
}
256261
}
257-
258-
continue; // no results here
259-
}
260-
261-
if (\array_key_exists($key, $value)) {
262+
} elseif (\array_key_exists($key, $value)) { // for objects, get the value for this key
262263
$result[] = $value[$key];
263264
}
264265
} elseif (preg_match('/^-?\d+$/', $part)) {
@@ -268,14 +269,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268269
$index = \count($value) + $index;
269270
}
270271

271-
if ($isList && \array_key_exists($index, $value)) {
272+
if (array_is_list($value) && \array_key_exists($index, $value)) {
272273
$result[] = $value[$index];
273-
continue;
274-
}
275-
276-
// numeric index on a hashmap
277-
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
278-
$result[] = $value[$keysIndices[$index]];
274+
} else {
275+
// numeric index on a hashmap
276+
$keysIndices = array_keys($value);
277+
if (isset($keysIndices[$index]) && isset($value[$keysIndices[$index]])) {
278+
$result[] = $value[$keysIndices[$index]];
279+
}
279280
}
280281
}
281282
}
@@ -310,7 +311,29 @@ private function evaluateFilter(string $expr, mixed $value): array
310311

311312
private function evaluateFilterExpression(string $expr, mixed $context): bool
312313
{
313-
$expr = trim($expr);
314+
$expr = JsonPathUtils::normalizeWhitespace($expr);
315+
316+
// remove outer parentheses if they wrap the entire expression
317+
if (str_starts_with($expr, '(') && str_ends_with($expr, ')')) {
318+
$depth = 0;
319+
$isWrapped = true;
320+
$i = -1;
321+
while (null !== $char = $expr[++$i] ?? null) {
322+
if ('(' === $char) {
323+
++$depth;
324+
} elseif (')' === $char && 0 === --$depth && isset($expr[$i + 1])) {
325+
$isWrapped = false;
326+
break;
327+
}
328+
}
329+
if ($isWrapped) {
330+
$expr = trim(substr($expr, 1, -1));
331+
}
332+
}
333+
334+
if (str_starts_with($expr, '!')) {
335+
return !$this->evaluateFilterExpression(trim(substr($expr, 1)), $context);
336+
}
314337

315338
if (str_contains($expr, '&&')) {
316339
$parts = array_map('trim', explode('&&', $expr));
@@ -353,8 +376,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353376
}
354377

355378
// function calls
356-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
357-
$functionName = $matches[1];
379+
if (preg_match('/^(\w++)\s*+\((.*)\)$/', $expr, $matches)) {
380+
$functionName = trim($matches[1]);
358381
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
359382
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
360383
}
@@ -369,8 +392,15 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369392

370393
private function evaluateScalar(string $expr, mixed $context): mixed
371394
{
372-
if (is_numeric($expr)) {
373-
return str_contains($expr, '.') ? (float) $expr : (int) $expr;
395+
$expr = JsonPathUtils::normalizeWhitespace($expr);
396+
397+
if (JsonPathUtils::isJsonNumber($expr)) {
398+
return str_contains($expr, '.') || str_contains(strtolower($expr), 'e') ? (float) $expr : (int) $expr;
399+
}
400+
401+
// only validate tokens that look like standalone numbers
402+
if (preg_match('/^[\d+\-.eE]+$/', $expr) && preg_match('/\d/', $expr)) {
403+
throw new JsonCrawlerException($expr, \sprintf('Invalid number format "%s"', $expr));
374404
}
375405

376406
if ('@' === $expr) {
@@ -404,9 +434,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404434
}
405435

406436
// function calls
407-
if (preg_match('/^(\w+)\((.*)\)$/', $expr, $matches)) {
408-
$functionName = $matches[1];
409-
if (!isset(self::RFC9535_FUNCTIONS[$functionName])) {
437+
if (preg_match('/^(\w++)\((.*)\)$/', $expr, $matches)) {
438+
if (!isset(self::RFC9535_FUNCTIONS[$functionName = trim($matches[1])])) {
410439
throw new JsonCrawlerException($expr, \sprintf('invalid function "%s"', $functionName));
411440
}
412441

@@ -416,31 +445,60 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416445
return null;
417446
}
418447

419-
private function evaluateFunction(string $name, string $args, array $context): mixed
448+
private function evaluateFunction(string $name, string $args, mixed $context): mixed
420449
{
421-
$args = array_map(
422-
fn ($arg) => $this->evaluateScalar(trim($arg), $context),
423-
explode(',', $args)
424-
);
450+
$argList = [];
451+
$nodelistSizes = [];
452+
if ($args = trim($args)) {
453+
$args = JsonPathUtils::parseCommaSeparatedValues($args);
454+
foreach ($args as $arg) {
455+
$arg = trim($arg);
456+
if (str_starts_with($arg, '$')) { // special handling for absolute paths
457+
$results = $this->evaluate(new JsonPath($arg));
458+
$argList[] = $results[0] ?? null;
459+
$nodelistSizes[] = \count($results);
460+
} elseif (!str_starts_with($arg, '@')) { // special handling for @ to track nodelist size
461+
$argList[] = $this->evaluateScalar($arg, $context);
462+
$nodelistSizes[] = 1;
463+
} elseif ('@' === $arg) {
464+
$argList[] = $context;
465+
$nodelistSizes[] = 1;
466+
} elseif (!\is_array($context)) {
467+
$argList[] = null;
468+
$nodelistSizes[] = 0;
469+
} elseif (str_starts_with($pathPart = substr($arg, 1), '[')) {
470+
// handle bracket expressions like @['a','d']
471+
$results = $this->evaluateBracket(substr($pathPart, 1, -1), $context);
472+
$argList[] = $results;
473+
$nodelistSizes[] = \count($results);
474+
} else {
475+
// handle dot notation like @.a
476+
$results = $this->evaluateTokensOnDecodedData(JsonPathTokenizer::tokenize(new JsonPath('$'.$pathPart)), $context);
477+
$argList[] = $results[0] ?? null;
478+
$nodelistSizes[] = \count($results);
479+
}
480+
}
481+
}
425482

426-
$value = $args[0] ?? null;
483+
$value = $argList[0] ?? null;
484+
$nodelistSize = $nodelistSizes[0] ?? 0;
427485

428486
return match ($name) {
429487
'length' => match (true) {
430488
\is_string($value) => mb_strlen($value),
431489
\is_array($value) => \count($value),
432490
default => 0,
433491
},
434-
'count' => \is_array($value) ? \count($value) : 0,
492+
'count' => $nodelistSize,
435493
'match' => match (true) {
436-
\is_string($value) && \is_string($args[1] ?? null) => (bool) @preg_match(\sprintf('/^%s$/', $args[1]), $value),
494+
\is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match(\sprintf('/^%s$/u', $this->transformJsonPathRegex($argList[1])), $value),
437495
default => false,
438496
},
439497
'search' => match (true) {
440-
\is_string($value) && \is_string($args[1] ?? null) => (bool) @preg_match("/$args[1]/", $value),
498+
\is_string($value) && \is_string($argList[1] ?? null) => (bool) @preg_match("/{$this->transformJsonPathRegex($argList[1])}/u", $value),
441499
default => false,
442500
},
443-
'value' => $value,
501+
'value' => 1 < $nodelistSize ? null : (1 === $nodelistSize ? (\is_array($value) ? ($value[0] ?? null) : $value) : $value),
444502
default => null,
445503
};
446504
}
@@ -474,43 +532,51 @@ private function compare(mixed $left, mixed $right, string $operator): bool
474532
};
475533
}
476534

477-
private function parseCommaSeparatedValues(string $expr): array
535+
/**
536+
* Transforms JSONPath regex patterns to comply with RFC 9535.
537+
*
538+
* The main issue is that '.' should not match \r or \n but should
539+
* match Unicode line separators U+2028 and U+2029.
540+
*/
541+
private function transformJsonPathRegex(string $pattern): string
478542
{
479-
$parts = [];
480-
$current = '';
481-
$inQuotes = false;
482-
$quoteChar = null;
483-
484-
for ($i = 0; $i < \strlen($expr); ++$i) {
485-
$char = $expr[$i];
486-
487-
if ('\\' === $char && $i + 1 < \strlen($expr)) {
488-
$current .= $char.$expr[++$i];
543+
$result = '';
544+
$inCharClass = false;
545+
$escaped = false;
546+
$i = -1;
547+
548+
while (null !== $char = $pattern[++$i] ?? null) {
549+
if ($escaped) {
550+
$result .= $char;
551+
$escaped = false;
489552
continue;
490553
}
491554

492-
if ('"' === $char || "'" === $char) {
493-
if (!$inQuotes) {
494-
$inQuotes = true;
495-
$quoteChar = $char;
496-
} elseif ($char === $quoteChar) {
497-
$inQuotes = false;
498-
$quoteChar = null;
499-
}
500-
} elseif (!$inQuotes && ',' === $char) {
501-
$parts[] = trim($current);
502-
$current = '';
555+
if ('\\' === $char) {
556+
$result .= $char;
557+
$escaped = true;
558+
continue;
559+
}
503560

561+
if ('[' === $char && !$inCharClass) {
562+
$inCharClass = true;
563+
$result .= $char;
504564
continue;
505565
}
506566

507-
$current .= $char;
508-
}
567+
if (']' === $char && $inCharClass) {
568+
$inCharClass = false;
569+
$result .= $char;
570+
continue;
571+
}
509572

510-
if ('' !== $current) {
511-
$parts[] = trim($current);
573+
if ('.' === $char && !$inCharClass) {
574+
$result .= '(?:[^\r\n]|\x{2028}|\x{2029})';
575+
} else {
576+
$result .= $char;
577+
}
512578
}
513579

514-
return $parts;
580+
return $result;
515581
}
516582
}

0 commit comments

Comments
 (0)