@@ -133,7 +133,11 @@ private function evaluateBracket(string $expr, mixed $value): array
133
133
return [];
134
134
}
135
135
136
- if ('* ' === $ expr ) {
136
+ if (str_contains ($ expr , ', ' ) && (str_starts_with ($ trimmed = trim ($ expr ), ', ' ) || str_ends_with ($ trimmed , ', ' ))) {
137
+ throw new JsonCrawlerException ($ expr , 'Expression cannot have leading or trailing commas ' );
138
+ }
139
+
140
+ if ('* ' === $ expr = JsonPathUtils::normalizeWhitespace ($ expr )) {
137
141
return array_values ($ value );
138
142
}
139
143
@@ -168,8 +172,7 @@ private function evaluateBracket(string $expr, mixed $value): array
168
172
return $ result ;
169
173
}
170
174
171
- // start, end and step
172
- if (preg_match ('/^(-?\d*):(-?\d*)(?::(-?\d+))?$/ ' , $ expr , $ matches )) {
175
+ if (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ expr , $ matches )) {
173
176
if (!array_is_list ($ value )) {
174
177
return [];
175
178
}
@@ -217,14 +220,12 @@ private function evaluateBracket(string $expr, mixed $value): array
217
220
218
221
// filter expressions
219
222
if (preg_match ('/^\?(.*)$/ ' , $ expr , $ matches )) {
220
- $ filterExpr = $ matches [1 ];
221
-
222
- if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr )) {
223
+ if (preg_match ('/^(\w+)\s*\([^()]*\)\s*([<>=!]+.*)?$/ ' , $ filterExpr = trim ($ matches [1 ]))) {
223
224
$ filterExpr = "( $ filterExpr) " ;
224
225
}
225
226
226
227
if (!str_starts_with ($ filterExpr , '( ' )) {
227
- throw new JsonCrawlerException ( $ expr , ' Invalid filter expression ' ) ;
228
+ $ filterExpr = " ( $ filterExpr ) " ;
228
229
}
229
230
230
231
// remove outer filter parentheses
@@ -235,30 +236,30 @@ private function evaluateBracket(string $expr, mixed $value): array
235
236
236
237
// comma-separated values, e.g. `['key1', 'key2', 123]` or `[0, 1, 'key']`
237
238
if (str_contains ($ expr , ', ' )) {
238
- $ parts = $ this -> parseCommaSeparatedValues ($ expr );
239
+ $ parts = JsonPathUtils:: parseCommaSeparatedValues ($ expr );
239
240
240
241
$ result = [];
241
- $ keysIndices = array_keys ($ value );
242
- $ isList = array_is_list ($ value );
243
242
244
243
foreach ($ parts as $ part ) {
245
244
$ part = trim ($ part );
246
245
247
- if (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
246
+ if ('* ' === $ part ) {
247
+ $ result = array_merge ($ result , array_values ($ value ));
248
+ } elseif (preg_match ('/^(-?\d*+)\s*+:\s*+(-?\d*+)(?:\s*+:\s*+(-?\d++))?$/ ' , $ part , $ matches )) {
249
+ // slice notation
250
+ $ sliceResult = $ this ->evaluateBracket ($ part , $ value );
251
+ $ result = array_merge ($ result , $ sliceResult );
252
+ } elseif (preg_match ('/^([ \'"])(.*)\1$/ ' , $ part , $ matches )) {
248
253
$ key = JsonPathUtils::unescapeString ($ matches [2 ], $ matches [1 ]);
249
254
250
- if ($ isList ) {
255
+ if (array_is_list ($ value )) {
256
+ // for arrays, find ALL objects that contain this key
251
257
foreach ($ value as $ item ) {
252
258
if (\is_array ($ item ) && \array_key_exists ($ key , $ item )) {
253
259
$ result [] = $ item ;
254
- break ;
255
260
}
256
261
}
257
-
258
- continue ; // no results here
259
- }
260
-
261
- if (\array_key_exists ($ key , $ value )) {
262
+ } elseif (\array_key_exists ($ key , $ value )) { // for objects, get the value for this key
262
263
$ result [] = $ value [$ key ];
263
264
}
264
265
} elseif (preg_match ('/^-?\d+$/ ' , $ part )) {
@@ -268,14 +269,14 @@ private function evaluateBracket(string $expr, mixed $value): array
268
269
$ index = \count ($ value ) + $ index ;
269
270
}
270
271
271
- if ($ isList && \array_key_exists ($ index , $ value )) {
272
+ if (array_is_list ( $ value ) && \array_key_exists ($ index , $ value )) {
272
273
$ result [] = $ value [$ index ];
273
- continue ;
274
- }
275
-
276
- // numeric index on a hashmap
277
- if ( isset ( $ keysIndices [ $ index ]) && isset ( $ value [$ keysIndices [$ index ]])) {
278
- $ result [] = $ value [ $ keysIndices [ $ index ]];
274
+ } else {
275
+ // numeric index on a hashmap
276
+ $ keysIndices = array_keys ( $ value );
277
+ if ( isset ( $ keysIndices [ $ index]) && isset ( $ value [ $ keysIndices [ $ index ]])) {
278
+ $ result [] = $ value [$ keysIndices [$ index ]];
279
+ }
279
280
}
280
281
}
281
282
}
@@ -310,7 +311,29 @@ private function evaluateFilter(string $expr, mixed $value): array
310
311
311
312
private function evaluateFilterExpression (string $ expr , mixed $ context ): bool
312
313
{
313
- $ expr = trim ($ expr );
314
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
315
+
316
+ // remove outer parentheses if they wrap the entire expression
317
+ if (str_starts_with ($ expr , '( ' ) && str_ends_with ($ expr , ') ' )) {
318
+ $ depth = 0 ;
319
+ $ isWrapped = true ;
320
+ $ i = -1 ;
321
+ while (null !== $ char = $ expr [++$ i ] ?? null ) {
322
+ if ('( ' === $ char ) {
323
+ ++$ depth ;
324
+ } elseif (') ' === $ char && 0 === --$ depth && isset ($ expr [$ i + 1 ])) {
325
+ $ isWrapped = false ;
326
+ break ;
327
+ }
328
+ }
329
+ if ($ isWrapped ) {
330
+ $ expr = trim (substr ($ expr , 1 , -1 ));
331
+ }
332
+ }
333
+
334
+ if (str_starts_with ($ expr , '! ' )) {
335
+ return !$ this ->evaluateFilterExpression (trim (substr ($ expr , 1 )), $ context );
336
+ }
314
337
315
338
if (str_contains ($ expr , '&& ' )) {
316
339
$ parts = array_map ('trim ' , explode ('&& ' , $ expr ));
@@ -353,8 +376,8 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
353
376
}
354
377
355
378
// function calls
356
- if (preg_match ('/^(\w+) \((.*)\)$/ ' , $ expr , $ matches )) {
357
- $ functionName = $ matches [1 ];
379
+ if (preg_match ('/^(\w++)\s*+ \((.*)\)$/ ' , $ expr , $ matches )) {
380
+ $ functionName = trim ( $ matches [1 ]) ;
358
381
if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
359
382
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
360
383
}
@@ -369,8 +392,15 @@ private function evaluateFilterExpression(string $expr, mixed $context): bool
369
392
370
393
private function evaluateScalar (string $ expr , mixed $ context ): mixed
371
394
{
372
- if (is_numeric ($ expr )) {
373
- return str_contains ($ expr , '. ' ) ? (float ) $ expr : (int ) $ expr ;
395
+ $ expr = JsonPathUtils::normalizeWhitespace ($ expr );
396
+
397
+ if (JsonPathUtils::isJsonNumber ($ expr )) {
398
+ return str_contains ($ expr , '. ' ) || str_contains (strtolower ($ expr ), 'e ' ) ? (float ) $ expr : (int ) $ expr ;
399
+ }
400
+
401
+ // only validate tokens that look like standalone numbers
402
+ if (preg_match ('/^[\d+\-.eE]+$/ ' , $ expr ) && preg_match ('/\d/ ' , $ expr )) {
403
+ throw new JsonCrawlerException ($ expr , \sprintf ('Invalid number format "%s" ' , $ expr ));
374
404
}
375
405
376
406
if ('@ ' === $ expr ) {
@@ -404,9 +434,8 @@ private function evaluateScalar(string $expr, mixed $context): mixed
404
434
}
405
435
406
436
// function calls
407
- if (preg_match ('/^(\w+)\((.*)\)$/ ' , $ expr , $ matches )) {
408
- $ functionName = $ matches [1 ];
409
- if (!isset (self ::RFC9535_FUNCTIONS [$ functionName ])) {
437
+ if (preg_match ('/^(\w++)\((.*)\)$/ ' , $ expr , $ matches )) {
438
+ if (!isset (self ::RFC9535_FUNCTIONS [$ functionName = trim ($ matches [1 ])])) {
410
439
throw new JsonCrawlerException ($ expr , \sprintf ('invalid function "%s" ' , $ functionName ));
411
440
}
412
441
@@ -416,31 +445,60 @@ private function evaluateScalar(string $expr, mixed $context): mixed
416
445
return null ;
417
446
}
418
447
419
- private function evaluateFunction (string $ name , string $ args , array $ context ): mixed
448
+ private function evaluateFunction (string $ name , string $ args , mixed $ context ): mixed
420
449
{
421
- $ args = array_map (
422
- fn ($ arg ) => $ this ->evaluateScalar (trim ($ arg ), $ context ),
423
- explode (', ' , $ args )
424
- );
450
+ $ argList = [];
451
+ $ nodelistSizes = [];
452
+ if ($ args = trim ($ args )) {
453
+ $ args = JsonPathUtils::parseCommaSeparatedValues ($ args );
454
+ foreach ($ args as $ arg ) {
455
+ $ arg = trim ($ arg );
456
+ if (str_starts_with ($ arg , '$ ' )) { // special handling for absolute paths
457
+ $ results = $ this ->evaluate (new JsonPath ($ arg ));
458
+ $ argList [] = $ results [0 ] ?? null ;
459
+ $ nodelistSizes [] = \count ($ results );
460
+ } elseif (!str_starts_with ($ arg , '@ ' )) { // special handling for @ to track nodelist size
461
+ $ argList [] = $ this ->evaluateScalar ($ arg , $ context );
462
+ $ nodelistSizes [] = 1 ;
463
+ } elseif ('@ ' === $ arg ) {
464
+ $ argList [] = $ context ;
465
+ $ nodelistSizes [] = 1 ;
466
+ } elseif (!\is_array ($ context )) {
467
+ $ argList [] = null ;
468
+ $ nodelistSizes [] = 0 ;
469
+ } elseif (str_starts_with ($ pathPart = substr ($ arg , 1 ), '[ ' )) {
470
+ // handle bracket expressions like @['a','d']
471
+ $ results = $ this ->evaluateBracket (substr ($ pathPart , 1 , -1 ), $ context );
472
+ $ argList [] = $ results ;
473
+ $ nodelistSizes [] = \count ($ results );
474
+ } else {
475
+ // handle dot notation like @.a
476
+ $ results = $ this ->evaluateTokensOnDecodedData (JsonPathTokenizer::tokenize (new JsonPath ('$ ' .$ pathPart )), $ context );
477
+ $ argList [] = $ results [0 ] ?? null ;
478
+ $ nodelistSizes [] = \count ($ results );
479
+ }
480
+ }
481
+ }
425
482
426
- $ value = $ args [0 ] ?? null ;
483
+ $ value = $ argList [0 ] ?? null ;
484
+ $ nodelistSize = $ nodelistSizes [0 ] ?? 0 ;
427
485
428
486
return match ($ name ) {
429
487
'length ' => match (true ) {
430
488
\is_string ($ value ) => mb_strlen ($ value ),
431
489
\is_array ($ value ) => \count ($ value ),
432
490
default => 0 ,
433
491
},
434
- 'count ' => \is_array ( $ value ) ? \count ( $ value ) : 0 ,
492
+ 'count ' => $ nodelistSize ,
435
493
'match ' => match (true ) {
436
- \is_string ($ value ) && \is_string ($ args [1 ] ?? null ) => (bool ) @preg_match (\sprintf ('/^%s$/ ' , $ args [1 ]), $ value ),
494
+ \is_string ($ value ) && \is_string ($ argList [1 ] ?? null ) => (bool ) @preg_match (\sprintf ('/^%s$/u ' , $ this -> transformJsonPathRegex ( $ argList [1 ]) ), $ value ),
437
495
default => false ,
438
496
},
439
497
'search ' => match (true ) {
440
- \is_string ($ value ) && \is_string ($ args [1 ] ?? null ) => (bool ) @preg_match ("/ $ args [1 ]/ " , $ value ),
498
+ \is_string ($ value ) && \is_string ($ argList [1 ] ?? null ) => (bool ) @preg_match ("/ { $ this -> transformJsonPathRegex ( $ argList [1 ])} /u " , $ value ),
441
499
default => false ,
442
500
},
443
- 'value ' => $ value ,
501
+ 'value ' => 1 < $ nodelistSize ? null : ( 1 === $ nodelistSize ? ( \is_array ( $ value) ? ( $ value [ 0 ] ?? null ) : $ value ) : $ value ) ,
444
502
default => null ,
445
503
};
446
504
}
@@ -474,43 +532,51 @@ private function compare(mixed $left, mixed $right, string $operator): bool
474
532
};
475
533
}
476
534
477
- private function parseCommaSeparatedValues (string $ expr ): array
535
+ /**
536
+ * Transforms JSONPath regex patterns to comply with RFC 9535.
537
+ *
538
+ * The main issue is that '.' should not match \r or \n but should
539
+ * match Unicode line separators U+2028 and U+2029.
540
+ */
541
+ private function transformJsonPathRegex (string $ pattern ): string
478
542
{
479
- $ parts = [];
480
- $ current = '' ;
481
- $ inQuotes = false ;
482
- $ quoteChar = null ;
483
-
484
- for ($ i = 0 ; $ i < \strlen ($ expr ); ++$ i ) {
485
- $ char = $ expr [$ i ];
486
-
487
- if ('\\' === $ char && $ i + 1 < \strlen ($ expr )) {
488
- $ current .= $ char .$ expr [++$ i ];
543
+ $ result = '' ;
544
+ $ inCharClass = false ;
545
+ $ escaped = false ;
546
+ $ i = -1 ;
547
+
548
+ while (null !== $ char = $ pattern [++$ i ] ?? null ) {
549
+ if ($ escaped ) {
550
+ $ result .= $ char ;
551
+ $ escaped = false ;
489
552
continue ;
490
553
}
491
554
492
- if ('" ' === $ char || "' " === $ char ) {
493
- if (!$ inQuotes ) {
494
- $ inQuotes = true ;
495
- $ quoteChar = $ char ;
496
- } elseif ($ char === $ quoteChar ) {
497
- $ inQuotes = false ;
498
- $ quoteChar = null ;
499
- }
500
- } elseif (!$ inQuotes && ', ' === $ char ) {
501
- $ parts [] = trim ($ current );
502
- $ current = '' ;
555
+ if ('\\' === $ char ) {
556
+ $ result .= $ char ;
557
+ $ escaped = true ;
558
+ continue ;
559
+ }
503
560
561
+ if ('[ ' === $ char && !$ inCharClass ) {
562
+ $ inCharClass = true ;
563
+ $ result .= $ char ;
504
564
continue ;
505
565
}
506
566
507
- $ current .= $ char ;
508
- }
567
+ if ('] ' === $ char && $ inCharClass ) {
568
+ $ inCharClass = false ;
569
+ $ result .= $ char ;
570
+ continue ;
571
+ }
509
572
510
- if ('' !== $ current ) {
511
- $ parts [] = trim ($ current );
573
+ if ('. ' === $ char && !$ inCharClass ) {
574
+ $ result .= '(?:[^\r\n]|\x{2028}|\x{2029}) ' ;
575
+ } else {
576
+ $ result .= $ char ;
577
+ }
512
578
}
513
579
514
- return $ parts ;
580
+ return $ result ;
515
581
}
516
582
}
0 commit comments