@@ -1519,6 +1519,125 @@ TEST_F(PredicateTest, TestLikeLongPatternHeapAlloc) {
15191519 ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({non_matching_field})).value ());
15201520}
15211521
1522+ TEST_F (PredicateTest, TestLikeInvalidEscapeSequence) {
1523+ auto arrow_schema = arrow::schema (arrow::FieldVector ({arrow::field (" f0" , arrow::utf8 ())}));
1524+
1525+ // Trailing backslash is invalid (Java throws "Invalid escape sequence")
1526+ ASSERT_OK_AND_ASSIGN (auto predicate_base,
1527+ PredicateBuilder::Like (
1528+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1529+ Literal (FieldType::STRING, " abc\\ " , 4 )));
1530+ auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1531+ ASSERT_NOK_WITH_MSG (predicate->Test (arrow_schema, CreateStringRow ({" abc" })),
1532+ " Invalid escape sequence" );
1533+
1534+ // Backslash followed by non-special char is invalid (only \_, \%, \\ are legal)
1535+ ASSERT_OK_AND_ASSIGN (predicate_base,
1536+ PredicateBuilder::Like (
1537+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1538+ Literal (FieldType::STRING, " a\\ bc" , 4 )));
1539+ predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1540+ ASSERT_NOK_WITH_MSG (predicate->Test (arrow_schema, CreateStringRow ({" abc" })),
1541+ " Invalid escape sequence" );
1542+
1543+ // \n is not a valid escape
1544+ ASSERT_OK_AND_ASSIGN (predicate_base,
1545+ PredicateBuilder::Like (
1546+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1547+ Literal (FieldType::STRING, " a\\ nf" , 4 )));
1548+ predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1549+ ASSERT_NOK_WITH_MSG (predicate->Test (arrow_schema, CreateStringRow ({" anf" })),
1550+ " Invalid escape sequence" );
1551+ }
1552+
1553+ TEST_F (PredicateTest, TestLikeEscapeBackslash) {
1554+ auto arrow_schema = arrow::schema (arrow::FieldVector ({arrow::field (" f0" , arrow::utf8 ())}));
1555+
1556+ // \\\\ in C++ string literal = "\\" in the pattern = escaped backslash
1557+ ASSERT_OK_AND_ASSIGN (auto predicate_base,
1558+ PredicateBuilder::Like (
1559+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1560+ Literal (FieldType::STRING, " a\\\\ b" , 4 )));
1561+ auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1562+ // Field "a\b" should match pattern "a\\b" (escaped backslash)
1563+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" a\\ b" })).value ());
1564+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" axb" })).value ());
1565+
1566+ // Escaped percent: "a\%b" matches literal "a%b"
1567+ ASSERT_OK_AND_ASSIGN (predicate_base,
1568+ PredicateBuilder::Like (
1569+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1570+ Literal (FieldType::STRING, " a\\ %b" , 4 )));
1571+ predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1572+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" a%b" })).value ());
1573+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" axb" })).value ());
1574+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" axxb" })).value ());
1575+ }
1576+
1577+ TEST_F (PredicateTest, TestLikeUtf8MultibyteUnderscore) {
1578+ auto arrow_schema = arrow::schema (arrow::FieldVector ({arrow::field (" f0" , arrow::utf8 ())}));
1579+
1580+ // Single '_' should match one Unicode character, not one byte.
1581+ ASSERT_OK_AND_ASSIGN (auto predicate_base,
1582+ PredicateBuilder::Like (
1583+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1584+ Literal (FieldType::STRING, " _" , 1 )));
1585+ auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1586+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" 中" })).value ());
1587+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" 中文" })).value ());
1588+
1589+ // "a_c" where _ matches one Chinese character
1590+ ASSERT_OK_AND_ASSIGN (predicate_base,
1591+ PredicateBuilder::Like (
1592+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1593+ Literal (FieldType::STRING, " a_c" , 3 )));
1594+ predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1595+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" a中c" })).value ());
1596+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" a中文c" })).value ());
1597+
1598+ // "___" should match exactly 3 Unicode characters
1599+ ASSERT_OK_AND_ASSIGN (predicate_base,
1600+ PredicateBuilder::Like (
1601+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1602+ Literal (FieldType::STRING, " ___" , 3 )));
1603+ predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1604+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" 中文字" })).value ());
1605+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" 中文" })).value ());
1606+
1607+ // '%' should still work with multi-byte characters
1608+ std::string pattern_contains = std::string (" %" ) + " 中" + " %" ;
1609+ ASSERT_OK_AND_ASSIGN (
1610+ predicate_base,
1611+ PredicateBuilder::Like (
1612+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1613+ Literal (FieldType::STRING, pattern_contains.data (), pattern_contains.size ())));
1614+ predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1615+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" hello中world" })).value ());
1616+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" helloworld" })).value ());
1617+ }
1618+
1619+ TEST_F (PredicateTest, TestLikeJavaRegexLineTerminatorSemantics) {
1620+ auto arrow_schema = arrow::schema (arrow::FieldVector ({arrow::field (" f0" , arrow::utf8 ())}));
1621+
1622+ // Java regex '.' does not match line terminators, so '_' should not match them either.
1623+ ASSERT_OK_AND_ASSIGN (auto predicate_base,
1624+ PredicateBuilder::Like (
1625+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1626+ Literal (FieldType::STRING, " _" , 1 )));
1627+ auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1628+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" \n " })).value ());
1629+ ASSERT_FALSE (predicate->Test (arrow_schema, CreateStringRow ({" \r " })).value ());
1630+
1631+ // Java LIKE '%' uses (?s:.*), so it should still match line terminators.
1632+ ASSERT_OK_AND_ASSIGN (predicate_base,
1633+ PredicateBuilder::Like (
1634+ /* field_index=*/ 0 , /* field_name=*/ " f0" , FieldType::STRING,
1635+ Literal (FieldType::STRING, " %" , 1 )));
1636+ predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1637+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" \n " })).value ());
1638+ ASSERT_TRUE (predicate->Test (arrow_schema, CreateStringRow ({" \r " })).value ());
1639+ }
1640+
15221641TEST_F (PredicateTest, TestCompound) {
15231642 ASSERT_OK_AND_ASSIGN (
15241643 const auto startswith_predicate,
0 commit comments