Skip to content

Commit ed933a4

Browse files
committed
Merge branch 'main' into zhf-small-fix
2 parents e20b030 + 4b2c0c0 commit ed933a4

8 files changed

Lines changed: 394 additions & 42 deletions

File tree

LICENSE

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,54 @@ License: https://www.apache.org/licenses/LICENSE-2.0
506506

507507
--------------------------------------------------------------------------------
508508

509+
This product includes code derived from PyTorch TH simd.h.
510+
511+
* SIMD detection code in third_party/roaring_bitmap/roaring.cpp
512+
513+
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
514+
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
515+
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
516+
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
517+
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
518+
Copyright (c) 2011-2013 NYU (Clement Farabet)
519+
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
520+
Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
521+
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
522+
Samy Bengio, Johnny Mariethoz)
523+
524+
All rights reserved.
525+
526+
License: BSD-3-Clause
527+
528+
Redistribution and use in source and binary forms, with or without
529+
modification, are permitted provided that the following conditions are met:
530+
531+
1. Redistributions of source code must retain the above copyright
532+
notice, this list of conditions and the following disclaimer.
533+
534+
2. Redistributions in binary form must reproduce the above copyright
535+
notice, this list of conditions and the following disclaimer in the
536+
documentation and/or other materials provided with the distribution.
537+
538+
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
539+
America and IDIAP Research Institute nor the names of its contributors may be
540+
used to endorse or promote products derived from this software without
541+
specific prior written permission.
542+
543+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
544+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
545+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
546+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
547+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
548+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
549+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
550+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
551+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
552+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
553+
POSSIBILITY OF SUCH DAMAGE.
554+
555+
--------------------------------------------------------------------------------
556+
509557
This product includes code from cppjieba.
510558

511559
* cppjieba utility in src/paimon/global_index/lucene/ directory

src/paimon/common/predicate/like.cpp

Lines changed: 99 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,79 +16,144 @@
1616

1717
#include "paimon/common/predicate/like.h"
1818

19+
#include <string>
20+
#include <vector>
21+
22+
#include "fmt/format.h"
1923
namespace paimon {
2024

25+
namespace {
26+
27+
/// Returns the byte length of a UTF-8 leading byte's code point.
28+
/// Returns 1 for ASCII, 2-4 for multi-byte sequences, 1 for invalid bytes.
29+
inline size_t Utf8CodePointLength(unsigned char leading_byte) {
30+
if (leading_byte < 0x80) {
31+
return 1;
32+
}
33+
if ((leading_byte & 0xE0) == 0xC0) {
34+
return 2;
35+
}
36+
if ((leading_byte & 0xF0) == 0xE0) {
37+
return 3;
38+
}
39+
if ((leading_byte & 0xF8) == 0xF0) {
40+
return 4;
41+
}
42+
return 1; // stray continuation byte (0x80-0xBF) or invalid lead byte (0xF8-0xFF): treat as a single byte
43+
}
44+
45+
inline bool IsJavaRegexLineTerminator(const std::string& code_point) {
46+
return code_point == "\n" || code_point == "\r" || code_point == "\xC2\x85" ||
47+
code_point == "\xE2\x80\xA8" || code_point == "\xE2\x80\xA9";
48+
}
49+
50+
} // namespace
51+
2152
Result<bool> Like::TestString(const std::string& field, const std::string& pattern) const {
2253
if (pattern.empty()) {
2354
return field.empty();
2455
}
25-
std::vector<char> pat;
56+
57+
// Phase 1: Parse pattern with escape handling (Java-compatible).
58+
// Only \_, \%, \\ are valid escape sequences.
59+
std::vector<std::string> pat_chars; // each element is one literal code point (or escaped character) or one wildcard
2660
std::vector<bool> is_wild;
27-
for (size_t i = 0; i < pattern.size(); ++i) {
28-
if (pattern[i] == '\\' && i + 1 < pattern.size()) {
29-
pat.push_back(pattern[i + 1]);
61+
62+
for (size_t i = 0; i < pattern.size();) {
63+
if (pattern[i] == '\\') {
64+
if (i + 1 >= pattern.size()) {
65+
return Status::Invalid(fmt::format("Invalid escape sequence '{}', index={}",
66+
pattern, std::to_string(i)));
67+
}
68+
char next_char = pattern[i + 1];
69+
if (next_char != '_' && next_char != '%' && next_char != '\\') {
70+
return Status::Invalid(fmt::format("Invalid escape sequence '{}', index={}",
71+
pattern, std::to_string(i)));
72+
}
73+
pat_chars.emplace_back(1, next_char);
3074
is_wild.push_back(false);
75+
i += 2;
76+
} else if (pattern[i] == '_' || pattern[i] == '%') {
77+
pat_chars.emplace_back(1, pattern[i]);
78+
is_wild.push_back(true);
3179
++i;
3280
} else {
33-
char c = pattern[i];
34-
pat.push_back(c);
35-
is_wild.push_back(c == '_' || c == '%');
81+
// Read one UTF-8 code point from pattern as a literal element.
82+
size_t cp_len = Utf8CodePointLength(static_cast<unsigned char>(pattern[i]));
83+
if (i + cp_len > pattern.size()) {
84+
cp_len = 1;
85+
}
86+
pat_chars.push_back(pattern.substr(i, cp_len));
87+
is_wild.push_back(false);
88+
i += cp_len;
3689
}
3790
}
38-
std::vector<char> simp_pat;
91+
92+
// Phase 2: Merge consecutive '%' wildcards.
93+
std::vector<std::string> simp_pat;
3994
std::vector<bool> simp_wild;
40-
for (size_t i = 0; i < pat.size(); ++i) {
41-
if (is_wild[i] && pat[i] == '%' && !simp_pat.empty() && simp_wild.back() &&
42-
simp_pat.back() == '%') {
95+
for (size_t i = 0; i < pat_chars.size(); ++i) {
96+
if (is_wild[i] && pat_chars[i] == "%" && !simp_pat.empty() && simp_wild.back() &&
97+
simp_pat.back() == "%") {
4398
continue;
4499
}
45-
simp_pat.push_back(pat[i]);
100+
simp_pat.push_back(pat_chars[i]);
46101
simp_wild.push_back(is_wild[i]);
47102
}
48-
const size_t m = field.size();
103+
104+
// Phase 3: Decompose field into UTF-8 code points for character-level matching.
105+
std::vector<std::string> field_chars;
106+
for (size_t i = 0; i < field.size();) {
107+
size_t cp_len = Utf8CodePointLength(static_cast<unsigned char>(field[i]));
108+
if (i + cp_len > field.size()) {
109+
cp_len = 1; // truncated sequence, treat byte as single char
110+
}
111+
field_chars.push_back(field.substr(i, cp_len));
112+
i += cp_len;
113+
}
114+
115+
const size_t m = field_chars.size();
49116
const size_t n = simp_pat.size();
50-
if (field.empty()) {
51-
return n == 1 && simp_wild[0] && simp_pat[0] == '%';
117+
118+
if (m == 0) {
119+
return n == 1 && simp_wild[0] && simp_pat[0] == "%";
52120
}
121+
122+
// Quick reject: count the minimum number of characters the pattern requires (each literal element and each '_' consumes exactly one; only '%' can match zero).
53123
size_t min_len = 0;
54124
for (size_t i = 0; i < n; ++i) {
55125
if (!simp_wild[i]) {
56126
min_len++;
127+
} else if (simp_pat[i] == "_") {
128+
min_len++;
57129
}
58130
}
59131
if (min_len > m) {
60132
return false;
61133
}
62-
constexpr size_t STACK_LIMIT = 128;
63-
std::unique_ptr<bool[]> dp_storage;
64-
bool* dp;
65-
if (n <= STACK_LIMIT) {
66-
dp = static_cast<bool*>(alloca((n + 1) * sizeof(bool)));
67-
} else {
68-
dp_storage = std::make_unique<bool[]>(n + 1);
69-
dp = dp_storage.get();
70-
}
71-
std::fill_n(dp, n + 1, false);
134+
135+
// Phase 4: DP matching at character (code point) level.
136+
std::vector<bool> dp(n + 1, false);
72137
dp[0] = true;
73-
for (size_t j = 1; j <= n && simp_wild[j - 1] && simp_pat[j - 1] == '%'; ++j) {
138+
for (size_t j = 1; j <= n && simp_wild[j - 1] && simp_pat[j - 1] == "%"; ++j) {
74139
dp[j] = true;
75140
}
76-
const char* f = field.data();
141+
77142
for (size_t i = 0; i < m; ++i) {
78-
const char sc = f[i];
143+
const std::string& field_char = field_chars[i];
79144
bool prev = dp[0];
80145
dp[0] = false;
81146
bool has_match = false;
82147
for (size_t j = 1; j <= n; ++j) {
83148
const bool temp = dp[j];
84-
const char pc = simp_pat[j - 1];
149+
const std::string& pc = simp_pat[j - 1];
85150
const bool wild = simp_wild[j - 1];
86-
if (wild && pc == '%') {
151+
if (wild && pc == "%") {
87152
dp[j] = dp[j - 1] || dp[j];
88-
} else if (wild && pc == '_') {
89-
dp[j] = prev;
153+
} else if (wild && pc == "_") {
154+
dp[j] = prev && !IsJavaRegexLineTerminator(field_char);
90155
} else {
91-
dp[j] = (pc == sc) ? prev : false;
156+
dp[j] = (pc == field_char) ? prev : false;
92157
}
93158
has_match |= dp[j];
94159
prev = temp;
@@ -97,6 +162,6 @@ Result<bool> Like::TestString(const std::string& field, const std::string& patte
97162
return false;
98163
}
99164
}
100-
return dp[n];
165+
return static_cast<bool>(dp[n]);
101166
}
102167
} // namespace paimon

src/paimon/common/predicate/predicate_test.cpp

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1519,6 +1519,125 @@ TEST_F(PredicateTest, TestLikeLongPatternHeapAlloc) {
15191519
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({non_matching_field})).value());
15201520
}
15211521

1522+
TEST_F(PredicateTest, TestLikeInvalidEscapeSequence) {
1523+
auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", arrow::utf8())}));
1524+
1525+
// Trailing backslash is invalid (Java throws "Invalid escape sequence")
1526+
ASSERT_OK_AND_ASSIGN(auto predicate_base,
1527+
PredicateBuilder::Like(
1528+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1529+
Literal(FieldType::STRING, "abc\\", 4)));
1530+
auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1531+
ASSERT_NOK_WITH_MSG(predicate->Test(arrow_schema, CreateStringRow({"abc"})),
1532+
"Invalid escape sequence");
1533+
1534+
// Backslash followed by non-special char is invalid (only \_, \%, \\ are legal)
1535+
ASSERT_OK_AND_ASSIGN(predicate_base,
1536+
PredicateBuilder::Like(
1537+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1538+
Literal(FieldType::STRING, "a\\bc", 4)));
1539+
predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1540+
ASSERT_NOK_WITH_MSG(predicate->Test(arrow_schema, CreateStringRow({"abc"})),
1541+
"Invalid escape sequence");
1542+
1543+
// \n is not a valid escape
1544+
ASSERT_OK_AND_ASSIGN(predicate_base,
1545+
PredicateBuilder::Like(
1546+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1547+
Literal(FieldType::STRING, "a\\nf", 4)));
1548+
predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1549+
ASSERT_NOK_WITH_MSG(predicate->Test(arrow_schema, CreateStringRow({"anf"})),
1550+
"Invalid escape sequence");
1551+
}
1552+
1553+
TEST_F(PredicateTest, TestLikeEscapeBackslash) {
1554+
auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", arrow::utf8())}));
1555+
1556+
// \\\\ in C++ string literal = "\\" in the pattern = escaped backslash
1557+
ASSERT_OK_AND_ASSIGN(auto predicate_base,
1558+
PredicateBuilder::Like(
1559+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1560+
Literal(FieldType::STRING, "a\\\\b", 4)));
1561+
auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1562+
// Field "a\b" should match pattern "a\\b" (escaped backslash)
1563+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"a\\b"})).value());
1564+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"axb"})).value());
1565+
1566+
// Escaped percent: "a\%b" matches literal "a%b"
1567+
ASSERT_OK_AND_ASSIGN(predicate_base,
1568+
PredicateBuilder::Like(
1569+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1570+
Literal(FieldType::STRING, "a\\%b", 4)));
1571+
predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1572+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"a%b"})).value());
1573+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"axb"})).value());
1574+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"axxb"})).value());
1575+
}
1576+
1577+
TEST_F(PredicateTest, TestLikeUtf8MultibyteUnderscore) {
1578+
auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", arrow::utf8())}));
1579+
1580+
// Single '_' should match one Unicode character, not one byte.
1581+
ASSERT_OK_AND_ASSIGN(auto predicate_base,
1582+
PredicateBuilder::Like(
1583+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1584+
Literal(FieldType::STRING, "_", 1)));
1585+
auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1586+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"中"})).value());
1587+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"中文"})).value());
1588+
1589+
// "a_c" where _ matches one Chinese character
1590+
ASSERT_OK_AND_ASSIGN(predicate_base,
1591+
PredicateBuilder::Like(
1592+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1593+
Literal(FieldType::STRING, "a_c", 3)));
1594+
predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1595+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"a中c"})).value());
1596+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"a中文c"})).value());
1597+
1598+
// "___" should match exactly 3 Unicode characters
1599+
ASSERT_OK_AND_ASSIGN(predicate_base,
1600+
PredicateBuilder::Like(
1601+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1602+
Literal(FieldType::STRING, "___", 3)));
1603+
predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1604+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"中文字"})).value());
1605+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"中文"})).value());
1606+
1607+
// '%' should still work with multi-byte characters
1608+
std::string pattern_contains = std::string("%") + "中" + "%";
1609+
ASSERT_OK_AND_ASSIGN(
1610+
predicate_base,
1611+
PredicateBuilder::Like(
1612+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1613+
Literal(FieldType::STRING, pattern_contains.data(), pattern_contains.size())));
1614+
predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1615+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"hello中world"})).value());
1616+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"helloworld"})).value());
1617+
}
1618+
1619+
TEST_F(PredicateTest, TestLikeJavaRegexLineTerminatorSemantics) {
1620+
auto arrow_schema = arrow::schema(arrow::FieldVector({arrow::field("f0", arrow::utf8())}));
1621+
1622+
// Java regex '.' does not match line terminators, so '_' should not match them either.
1623+
ASSERT_OK_AND_ASSIGN(auto predicate_base,
1624+
PredicateBuilder::Like(
1625+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1626+
Literal(FieldType::STRING, "_", 1)));
1627+
auto predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1628+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"\n"})).value());
1629+
ASSERT_FALSE(predicate->Test(arrow_schema, CreateStringRow({"\r"})).value());
1630+
1631+
// Java LIKE '%' uses (?s:.*), so it should still match line terminators.
1632+
ASSERT_OK_AND_ASSIGN(predicate_base,
1633+
PredicateBuilder::Like(
1634+
/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING,
1635+
Literal(FieldType::STRING, "%", 1)));
1636+
predicate = std::dynamic_pointer_cast<PredicateFilter>(predicate_base);
1637+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"\n"})).value());
1638+
ASSERT_TRUE(predicate->Test(arrow_schema, CreateStringRow({"\r"})).value());
1639+
}
1640+
15221641
TEST_F(PredicateTest, TestCompound) {
15231642
ASSERT_OK_AND_ASSIGN(
15241643
const auto startswith_predicate,

src/paimon/common/utils/string_utils.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,10 @@ Result<int32_t> StringUtils::StringToDate(const std::string& str) {
140140
if (ss.fail()) {
141141
return Status::Invalid(fmt::format("failed to convert string '{}' to date", str));
142142
}
143+
int32_t orig_mon = timeinfo.tm_mon;
144+
int32_t orig_mday = timeinfo.tm_mday;
143145
std::time_t time = timegm(&timeinfo);
144-
if (time == -1) {
146+
if (time == -1 || timeinfo.tm_mon != orig_mon || timeinfo.tm_mday != orig_mday) {
145147
return Status::Invalid(fmt::format("failed to convert string '{}' to date", str));
146148
}
147149
static const int64_t SECONDS_PER_DAY = 86400l; // = 24 * 60 * 60
@@ -207,8 +209,10 @@ Result<int64_t> StringUtils::StringToTimestampMillis(const std::string& str) {
207209
str));
208210
}
209211

212+
int32_t orig_mon = timeinfo.tm_mon;
213+
int32_t orig_mday = timeinfo.tm_mday;
210214
std::time_t time = mktime(&timeinfo);
211-
if (time == -1) {
215+
if (time == -1 || timeinfo.tm_mon != orig_mon || timeinfo.tm_mday != orig_mday) {
212216
return Status::Invalid(fmt::format("failed to convert string '{}' to timestamp", str));
213217
}
214218
return static_cast<int64_t>(time) * 1000 + millis_part;

0 commit comments

Comments
 (0)