Skip to content

Commit 1125338

Browse files
committed
Introduce a regex size limit before compilation
Also, pin the post-compilation regex size limit to a constant that we control, to prevent surprises if a future version of the `regex` crate changes its default.
1 parent 2ddff70 commit 1125338

File tree

9 files changed

+77
-13
lines changed

9 files changed

+77
-13
lines changed

doc/user/content/sql/functions/_index.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ Operator | Computes
112112

113113
The regular expression syntax supported by Materialize is documented by the
114114
[Rust `regex` crate](https://docs.rs/regex/*/#syntax).
115+
The maximum size of a regular expression is 1 MiB in its raw form, and 10 MiB
116+
after compiling it.
115117

116118
{{< warning >}}
117119
Materialize regular expressions are similar to, but not identical to, PostgreSQL

doc/user/data/sql_funcs.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -350,11 +350,11 @@
350350

351351
- signature: 'lpad(s: str, len: int) -> str'
352352
description: "Prepend `s` with spaces up to length `len`,
353-
or right truncate if `len` is less than the length of `s`."
353+
or right truncate if `len` is less than the length of `s`. The maximum length of the result string is 100 MiB."
354354

355355
- signature: 'lpad(s: str, len: int, p: str) -> str'
356356
description: "Prepend `s` with characters pulled from `p` up to length `len`,
357-
or right truncate if `len` is less than the length of `s`."
357+
or right truncate if `len` is less than the length of `s`. The maximum length of the result string is 100 MiB."
358358

359359
- signature: 'ltrim(s: str) -> str'
360360
description: Trim all spaces from the left side of `s`.
@@ -422,7 +422,7 @@
422422
If `flags` is set to `i`, matches case-insensitively.
423423
424424
- signature: 'repeat(s: str, n: int) -> str'
425-
description: Replicate the string `n` times.
425+
description: Replicate the string `n` times. The maximum length of the result string is 100 MiB.
426426

427427
- signature: 'replace(s: str, f: str, r: str) -> str'
428428
description: "`s` with all instances of `f` replaced with `r`."

src/expr/src/relation/func.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ use mz_repr::adt::array::ArrayDimension;
2727
use mz_repr::adt::date::Date;
2828
use mz_repr::adt::interval::Interval;
2929
use mz_repr::adt::numeric::{self, Numeric, NumericMaxScale};
30-
use mz_repr::adt::regex::Regex as ReprRegex;
30+
use mz_repr::adt::regex::{Regex as ReprRegex, RegexCompilationError};
3131
use mz_repr::adt::timestamp::{CheckedTimestamp, TimestampLike};
3232
use mz_repr::{
3333
ColumnName, Datum, Diff, Row, RowArena, RowPacker, SharedRow, SqlColumnType, SqlRelationType,
@@ -3122,7 +3122,7 @@ impl FromStr for AnalyzedRegexOpts {
31223122
pub struct AnalyzedRegex(ReprRegex, Vec<CaptureGroupDesc>, AnalyzedRegexOpts);
31233123

31243124
impl AnalyzedRegex {
3125-
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, regex::Error> {
3125+
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, RegexCompilationError> {
31263126
let r = ReprRegex::new(s, opts.case_insensitive)?;
31273127
// TODO(benesch): remove potentially dangerous usage of `as`.
31283128
#[allow(clippy::as_conversions)]

src/expr/src/scalar.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use mz_repr::adt::array::InvalidArrayError;
2828
use mz_repr::adt::date::DateError;
2929
use mz_repr::adt::datetime::DateTimeUnits;
3030
use mz_repr::adt::range::InvalidRangeError;
31-
use mz_repr::adt::regex::Regex;
31+
use mz_repr::adt::regex::{Regex, RegexCompilationError};
3232
use mz_repr::adt::timestamp::TimestampError;
3333
use mz_repr::strconv::{ParseError, ParseHexError};
3434
use mz_repr::{Datum, Row, RowArena, SqlColumnType, SqlScalarType};
@@ -2892,8 +2892,8 @@ impl From<InvalidArrayError> for EvalError {
28922892
}
28932893
}
28942894

2895-
impl From<regex::Error> for EvalError {
2896-
fn from(e: regex::Error) -> EvalError {
2895+
impl From<RegexCompilationError> for EvalError {
2896+
fn from(e: RegexCompilationError) -> EvalError {
28972897
EvalError::InvalidRegex(e.to_string().into())
28982898
}
28992899
}

src/expr/src/scalar/func.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ pub use variadic::VariadicFunc;
7676
/// The maximum size of the result strings of certain string functions, such as `repeat` and `lpad`.
7777
/// Chosen to be the smallest number that keeps our existing tests passing unchanged. 100 MiB is probably
7878
/// higher than what we want, but it's better than no limit.
79+
///
80+
/// Note: This number appears in our user-facing documentation in the function reference for every
81+
/// function where it applies.
7982
const MAX_STRING_FUNC_RESULT_BYTES: usize = 1024 * 1024 * 100;
8083

8184
pub fn jsonb_stringify<'a>(a: Datum<'a>, temp_storage: &'a RowArena) -> Datum<'a> {

src/expr/src/scalar/like_pattern.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use std::str::FromStr;
1313
use derivative::Derivative;
1414
use mz_lowertest::MzReflect;
1515
use mz_ore::fmt::FormatBuffer;
16-
use mz_repr::adt::regex::Regex;
16+
use mz_repr::adt::regex::{Regex, RegexCompilationError};
1717
use serde::{Deserialize, Serialize};
1818

1919
use crate::scalar::EvalError;
@@ -346,7 +346,10 @@ fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result<Reg
346346
r.push('$');
347347
match Regex::new(&r, case_insensitive) {
348348
Ok(regex) => Ok(regex),
349-
Err(regex::Error::CompiledTooBig(_)) => Err(EvalError::LikePatternTooLong),
349+
Err(RegexCompilationError::PatternTooLarge { .. }) => Err(EvalError::LikePatternTooLong),
350+
Err(RegexCompilationError::RegexError(regex::Error::CompiledTooBig(_))) => {
351+
Err(EvalError::LikePatternTooLong)
352+
}
350353
Err(e) => Err(EvalError::Internal(
351354
format!("build_regex produced invalid regex: {}", e).into(),
352355
)),

src/regexp/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ mod tests {
6060
_ => anyhow::bail!("unexpected regex flags"),
6161
}
6262
}
63-
Ok(Regex::new(needle, case_insensitive)?)
63+
Regex::new(needle, case_insensitive).map_err(|e| anyhow::anyhow!("{}", e))
6464
}
6565

6666
// Assert equivalency to postgres and generate TestCases.

src/repr/src/adt/regex.rs

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,23 @@ use serde::de::Error as DeError;
2121
use serde::ser::SerializeStruct;
2222
use serde::{Deserialize, Deserializer, Serialize, Serializer, de};
2323

24+
/// The maximum size of a regex after compilation.
///
/// This equals the `regex` crate's default at the time of writing. It is pinned here so that a
/// future version of the crate changing its default cannot silently change our limit.
///
/// Note: This number is mentioned in our user-facing docs, in the "String operators" section of
/// the function reference.
const MAX_REGEX_SIZE_AFTER_COMPILATION: usize = 10 * 1024 * 1024;

/// The maximum size, in bytes, of a regex pattern before compilation.
///
/// This limit is needed in addition to `MAX_REGEX_SIZE_AFTER_COMPILATION`. Even though the
/// `regex` crate promises that its `size_limit` option (which we set to
/// `MAX_REGEX_SIZE_AFTER_COMPILATION`) prevents excessive resource usage, this doesn't seem to
/// be the case. Since we compile regexes in envd, we need strict limits to prevent envd OOMs.
/// See <https://github.com/MaterializeInc/database-issues/issues/9907> for an example.
///
/// Note: This number is mentioned in our user-facing docs, in the "String operators" section of
/// the function reference.
const MAX_REGEX_SIZE_BEFORE_COMPILATION: usize = 1024 * 1024;
40+
2441
/// A hashable, comparable, and serializable regular expression type.
2542
///
2643
/// The [`regex::Regex`] type, the de facto standard regex type in Rust, does
@@ -58,7 +75,7 @@ impl Regex {
5875
/// A simple constructor for the default setting of `dot_matches_new_line: true`.
5976
/// See <https://www.postgresql.org/docs/current/functions-matching.html#POSIX-MATCHING-RULES>
6077
/// "newline-sensitive matching"
61-
pub fn new(pattern: &str, case_insensitive: bool) -> Result<Regex, Error> {
78+
pub fn new(pattern: &str, case_insensitive: bool) -> Result<Regex, RegexCompilationError> {
6279
Self::new_dot_matches_new_line(pattern, case_insensitive, true)
6380
}
6481

@@ -67,10 +84,16 @@ impl Regex {
6784
pattern: &str,
6885
case_insensitive: bool,
6986
dot_matches_new_line: bool,
70-
) -> Result<Regex, Error> {
87+
) -> Result<Regex, RegexCompilationError> {
88+
if pattern.len() > MAX_REGEX_SIZE_BEFORE_COMPILATION {
89+
return Err(RegexCompilationError::PatternTooLarge {
90+
pattern_size: pattern.len(),
91+
});
92+
}
7193
let mut regex_builder = RegexBuilder::new(pattern);
7294
regex_builder.case_insensitive(case_insensitive);
7395
regex_builder.dot_matches_new_line(dot_matches_new_line);
96+
regex_builder.size_limit(MAX_REGEX_SIZE_AFTER_COMPILATION);
7497
Ok(Regex {
7598
case_insensitive,
7699
dot_matches_new_line,
@@ -86,6 +109,36 @@ impl Regex {
86109
}
87110
}
88111

112+
/// Error type for regex compilation failures.
113+
#[derive(Debug, Clone)]
114+
pub enum RegexCompilationError {
115+
/// Wrapper for regex crate's Error type.
116+
RegexError(Error),
117+
/// Regex pattern size exceeds MAX_REGEX_SIZE_BEFORE_COMPILATION.
118+
PatternTooLarge { pattern_size: usize },
119+
}
120+
121+
impl fmt::Display for RegexCompilationError {
122+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
123+
match self {
124+
RegexCompilationError::RegexError(e) => write!(f, "{}", e),
125+
RegexCompilationError::PatternTooLarge {
126+
pattern_size: patter_size,
127+
} => write!(
128+
f,
129+
"regex pattern too large ({} bytes, max {} bytes)",
130+
patter_size, MAX_REGEX_SIZE_BEFORE_COMPILATION
131+
),
132+
}
133+
}
134+
}
135+
136+
impl From<Error> for RegexCompilationError {
137+
fn from(e: Error) -> Self {
138+
RegexCompilationError::RegexError(e)
139+
}
140+
}
141+
89142
impl PartialEq<Regex> for Regex {
90143
fn eq(&self, other: &Regex) -> bool {
91144
self.pattern() == other.pattern()

test/sqllogictest/regex.slt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,5 +818,8 @@ SELECT regexp_matches(bar, '(b[^b]+)(b[^b]+)(b[^b]+)', 'ig') FROM foo2;
818818
db error: ERROR: function regexp_matches(smallint, unknown, unknown) does not exist
819819
HINT: No function matches the given name and argument types. You might need to add explicit type casts.
820820

821+
statement error db error: ERROR: invalid regular expression: regex pattern too large \(62400000 bytes, max 1048576 bytes\)
822+
SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '2600000');
823+
821824
statement ok
822825
DROP CLUSTER multiprocess;

0 commit comments

Comments
 (0)