Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/user/content/sql/functions/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ Operator | Computes

The regular expression syntax supported by Materialize is documented by the
[Rust `regex` crate](https://docs.rs/regex/*/#syntax).
The maximum length of a regular expression is 1 MiB in its raw form, and 10 MiB
after compiling it.

{{< warning >}}
Materialize regular expressions are similar to, but not identical to, PostgreSQL
Expand Down
6 changes: 3 additions & 3 deletions doc/user/data/sql_funcs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -350,11 +350,11 @@

- signature: 'lpad(s: str, len: int) -> str'
description: "Prepend `s` with spaces up to length `len`,
or right truncate if `len` is less than the length of `s`."
or right truncate if `len` is less than the length of `s`. The maximum length of the result string is 100 MiB."

- signature: 'lpad(s: str, len: int, p: str) -> str'
description: "Prepend `s` with characters pulled from `p` up to length `len`,
or right truncate if `len` is less than the length of `s`."
or right truncate if `len` is less than the length of `s`. The maximum length of the result string is 100 MiB."

- signature: 'ltrim(s: str) -> str'
description: Trim all spaces from the left side of `s`.
Expand Down Expand Up @@ -422,7 +422,7 @@
If `flags` is set to `i`, matches case-insensitively.
- signature: 'repeat(s: str, n: int) -> str'
description: Replicate the string `n` times.
description: Replicate the string `n` times. The maximum length of the result string is 100 MiB.

- signature: 'replace(s: str, f: str, r: str) -> str'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The replace function could also be inflationary... do we want limits on all such string functions? (Seems like:concat, concat_ws, decode, and possibly encode [a string under the limit in UTF-8 might not be in UTF-32]). If we're trying to enforce this invariant globally, also string_agg and anything coming from JSON.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

description: "`s` with all instances of `f` replaced with `r`."
Expand Down
4 changes: 2 additions & 2 deletions src/expr/src/relation/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use mz_repr::adt::array::ArrayDimension;
use mz_repr::adt::date::Date;
use mz_repr::adt::interval::Interval;
use mz_repr::adt::numeric::{self, Numeric, NumericMaxScale};
use mz_repr::adt::regex::Regex as ReprRegex;
use mz_repr::adt::regex::{Regex as ReprRegex, RegexCompilationError};
use mz_repr::adt::timestamp::{CheckedTimestamp, TimestampLike};
use mz_repr::{
ColumnName, Datum, Diff, Row, RowArena, RowPacker, SharedRow, SqlColumnType, SqlRelationType,
Expand Down Expand Up @@ -3122,7 +3122,7 @@ impl FromStr for AnalyzedRegexOpts {
pub struct AnalyzedRegex(ReprRegex, Vec<CaptureGroupDesc>, AnalyzedRegexOpts);

impl AnalyzedRegex {
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, regex::Error> {
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, RegexCompilationError> {
let r = ReprRegex::new(s, opts.case_insensitive)?;
// TODO(benesch): remove potentially dangerous usage of `as`.
#[allow(clippy::as_conversions)]
Expand Down
6 changes: 3 additions & 3 deletions src/expr/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use mz_repr::adt::array::InvalidArrayError;
use mz_repr::adt::date::DateError;
use mz_repr::adt::datetime::DateTimeUnits;
use mz_repr::adt::range::InvalidRangeError;
use mz_repr::adt::regex::Regex;
use mz_repr::adt::regex::{Regex, RegexCompilationError};
use mz_repr::adt::timestamp::TimestampError;
use mz_repr::strconv::{ParseError, ParseHexError};
use mz_repr::{Datum, Row, RowArena, SqlColumnType, SqlScalarType};
Expand Down Expand Up @@ -2892,8 +2892,8 @@ impl From<InvalidArrayError> for EvalError {
}
}

impl From<regex::Error> for EvalError {
fn from(e: regex::Error) -> EvalError {
impl From<RegexCompilationError> for EvalError {
fn from(e: RegexCompilationError) -> EvalError {
EvalError::InvalidRegex(e.to_string().into())
}
}
Expand Down
13 changes: 8 additions & 5 deletions src/expr/src/scalar/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,13 @@ pub use unary::{EagerUnaryFunc, LazyUnaryFunc, UnaryFunc};
pub use unmaterializable::UnmaterializableFunc;
pub use variadic::VariadicFunc;

/// The maximum size of a newly allocated string. Chosen to be the smallest number to keep our tests
/// passing without changing. 100MiB is probably higher than what we want, but it's better than no
/// limit.
const MAX_STRING_BYTES: usize = 1024 * 1024 * 100;
/// The maximum size of the result strings of certain string functions, such as `repeat` and `lpad`.
/// Chosen to be the smallest number to keep our tests passing without changing. 100MiB is probably
/// higher than what we want, but it's better than no limit.
///
/// Note: This number appears in our user-facing documentation in the function reference for every
/// function where it applies.
const MAX_STRING_FUNC_RESULT_BYTES: usize = 1024 * 1024 * 100;

pub fn jsonb_stringify<'a>(a: Datum<'a>, temp_storage: &'a RowArena) -> Datum<'a> {
match a {
Expand Down Expand Up @@ -4915,7 +4918,7 @@ fn repeat_string<'a>(
) -> Result<Datum<'a>, EvalError> {
let len = usize::try_from(count.unwrap_int32()).unwrap_or(0);
let string = string.unwrap_str();
if (len * string.len()) > MAX_STRING_BYTES {
if (len * string.len()) > MAX_STRING_FUNC_RESULT_BYTES {
return Err(EvalError::LengthTooLarge);
}
Ok(Datum::String(temp_storage.push_string(string.repeat(len))))
Expand Down
4 changes: 2 additions & 2 deletions src/expr/src/scalar/func/variadic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use sha1::Sha1;
use sha2::{Sha224, Sha256, Sha384, Sha512};

use crate::func::{
MAX_STRING_BYTES, array_create_scalar, build_regex, date_bin, parse_timezone,
MAX_STRING_FUNC_RESULT_BYTES, array_create_scalar, build_regex, date_bin, parse_timezone,
regexp_match_static, regexp_replace_parse_flags, regexp_replace_static,
regexp_split_to_array_re, stringify_datum, timezone_time,
};
Expand Down Expand Up @@ -719,7 +719,7 @@ fn pad_leading<'a>(
));
}
};
if len > MAX_STRING_BYTES {
if len > MAX_STRING_FUNC_RESULT_BYTES {
return Err(EvalError::LengthTooLarge);
}

Expand Down
7 changes: 5 additions & 2 deletions src/expr/src/scalar/like_pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use std::str::FromStr;
use derivative::Derivative;
use mz_lowertest::MzReflect;
use mz_ore::fmt::FormatBuffer;
use mz_repr::adt::regex::Regex;
use mz_repr::adt::regex::{Regex, RegexCompilationError};
use serde::{Deserialize, Serialize};

use crate::scalar::EvalError;
Expand Down Expand Up @@ -346,7 +346,10 @@ fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result<Reg
r.push('$');
match Regex::new(&r, case_insensitive) {
Ok(regex) => Ok(regex),
Err(regex::Error::CompiledTooBig(_)) => Err(EvalError::LikePatternTooLong),
Err(RegexCompilationError::PatternTooLarge { .. }) => Err(EvalError::LikePatternTooLong),
Err(RegexCompilationError::RegexError(regex::Error::CompiledTooBig(_))) => {
Err(EvalError::LikePatternTooLong)
}
Err(e) => Err(EvalError::Internal(
format!("build_regex produced invalid regex: {}", e).into(),
)),
Expand Down
2 changes: 1 addition & 1 deletion src/regexp/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ mod tests {
_ => anyhow::bail!("unexpected regex flags"),
}
}
Ok(Regex::new(needle, case_insensitive)?)
Regex::new(needle, case_insensitive).map_err(|e| anyhow::anyhow!("{}", e))
}

// Assert equivalency to postgres and generate TestCases.
Expand Down
57 changes: 55 additions & 2 deletions src/repr/src/adt/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,23 @@ use serde::de::Error as DeError;
use serde::ser::SerializeStruct;
use serde::{Deserialize, Deserializer, Serialize, Serializer, de};

/// Upper bound on the size of a regex after compilation (10 MiB), passed to
/// `RegexBuilder::size_limit`. This matches the `regex` crate's own default at the time of
/// writing; we set it explicitly so a future change to the crate's default cannot silently
/// relax it.
///
/// Note: This number is mentioned in our user-facing docs, in the "String operators" section of
/// the function reference.
const MAX_REGEX_SIZE_AFTER_COMPILATION: usize = 10 << 20;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NB that this is in fact the default NFA size limit already (but good to document and enforce!) https://docs.rs/regex/latest/src/regex/builders.rs.html#52

If our goal is to use less memory/prevent OOMs, we might want a smaller limit.

Copy link
Contributor Author

@ggevay ggevay Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NB that this is in fact the default NFA size limit already (but good to document and enforce!)

Yes, I just made the compiled regex size limit explicit to

  • prevent surprises if a future version of the Regex crate changes its default;
  • make the limit more easily discoverable when reading our source code.

If our goal is to use less memory/prevent OOMs, we might want a smaller limit.

Well, a 10 MiB compiled regex (if it really doesn't go above that) should be a walk in the park. Our envd mem limit is GBs. Btw. I also checked that making this limit even super small does not prevent the original issue unfortunately. I'm planning to send a bug report to the crate when I have time, because their docs kinda make it sound like they have good DOS prevention, so maybe this is something that they'd want to fix. (As we also discussed on zoom.)


/// The maximum size of a regex pattern before compilation, i.e., the byte length of the pattern
/// string: 1 MiB.
///
/// We need this limit in addition to `MAX_REGEX_SIZE_AFTER_COMPILATION`. Even though the
/// `Regex` crate promises that its `size_limit` option (which we set to
/// `MAX_REGEX_SIZE_AFTER_COMPILATION`) prevents excessive resource usage, this doesn't seem to
/// be the case in practice. Since we compile regexes in envd, we need strict limits to prevent
/// envd OOMs. See <https://github.com/MaterializeInc/database-issues/issues/9907> for an example.
///
/// Note: This number is mentioned in our user-facing docs, in the "String operators" section of
/// the function reference.
const MAX_REGEX_SIZE_BEFORE_COMPILATION: usize = 1024 * 1024;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we document this constant in our docs?

Is it a potential regression that we don't accept a query that works fine at the moment?

Copy link
Contributor Author

@ggevay ggevay Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'll add it in the docs. Edit: Done.

Unfortunately this could indeed introduce a regression. I'm hoping that the 1 million limit is big enough that nobody has a regex this big at the moment. If somebody happens to have such a big regex in their catalog, then the upgrade-check would fail, in which case we'd make a new RC with a bigger limit or reverting this change. I'd say this risk is acceptable in cloud. In self-managed, running into this would be a bit more troublesome, because there would have to be some back-and-forth with the user, but considering the very low chance, maybe it's acceptable?

(Guarding this with a feature flag would unfortunately be very hard at this point in the code.)

This is what Postgres' docs says about big regexes:

No particular limit is imposed on the length of REs in this implementation. However, programs intended to be highly portable should not employ REs longer than 256 bytes, as a POSIX-compliant implementation can refuse to accept such REs.

In practice, Postgres seems to fail at lengths of tens of thousands on regexes that look like the ones causing us trouble in https://github.com/MaterializeInc/database-issues/issues/9907:

postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '10000');
ERROR:  invalid regular expression: regular expression is too complex
postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '4000');
ERROR:  invalid regular expression: regular expression is too complex
postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '2000');
ERROR:  invalid regular expression: regular expression is too complex
postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '1000');
 ?column? 
----------
 f
(1 row)

I also did some quick googling, and couldn't find any case of someone talking about a regex longer than tens of thousands. Claude also says

When developers discuss "large" regexes in production environments, they're typically talking about patterns measured in hundreds or low thousands of characters, not millions.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can simply check this for existing views using mzexplore---should be quick to export data and run some jq queries.

Copy link
Contributor Author

@ggevay ggevay Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately we can do this only for cloud, and I'm not too worried about cloud. In self-managed it would be somewhat more of a hassle if we need a patch release.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, right. Feature flag it (or disable the limit in unsafe mode)? Too much trouble to ask the self-managed users?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feature flag it

Unfortunately we can't feature flag it, because we don't have access to feature flag values in scalar function evaluations. Flags would have to be wired in from super far. (And the extra context being passed around scalar evaluation might even have a non-trivial performance impact.)

disable the limit in unsafe mode

Unsafe mode enables too much unsafe stuff, so we'd like to never tell a customer to turn it on. It's more a testing thing than a user-facing escape hatch.

Too much trouble to ask the self-managed users?

Well, I'd say it's acceptable, considering that the risk of a self-managed user already having such a big regex is quite low.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, you've convinced me!


/// A hashable, comparable, and serializable regular expression type.
///
/// The [`regex::Regex`] type, the de facto standard regex type in Rust, does
Expand Down Expand Up @@ -58,7 +75,7 @@ impl Regex {
/// A simple constructor for the default setting of `dot_matches_new_line: true`.
/// See <https://www.postgresql.org/docs/current/functions-matching.html#POSIX-MATCHING-RULES>
/// "newline-sensitive matching"
pub fn new(pattern: &str, case_insensitive: bool) -> Result<Regex, Error> {
pub fn new(pattern: &str, case_insensitive: bool) -> Result<Regex, RegexCompilationError> {
Self::new_dot_matches_new_line(pattern, case_insensitive, true)
}

Expand All @@ -67,10 +84,16 @@ impl Regex {
pattern: &str,
case_insensitive: bool,
dot_matches_new_line: bool,
) -> Result<Regex, Error> {
) -> Result<Regex, RegexCompilationError> {
if pattern.len() > MAX_REGEX_SIZE_BEFORE_COMPILATION {
return Err(RegexCompilationError::PatternTooLarge {
pattern_size: pattern.len(),
});
}
let mut regex_builder = RegexBuilder::new(pattern);
regex_builder.case_insensitive(case_insensitive);
regex_builder.dot_matches_new_line(dot_matches_new_line);
regex_builder.size_limit(MAX_REGEX_SIZE_AFTER_COMPILATION);
Ok(Regex {
case_insensitive,
dot_matches_new_line,
Expand All @@ -86,6 +109,36 @@ impl Regex {
}
}

/// Error type for regex compilation failures.
///
/// Covers both errors reported by the `regex` crate itself and our own pre-compilation check
/// on the raw pattern length (see `MAX_REGEX_SIZE_BEFORE_COMPILATION`).
#[derive(Debug, Clone)]
pub enum RegexCompilationError {
    /// Wrapper for the `regex` crate's `Error` type, e.g., a syntax error or the compiled regex
    /// exceeding the `size_limit` we configure (`MAX_REGEX_SIZE_AFTER_COMPILATION`).
    RegexError(Error),
    /// The regex pattern's size before compilation exceeds `MAX_REGEX_SIZE_BEFORE_COMPILATION`;
    /// `pattern_size` is the rejected pattern's length in bytes.
    PatternTooLarge { pattern_size: usize },
}

// Human-readable rendering of regex compilation errors; this text ultimately surfaces to users
// (e.g., via `EvalError::InvalidRegex`).
impl fmt::Display for RegexCompilationError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            // Defer to the `regex` crate's own error message.
            RegexCompilationError::RegexError(e) => write!(f, "{}", e),
            // Fix: the destructured binding was previously misspelled as `patter_size`.
            RegexCompilationError::PatternTooLarge { pattern_size } => write!(
                f,
                "regex pattern too large ({} bytes, max {} bytes)",
                pattern_size, MAX_REGEX_SIZE_BEFORE_COMPILATION
            ),
        }
    }
}

impl From<Error> for RegexCompilationError {
fn from(e: Error) -> Self {
RegexCompilationError::RegexError(e)
}
}

impl PartialEq<Regex> for Regex {
fn eq(&self, other: &Regex) -> bool {
self.pattern() == other.pattern()
Expand Down
3 changes: 3 additions & 0 deletions test/sqllogictest/regex.slt
Original file line number Diff line number Diff line change
Expand Up @@ -818,5 +818,8 @@ SELECT regexp_matches(bar, '(b[^b]+)(b[^b]+)(b[^b]+)', 'ig') FROM foo2;
db error: ERROR: function regexp_matches(smallint, unknown, unknown) does not exist
HINT: No function matches the given name and argument types. You might need to add explicit type casts.

statement error db error: ERROR: invalid regular expression: regex pattern too large \(62400000 bytes, max 1048576 bytes\)
SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '2600000');

statement ok
DROP CLUSTER multiprocess;