Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/user/content/sql/functions/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ Operator | Computes

The regular expression syntax supported by Materialize is documented by the
[Rust `regex` crate](https://docs.rs/regex/*/#syntax).
The maximum length of a regular expression is 1 MiB in its raw form, and 10 MiB
after compiling it.

{{< warning >}}
Materialize regular expressions are similar to, but not identical to, PostgreSQL
Expand Down
6 changes: 3 additions & 3 deletions doc/user/data/sql_funcs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -350,11 +350,11 @@

- signature: 'lpad(s: str, len: int) -> str'
description: "Prepend `s` with spaces up to length `len`,
or right truncate if `len` is less than the length of `s`."
or right truncate if `len` is less than the length of `s`. The maximum length of the result string is 100 MiB."

- signature: 'lpad(s: str, len: int, p: str) -> str'
description: "Prepend `s` with characters pulled from `p` up to length `len`,
or right truncate if `len` is less than the length of `s`."
or right truncate if `len` is less than the length of `s`. The maximum length of the result string is 100 MiB."

- signature: 'ltrim(s: str) -> str'
description: Trim all spaces from the left side of `s`.
Expand Down Expand Up @@ -422,7 +422,7 @@
If `flags` is set to `i`, matches case-insensitively.
- signature: 'repeat(s: str, n: int) -> str'
description: Replicate the string `n` times.
description: Replicate the string `n` times. The maximum length of the result string is 100 MiB.

- signature: 'replace(s: str, f: str, r: str) -> str'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The replace function could also be inflationary... do we want limits on all such string functions? (Seems like:concat, concat_ws, decode, and possibly encode [a string under the limit in UTF-8 might not be in UTF-32]). If we're trying to enforce this invariant globally, also string_agg and anything coming from JSON.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

description: "`s` with all instances of `f` replaced with `r`."
Expand Down
4 changes: 2 additions & 2 deletions src/expr/src/relation/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use mz_repr::adt::array::ArrayDimension;
use mz_repr::adt::date::Date;
use mz_repr::adt::interval::Interval;
use mz_repr::adt::numeric::{self, Numeric, NumericMaxScale};
use mz_repr::adt::regex::Regex as ReprRegex;
use mz_repr::adt::regex::{Regex as ReprRegex, RegexCompilationError};
use mz_repr::adt::timestamp::{CheckedTimestamp, TimestampLike};
use mz_repr::{
ColumnName, Datum, Diff, Row, RowArena, RowPacker, SharedRow, SqlColumnType, SqlRelationType,
Expand Down Expand Up @@ -3122,7 +3122,7 @@ impl FromStr for AnalyzedRegexOpts {
pub struct AnalyzedRegex(ReprRegex, Vec<CaptureGroupDesc>, AnalyzedRegexOpts);

impl AnalyzedRegex {
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, regex::Error> {
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, RegexCompilationError> {
let r = ReprRegex::new(s, opts.case_insensitive)?;
// TODO(benesch): remove potentially dangerous usage of `as`.
#[allow(clippy::as_conversions)]
Expand Down
6 changes: 3 additions & 3 deletions src/expr/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use mz_repr::adt::array::InvalidArrayError;
use mz_repr::adt::date::DateError;
use mz_repr::adt::datetime::DateTimeUnits;
use mz_repr::adt::range::InvalidRangeError;
use mz_repr::adt::regex::Regex;
use mz_repr::adt::regex::{Regex, RegexCompilationError};
use mz_repr::adt::timestamp::TimestampError;
use mz_repr::strconv::{ParseError, ParseHexError};
use mz_repr::{Datum, Row, RowArena, SqlColumnType, SqlScalarType};
Expand Down Expand Up @@ -2892,8 +2892,8 @@ impl From<InvalidArrayError> for EvalError {
}
}

impl From<regex::Error> for EvalError {
fn from(e: regex::Error) -> EvalError {
impl From<RegexCompilationError> for EvalError {
fn from(e: RegexCompilationError) -> EvalError {
EvalError::InvalidRegex(e.to_string().into())
}
}
Expand Down
13 changes: 8 additions & 5 deletions src/expr/src/scalar/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,13 @@ pub use unary::{EagerUnaryFunc, LazyUnaryFunc, UnaryFunc};
pub use unmaterializable::UnmaterializableFunc;
pub use variadic::VariadicFunc;

/// The maximum size of a newly allocated string. Chosen to be the smallest number to keep our tests
/// passing without changing. 100MiB is probably higher than what we want, but it's better than no
/// limit.
const MAX_STRING_BYTES: usize = 1024 * 1024 * 100;
/// The maximum size of the result strings of certain string functions, such as `repeat` and `lpad`.
/// Chosen to be the smallest number to keep our tests passing without changing. 100MiB is probably
/// higher than what we want, but it's better than no limit.
///
/// Note: This number appears in our user-facing documentation in the function reference for every
/// function where it applies.
const MAX_STRING_FUNC_RESULT_BYTES: usize = 1024 * 1024 * 100;

pub fn jsonb_stringify<'a>(a: Datum<'a>, temp_storage: &'a RowArena) -> Datum<'a> {
match a {
Expand Down Expand Up @@ -4915,7 +4918,7 @@ fn repeat_string<'a>(
) -> Result<Datum<'a>, EvalError> {
let len = usize::try_from(count.unwrap_int32()).unwrap_or(0);
let string = string.unwrap_str();
if (len * string.len()) > MAX_STRING_BYTES {
if (len * string.len()) > MAX_STRING_FUNC_RESULT_BYTES {
return Err(EvalError::LengthTooLarge);
}
Ok(Datum::String(temp_storage.push_string(string.repeat(len))))
Expand Down
4 changes: 2 additions & 2 deletions src/expr/src/scalar/func/variadic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use sha1::Sha1;
use sha2::{Sha224, Sha256, Sha384, Sha512};

use crate::func::{
MAX_STRING_BYTES, array_create_scalar, build_regex, date_bin, parse_timezone,
MAX_STRING_FUNC_RESULT_BYTES, array_create_scalar, build_regex, date_bin, parse_timezone,
regexp_match_static, regexp_replace_parse_flags, regexp_replace_static,
regexp_split_to_array_re, stringify_datum, timezone_time,
};
Expand Down Expand Up @@ -719,7 +719,7 @@ fn pad_leading<'a>(
));
}
};
if len > MAX_STRING_BYTES {
if len > MAX_STRING_FUNC_RESULT_BYTES {
return Err(EvalError::LengthTooLarge);
}

Expand Down
7 changes: 5 additions & 2 deletions src/expr/src/scalar/like_pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use std::str::FromStr;
use derivative::Derivative;
use mz_lowertest::MzReflect;
use mz_ore::fmt::FormatBuffer;
use mz_repr::adt::regex::Regex;
use mz_repr::adt::regex::{Regex, RegexCompilationError};
use serde::{Deserialize, Serialize};

use crate::scalar::EvalError;
Expand Down Expand Up @@ -346,7 +346,10 @@ fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result<Reg
r.push('$');
match Regex::new(&r, case_insensitive) {
Ok(regex) => Ok(regex),
Err(regex::Error::CompiledTooBig(_)) => Err(EvalError::LikePatternTooLong),
Err(RegexCompilationError::PatternTooLarge { .. }) => Err(EvalError::LikePatternTooLong),
Err(RegexCompilationError::RegexError(regex::Error::CompiledTooBig(_))) => {
Err(EvalError::LikePatternTooLong)
}
Err(e) => Err(EvalError::Internal(
format!("build_regex produced invalid regex: {}", e).into(),
)),
Expand Down
2 changes: 1 addition & 1 deletion src/regexp/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ mod tests {
_ => anyhow::bail!("unexpected regex flags"),
}
}
Ok(Regex::new(needle, case_insensitive)?)
Regex::new(needle, case_insensitive).map_err(|e| anyhow::anyhow!("{}", e))
}

// Assert equivalency to postgres and generate TestCases.
Expand Down
57 changes: 55 additions & 2 deletions src/repr/src/adt/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,23 @@ use serde::de::Error as DeError;
use serde::ser::SerializeStruct;
use serde::{Deserialize, Deserializer, Serialize, Serializer, de};

/// Upper bound on the size of a regex after compilation (10 MiB), passed to
/// `RegexBuilder::size_limit`. This matches the `regex` crate's own default at the time of
/// writing; we set it explicitly so a future change to the crate's default cannot silently
/// relax it.
///
/// Note: This number is mentioned in our user-facing docs, in the "String operators" section of
/// the function reference.
const MAX_REGEX_SIZE_AFTER_COMPILATION: usize = 10 << 20;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NB that this is in fact the default NFA size limit already (but good to document and enforce!) https://docs.rs/regex/latest/src/regex/builders.rs.html#52

If our goal is to use less memory/prevent OOMs, we might want a smaller limit.

Copy link
Contributor Author

@ggevay ggevay Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NB that this is in fact the default NFA size limit already (but good to document and enforce!)

Yes, I just made the compiled regex size limit explicit to

  • prevent surprises if a future version of the Regex crate changes its default;
  • make the limit more easily discoverable when reading our source code.

If our goal is to use less memory/prevent OOMs, we might want a smaller limit.

Well, a 10 MiB compiled regex (if it really doesn't go above that) should be a walk in the park. Our envd mem limit is GBs. Btw. I also checked that making this limit even super small does not prevent the original issue unfortunately. I'm planning to send a bug report to the crate when I have time, because their docs kinda make it sound like they have good DOS prevention, so maybe this is something that they'd want to fix. (As we also discussed on zoom.)


/// The maximum size of a regex pattern before compilation, i.e., the byte length of the pattern
/// string: 1 MiB.
///
/// We need this limit in addition to `MAX_REGEX_SIZE_AFTER_COMPILATION`. Even though the
/// `Regex` crate promises that its `size_limit` option (which we set to
/// `MAX_REGEX_SIZE_AFTER_COMPILATION`) prevents excessive resource usage, this doesn't seem to
/// be the case in practice. Since we compile regexes in envd, we need strict limits to prevent
/// envd OOMs. See <https://github.com/MaterializeInc/database-issues/issues/9907> for an example.
///
/// Note: This number is mentioned in our user-facing docs, in the "String operators" section of
/// the function reference.
const MAX_REGEX_SIZE_BEFORE_COMPILATION: usize = 1024 * 1024;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we document this constant in our docs?

Is it a potential regression that we don't accept a query that works fine at the moment?

Copy link
Contributor Author

@ggevay ggevay Nov 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'll add it in the docs. Edit: Done.

Unfortunately this could indeed introduce a regression. I'm hoping that the 1 million limit is big enough that nobody has a regex this big at the moment. If somebody happens to have such a big regex in their catalog, then the upgrade-check would fail, in which case we'd make a new RC with a bigger limit or reverting this change. I'd say this risk is acceptable in cloud. In self-managed, running into this would be a bit more troublesome, because there would have to be some back-and-forth with the user, but considering the very low chance, maybe it's acceptable?

(Guarding this with a feature flag would unfortunately be very hard at this point in the code.)

This is what Postgres' docs says about big regexes:

No particular limit is imposed on the length of REs in this implementation. However, programs intended to be highly portable should not employ REs longer than 256 bytes, as a POSIX-compliant implementation can refuse to accept such REs.

In practice, Postgres seems to fail at lengths of tens of thousands on regexes that look like the ones causing us trouble in https://github.com/MaterializeInc/database-issues/issues/9907:

postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '10000');
ERROR:  invalid regular expression: regular expression is too complex
postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '4000');
ERROR:  invalid regular expression: regular expression is too complex
postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '2000');
ERROR:  invalid regular expression: regular expression is too complex
postgres=# SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '1000');
 ?column? 
----------
 f
(1 row)

I also did some quick googling, and couldn't find any case of someone talking about a regex longer than tens of thousands. Claude also says

When developers discuss "large" regexes in production environments, they're typically talking about patterns measured in hundreds or low thousands of characters, not millions.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can simply check this for existing views using mzexplore---should be quick to export data and run some jq queries.

Copy link
Contributor Author

@ggevay ggevay Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately we can do this only for cloud, and I'm not too worried about cloud. In self-managed it would be somewhat more of a hassle if we need a patch release.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, right. Feature flag it (or disable the limit in unsafe mode)? Too much trouble to ask the self-managed users?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feature flag it

Unfortunately we can't feature flag it, because we don't have access to feature flag values in scalar function evaluations. Flags would have to be wired in from super far. (And the extra context being passed around scalar evaluation might even have a non-trivial performance impact.)

disable the limit in unsafe mode

Unsafe mode enables too much unsafe stuff, so we'd like to never tell a customer to turn it on. It's more a testing thing than a user-facing escape hatch.

Too much trouble to ask the self-managed users?

Well, I'd say it's acceptable, considering that the risk of a self-managed user already having such a big regex is quite low.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, you've convinced me!


/// A hashable, comparable, and serializable regular expression type.
///
/// The [`regex::Regex`] type, the de facto standard regex type in Rust, does
Expand Down Expand Up @@ -58,7 +75,7 @@ impl Regex {
/// A simple constructor for the default setting of `dot_matches_new_line: true`.
/// See <https://www.postgresql.org/docs/current/functions-matching.html#POSIX-MATCHING-RULES>
/// "newline-sensitive matching"
pub fn new(pattern: &str, case_insensitive: bool) -> Result<Regex, Error> {
pub fn new(pattern: &str, case_insensitive: bool) -> Result<Regex, RegexCompilationError> {
Self::new_dot_matches_new_line(pattern, case_insensitive, true)
}

Expand All @@ -67,10 +84,16 @@ impl Regex {
pattern: &str,
case_insensitive: bool,
dot_matches_new_line: bool,
) -> Result<Regex, Error> {
) -> Result<Regex, RegexCompilationError> {
if pattern.len() > MAX_REGEX_SIZE_BEFORE_COMPILATION {
return Err(RegexCompilationError::PatternTooLarge {
pattern_size: pattern.len(),
});
}
let mut regex_builder = RegexBuilder::new(pattern);
regex_builder.case_insensitive(case_insensitive);
regex_builder.dot_matches_new_line(dot_matches_new_line);
regex_builder.size_limit(MAX_REGEX_SIZE_AFTER_COMPILATION);
Ok(Regex {
case_insensitive,
dot_matches_new_line,
Expand All @@ -86,6 +109,36 @@ impl Regex {
}
}

/// Error type for regex compilation failures.
///
/// Covers both errors reported by the `regex` crate itself and our own pre-compilation check
/// on the raw pattern length (see `MAX_REGEX_SIZE_BEFORE_COMPILATION`).
#[derive(Debug, Clone)]
pub enum RegexCompilationError {
    /// Wrapper for the `regex` crate's `Error` type, e.g., a syntax error or the compiled regex
    /// exceeding the `size_limit` we configure (`MAX_REGEX_SIZE_AFTER_COMPILATION`).
    RegexError(Error),
    /// The regex pattern's size before compilation exceeds `MAX_REGEX_SIZE_BEFORE_COMPILATION`;
    /// `pattern_size` is the rejected pattern's length in bytes.
    PatternTooLarge { pattern_size: usize },
}

// Human-readable rendering of regex compilation errors; this text ultimately surfaces to users
// (e.g., via `EvalError::InvalidRegex`).
impl fmt::Display for RegexCompilationError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            // Defer to the `regex` crate's own error message.
            RegexCompilationError::RegexError(e) => write!(f, "{}", e),
            // Fix: the destructured binding was previously misspelled as `patter_size`.
            RegexCompilationError::PatternTooLarge { pattern_size } => write!(
                f,
                "regex pattern too large ({} bytes, max {} bytes)",
                pattern_size, MAX_REGEX_SIZE_BEFORE_COMPILATION
            ),
        }
    }
}

impl From<Error> for RegexCompilationError {
fn from(e: Error) -> Self {
RegexCompilationError::RegexError(e)
}
}

impl PartialEq<Regex> for Regex {
fn eq(&self, other: &Regex) -> bool {
self.pattern() == other.pattern()
Expand Down
3 changes: 3 additions & 0 deletions test/sqllogictest/regex.slt
Original file line number Diff line number Diff line change
Expand Up @@ -818,5 +818,8 @@ SELECT regexp_matches(bar, '(b[^b]+)(b[^b]+)(b[^b]+)', 'ig') FROM foo2;
db error: ERROR: function regexp_matches(smallint, unknown, unknown) does not exist
HINT: No function matches the given name and argument types. You might need to add explicit type casts.

statement error db error: ERROR: invalid regular expression: regex pattern too large \(62400000 bytes, max 1048576 bytes\)
SELECT 'aaaaaaaaaaa' ~ repeat('vxx.0.0-rc.4 (5b079d80c)', '2600000');

statement ok
DROP CLUSTER multiprocess;