diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index b5ab46f0ec713..9a4f2219b4d1f 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -189,13 +189,15 @@ fn regexp_replace_func(args: &[ColumnarValue]) -> Result { } } -/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1}) +/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like ${1}) /// used by regexp_replace +/// Handles both single backslash (\1) and double backslash (\\1) which can occur +/// when SQL strings with escaped backslashes are passed through fn regex_replace_posix_groups(replacement: &str) -> String { static CAPTURE_GROUPS_RE_LOCK: LazyLock = - LazyLock::new(|| Regex::new(r"(\\)(\d*)").unwrap()); + LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap()); CAPTURE_GROUPS_RE_LOCK - .replace_all(replacement, "$${$2}") + .replace_all(replacement, "$${$1}") .into_owned() } @@ -659,6 +661,33 @@ mod tests { use super::*; + #[test] + fn test_regex_replace_posix_groups() { + // Test that \1, \2, etc. are replaced with ${1}, ${2}, etc. + assert_eq!(regex_replace_posix_groups(r"\1"), "${1}"); + assert_eq!(regex_replace_posix_groups(r"\12"), "${12}"); + assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y"); + assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}"); + + // Test double backslash (from SQL escaped strings like '\\1') + assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}"); + assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y"); + assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}"); + + // Test 3 or 4 backslashes before digits to document expected behavior + assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}"); + assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}"); + assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}"); + + // Test that a lone backslash is NOT replaced (requires at least one digit) + assert_eq!(regex_replace_posix_groups(r"\"), r"\"); + assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar"); + + // Test that backslash followed by non-digit is preserved + assert_eq!(regex_replace_posix_groups(r"\n"), r"\n"); + assert_eq!(regex_replace_posix_groups(r"\t"), r"\t"); + } + macro_rules! static_pattern_regexp_replace { ($name:ident, $T:ty, $O:ty) => { #[test]