diff --git a/Cargo.toml b/Cargo.toml index 60be5b9d4..0f8b99f74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex" -version = "1.11.1" #:version +version = "1.12.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" readme = "README.md" @@ -176,14 +176,14 @@ default-features = false # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" -version = "0.4.8" +version = "0.5.0" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] # For parsing regular expressions. [dependencies.regex-syntax] path = "regex-syntax" -version = "0.8.5" +version = "0.9.0" default-features = false [dev-dependencies] diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 19d9dc229..2c4069899 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-automata" -version = "0.4.9" #:version +version = "0.5.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" @@ -86,7 +86,7 @@ internal-instrument-pikevm = ["logging", "std"] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } -regex-syntax = { path = "../regex-syntax", version = "0.8.5", optional = true, default-features = false } +regex-syntax = { path = "../regex-syntax", version = "0.9.0", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index fdae99fa6..43973a92a 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -5083,6 +5083,12 @@ impl BuildError { BuildError { kind: BuildErrorKind::Unsupported(msg) } } + pub(crate) fn unsupported_lookaround() -> BuildError { + let msg = "cannot build DFAs for regexes with look-around \ + sub-expressions; use a different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } + pub(crate) fn too_many_states() -> BuildError { BuildError { kind: BuildErrorKind::TooManyStates } } diff --git a/regex-automata/src/dfa/determinize.rs b/regex-automata/src/dfa/determinize.rs index 19f99f5d6..3b048081e 100644 --- a/regex-automata/src/dfa/determinize.rs +++ b/regex-automata/src/dfa/determinize.rs @@ -219,6 +219,10 @@ impl<'a> Runner<'a> { return Err(BuildError::unsupported_dfa_word_boundary_unicode()); } + if self.nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } + // A sequence of "representative" bytes drawn from each equivalence // class. These representative bytes are fed to the NFA to compute // state transitions. This allows us to avoid re-computing state diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 01e45309c..b75feac45 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -602,6 +602,9 @@ impl<'a> InternalBuilder<'a> { )); } assert_eq!(DEAD, self.add_empty_state()?); + if self.nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } // This is where the explicit slots start. We care about this because // we only need to track explicit slots. The implicit slots---two for @@ -638,6 +641,10 @@ impl<'a> InternalBuilder<'a> { self.stack_push(nfa_id, Epsilons::empty())?; while let Some((id, epsilons)) = self.stack.pop() { match *self.nfa.state(id) { + thompson::State::WriteLookAround { .. } + | thompson::State::CheckLookAround { .. } => { + return Err(BuildError::unsupported_lookaround()); + } thompson::State::ByteRange { ref trans } => { self.compile_transition(dfa_id, trans, epsilons)?; } @@ -2996,6 +3003,7 @@ enum BuildErrorKind { UnsupportedLook { look: Look }, ExceededSizeLimit { limit: usize }, NotOnePass { msg: &'static str }, + UnsupportedLookAround, } impl BuildError { @@ -3026,6 +3034,10 @@ impl BuildError { fn not_one_pass(msg: &'static str) -> BuildError { BuildError { kind: BuildErrorKind::NotOnePass { msg } } } + + fn unsupported_lookaround() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLookAround } + } } #[cfg(feature = "std")] @@ -3074,6 +3086,9 @@ impl core::fmt::Display for BuildError { pattern is not one-pass: {}", msg, ), + UnsupportedLookAround => { + write!(f, "one-pass DFA does not support look-arounds") + } } } } diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index bd9179b19..5c1978f8d 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -4056,6 +4056,9 @@ impl Builder { &self, nfa: thompson::NFA, ) -> Result { + if nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookaround()); + } let quitset = self.config.quit_set_from_nfa(&nfa)?; let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); // Check that we can fit at least a few states into our cache, diff --git a/regex-automata/src/hybrid/error.rs b/regex-automata/src/hybrid/error.rs index d134e7ec9..062b9ac62 100644 --- a/regex-automata/src/hybrid/error.rs +++ b/regex-automata/src/hybrid/error.rs @@ -61,6 +61,12 @@ impl BuildError { different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } + + pub(crate) fn unsupported_lookaround() -> BuildError { + let msg = "cannot build DFAs for regexes with look-around \ + sub-expressions; use a different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } } #[cfg(feature = "std")] diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 8cfdecbec..6bc4bdc71 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -611,7 +611,8 @@ impl Regex { &'r self, input: I, ) -> FindMatches<'r, 'h> { - let cache = self.pool.get(); + let mut cache = self.pool.get(); + cache.keep_lookaround_state(true); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, it } } @@ -652,7 +653,8 @@ impl Regex { &'r self, input: I, ) -> CapturesMatches<'r, 'h> { - let cache = self.pool.get(); + let mut cache = self.pool.get(); + cache.keep_lookaround_state(true); let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } @@ -2076,7 +2078,11 @@ impl<'r, 'h> Iterator for FindMatches<'r, 'h> { #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut cache, ref mut it } = *self; - it.advance(|input| Ok(re.search_with(cache, input))) + let result = it.advance(|input| Ok(re.search_with(cache, input))); + if result.is_none() { + cache.keep_lookaround_state(false); + } + result } #[inline] @@ -2149,6 +2155,7 @@ impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { if caps.is_match() { Some(caps.clone()) } else { + cache.keep_lookaround_state(false); None } } @@ -2385,6 +2392,19 @@ impl Cache { re.imp.strat.reset_cache(self) } + /// Set this cache to keep the state of look-behind assertions upon a + /// match being found. + /// + /// This must only be called with a value of `true` when a new search is + /// started at the end of a previously found match, otherwise the result + /// of any search after this call will most likely be wrong. + /// + /// Calling this function with a value of `false` will clear any previously + /// stored look-behind state. + pub fn keep_lookaround_state(&mut self, keep: bool) { + self.pikevm.keep_lookaround_state(keep); + } + /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To diff --git a/regex-automata/src/meta/reverse_inner.rs b/regex-automata/src/meta/reverse_inner.rs index 3d78779f6..14e260a1e 100644 --- a/regex-automata/src/meta/reverse_inner.rs +++ b/regex-automata/src/meta/reverse_inner.rs @@ -170,6 +170,7 @@ fn top_concat(mut hir: &Hir) -> Option> { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) + | HirKind::LookAround(_) | HirKind::Repetition(_) | HirKind::Alternation(_) => return None, HirKind::Capture(hir::Capture { ref sub, .. }) => sub, @@ -206,6 +207,9 @@ fn flatten(hir: &Hir) -> Hir { HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), + HirKind::LookAround(ref x) => { + Hir::lookaround(x.with(flatten(x.sub()))) + } HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), // This is the interesting case. We just drop the group information // entirely and use the child HIR itself. diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 04f2ba3c3..19823b555 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -258,6 +258,11 @@ impl Pre<()> { if !info.props()[0].look_set().is_empty() { return None; } + // For a similar reason, we require that it has zero look-around + // expressions. + if info.props()[0].contains_lookaround_expr() { + return None; + } // Finally, currently, our prefilters are all oriented around // leftmost-first match semantics, so don't try to use them if the // caller asked for anything else. @@ -490,49 +495,52 @@ impl Core { // we know we aren't going to use the lazy DFA. So we do a config check // up front, which is in practice the only way we won't try to use the // DFA. - let (nfarev, hybrid, dfa) = - if !info.config().get_hybrid() && !info.config().get_dfa() { - (None, wrappers::Hybrid::none(), wrappers::DFA::none()) + let (nfarev, hybrid, dfa) = if !info.config().get_hybrid() + && !info.config().get_dfa() + // With look-arounds, the lazy DFA and dense DFA would fail to build + || nfa.lookaround_count() > 0 + { + (None, wrappers::Hybrid::none(), wrappers::DFA::none()) + } else { + // FIXME: Technically, we don't quite yet KNOW that we need + // a reverse NFA. It's possible for the DFAs below to both + // fail to build just based on the forward NFA. In which case, + // building the reverse NFA was totally wasted work. But... + // fixing this requires breaking DFA construction apart into + // two pieces: one for the forward part and another for the + // reverse part. Quite annoying. Making it worse, when building + // both DFAs fails, it's quite likely that the NFA is large and + // that it will take quite some time to build the reverse NFA + // too. So... it's really probably worth it to do this! + let nfarev = thompson::Compiler::new() + // Currently, reverse NFAs don't support capturing groups, + // so we MUST disable them. But even if we didn't have to, + // we would, because nothing in this crate does anything + // useful with capturing groups in reverse. And of course, + // the lazy DFA ignores capturing groups in all cases. + .configure( + thompson_config + .clone() + .which_captures(WhichCaptures::None) + .reverse(true), + ) + .build_many_from_hir(hirs) + .map_err(BuildError::nfa)?; + let dfa = if !info.config().get_dfa() { + wrappers::DFA::none() } else { - // FIXME: Technically, we don't quite yet KNOW that we need - // a reverse NFA. It's possible for the DFAs below to both - // fail to build just based on the forward NFA. In which case, - // building the reverse NFA was totally wasted work. But... - // fixing this requires breaking DFA construction apart into - // two pieces: one for the forward part and another for the - // reverse part. Quite annoying. Making it worse, when building - // both DFAs fails, it's quite likely that the NFA is large and - // that it will take quite some time to build the reverse NFA - // too. So... it's really probably worth it to do this! - let nfarev = thompson::Compiler::new() - // Currently, reverse NFAs don't support capturing groups, - // so we MUST disable them. But even if we didn't have to, - // we would, because nothing in this crate does anything - // useful with capturing groups in reverse. And of course, - // the lazy DFA ignores capturing groups in all cases. - .configure( - thompson_config - .clone() - .which_captures(WhichCaptures::None) - .reverse(true), - ) - .build_many_from_hir(hirs) - .map_err(BuildError::nfa)?; - let dfa = if !info.config().get_dfa() { - wrappers::DFA::none() - } else { - wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) - }; - let hybrid = if !info.config().get_hybrid() { - wrappers::Hybrid::none() - } else if dfa.is_some() { - debug!("skipping lazy DFA because we have a full DFA"); - wrappers::Hybrid::none() - } else { - wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) - }; - (Some(nfarev), hybrid, dfa) + wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) }; + let hybrid = if !info.config().get_hybrid() { + wrappers::Hybrid::none() + } else if dfa.is_some() { + debug!("skipping lazy DFA because we have a full DFA"); + wrappers::Hybrid::none() + } else { + wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) + }; + (Some(nfarev), hybrid, dfa) + }; Ok(Core { info, pre, diff --git a/regex-automata/src/meta/wrappers.rs b/regex-automata/src/meta/wrappers.rs index 6cb19ba0d..83f5c12ab 100644 --- a/regex-automata/src/meta/wrappers.rs +++ b/regex-automata/src/meta/wrappers.rs @@ -133,6 +133,12 @@ impl PikeVMCache { PikeVMCache(Some(builder.get().0.create_cache())) } + pub(crate) fn keep_lookaround_state(&mut self, keep: bool) { + if let Some(cache) = self.0.as_mut() { + cache.keep_lookaround_state(keep); + } + } + pub(crate) fn reset(&mut self, builder: &PikeVM) { self.0.as_mut().unwrap().reset(&builder.get().0); } @@ -204,6 +210,8 @@ impl BoundedBacktrackerEngine { { if !info.config().get_backtrack() || info.config().get_match_kind() != MatchKind::LeftmostFirst + // TODO: remove once look-around support is added. + || nfa.lookaround_count() > 0 { return Ok(None); } diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index df99e456d..eb36d1829 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -301,6 +301,9 @@ impl Builder { nfa: NFA, ) -> Result { nfa.look_set_any().available().map_err(BuildError::word)?; + if nfa.lookaround_count() > 0 { + return Err(BuildError::unsupported_lookarounds()); + } Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } @@ -1453,7 +1456,7 @@ impl BoundedBacktracker { /// Execute a "step" in the backtracing algorithm. /// /// A "step" is somewhat of a misnomer, because this routine keeps going - /// until it either runs out of things to try or fins a match. In the + /// until it either runs out of things to try or finds a match. In the /// former case, it may have pushed some things on to the backtracking /// stack, in which case, those will be tried next as part of the /// 'backtrack' routine above. @@ -1519,6 +1522,12 @@ impl BoundedBacktracker { } sid = next; } + State::WriteLookAround { .. } + | State::CheckLookAround { .. } => { + unimplemented!( + "backtracking engine does not support look-arounds" + ); + } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return None, diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 6b69e8784..4f2f9af79 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -91,6 +91,17 @@ enum State { /// The next state that this state should transition to. next: StateID, }, + /// An empty state that behaves analogously to a `Match` state but for + /// the look-around sub-expression with the given look-around index. + WriteLookAround { lookaround_index: SmallIndex }, + /// A conditional epsilon transition that will only be taken if the + /// look-around sub-expression with the given index evaluates to `positive` + /// at the current position in the haystack. + CheckLookAround { + lookaround_index: SmallIndex, + positive: bool, + next: StateID, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. @@ -154,7 +165,9 @@ impl State { | State::CaptureStart { .. } | State::CaptureEnd { .. } | State::Fail - | State::Match { .. } => 0, + | State::Match { .. } + | State::CheckLookAround { .. } + | State::WriteLookAround { .. } => 0, State::Sparse { ref transitions } => { transitions.len() * mem::size_of::() } @@ -327,6 +340,8 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, + /// The starting states for each individual look-behind sub-expression. + start_look_behind: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -372,6 +387,7 @@ impl Builder { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); + self.start_look_behind.clear(); self.captures.clear(); self.memory_states = 0; } @@ -436,6 +452,7 @@ impl Builder { remap.resize(self.states.len(), StateID::ZERO); nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); + nfa.set_look_behind_starts(self.start_look_behind.as_slice()); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. The only real complexity here is the process of converting @@ -470,6 +487,21 @@ impl Builder { State::Look { look, next } => { remap[sid] = nfa.add(nfa::State::Look { look, next }); } + State::WriteLookAround { lookaround_index } => { + remap[sid] = nfa + .add(nfa::State::WriteLookAround { lookaround_index }); + } + State::CheckLookAround { + lookaround_index, + positive, + next, + } => { + remap[sid] = nfa.add(nfa::State::CheckLookAround { + lookaround_index, + positive, + next, + }); + } State::CaptureStart { pattern_id, group_index, next } => { // We can't remove this empty state because of the side // effect of capturing an offset for this capture slot. @@ -678,6 +710,12 @@ impl Builder { self.start_pattern.len() } + /// Adds the `start_id` to the set of starting states that is used when + /// running look-behind expressions. + pub fn start_look_behind(&mut self, start_id: StateID) { + self.start_look_behind.push(start_id); + } + /// Add an "empty" NFA state. /// /// An "empty" NFA state is a state with a single unconditional epsilon @@ -693,6 +731,30 @@ impl Builder { self.add(State::Empty { next: StateID::ZERO }) } + /// Add a state which will record that the look-around with the given index + /// is satisfied at the current position. + pub fn add_write_lookaround( + &mut self, + index: SmallIndex, + ) -> Result { + self.add(State::WriteLookAround { lookaround_index: index }) + } + + /// Add a state which will check whether the look-around with the given + /// index is satisfied at the current position. + pub fn add_check_lookaround( + &mut self, + index: SmallIndex, + positive: bool, + next: StateID, + ) -> Result { + self.add(State::CheckLookAround { + lookaround_index: index, + positive, + next, + }) + } + /// Add a "union" NFA state. /// /// A "union" NFA state that contains zero or more unconditional epsilon @@ -1159,6 +1221,9 @@ impl Builder { State::Look { ref mut next, .. } => { *next = to; } + State::CheckLookAround { ref mut next, .. } => { + *next = to; + } State::Union { ref mut alternates } => { alternates.push(to); self.memory_states += mem::size_of::(); @@ -1173,6 +1238,7 @@ impl Builder { State::CaptureEnd { ref mut next, .. } => { *next = to; } + State::WriteLookAround { .. } => {} State::Fail => {} State::Match { .. } => {} } diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index ced17719d..42dd32127 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -3,7 +3,7 @@ use core::{borrow::Borrow, cell::RefCell}; use alloc::{sync::Arc, vec, vec::Vec}; use regex_syntax::{ - hir::{self, Hir}, + hir::{self, Hir, LookAround}, utf8::{Utf8Range, Utf8Sequences}, ParserBuilder, }; @@ -19,7 +19,7 @@ use crate::{ }, util::{ look::{Look, LookMatcher}, - primitives::{PatternID, StateID}, + primitives::{PatternID, SmallIndex, StateID}, }, }; @@ -711,6 +711,8 @@ pub struct Compiler { /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, + /// The next index to use for a look-around expression. + lookaround_index: RefCell, } impl Compiler { @@ -723,6 +725,7 @@ impl Compiler { utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + lookaround_index: RefCell::new(SmallIndex::ZERO), } } @@ -945,6 +948,13 @@ impl Compiler { { return Err(BuildError::unsupported_captures()); } + if self.config.get_reverse() + && exprs.iter().any(|e| { + (e.borrow() as &Hir).properties().contains_lookaround_expr() + }) + { + return Err(BuildError::unsupported_lookarounds()); + } self.builder.borrow_mut().clear(); self.builder.borrow_mut().set_utf8(self.config.get_utf8()); @@ -1003,6 +1013,7 @@ impl Compiler { Class(Class::Bytes(ref c)) => self.c_byte_class(c), Class(Class::Unicode(ref c)) => self.c_unicode_class(c), Look(ref look) => self.c_look(look), + LookAround(ref lookaround) => self.c_lookaround(lookaround), Repetition(ref rep) => self.c_repetition(rep), Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), @@ -1010,6 +1021,32 @@ impl Compiler { } } + fn c_lookaround( + &self, + lookaround: &LookAround, + ) -> Result { + let idx = *self.lookaround_index.borrow(); + *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) + .map_err(|e| { + BuildError::too_many_lookarounds(e.attempted() as usize) + })?; + let pos = match lookaround { + LookAround::NegativeLookBehind(_) => false, + LookAround::PositiveLookBehind(_) => true, + }; + let check = self.add_check_lookaround(idx, pos)?; + + let unanchored = + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; + self.builder.borrow_mut().start_look_behind(unanchored.start); + + let sub = self.c(lookaround.sub())?; + let write = self.add_write_lookaround(idx)?; + self.patch(unanchored.end, sub.start)?; + self.patch(sub.end, write)?; + Ok(ThompsonRef { start: check, end: check }) + } + /// Compile a concatenation of the sub-expressions yielded by the given /// iterator. If the iterator yields no elements, then this compiles down /// to an "empty" state that always matches. @@ -1630,6 +1667,25 @@ impl Compiler { self.builder.borrow_mut().add_empty() } + fn add_write_lookaround( + &self, + index: SmallIndex, + ) -> Result { + self.builder.borrow_mut().add_write_lookaround(index) + } + + fn add_check_lookaround( + &self, + index: SmallIndex, + positive: bool, + ) -> Result { + self.builder.borrow_mut().add_check_lookaround( + index, + positive, + StateID::ZERO, + ) + } + fn add_range(&self, start: u8, end: u8) -> Result { self.builder.borrow_mut().add_range(Transition { start, @@ -1958,6 +2014,22 @@ mod tests { } } + fn s_write_lookaround(id: usize) -> State { + State::WriteLookAround { + lookaround_index: SmallIndex::new(id) + .expect("look-around index too large"), + } + } + + fn s_check_lookaround(id: usize, positive: bool, next: usize) -> State { + State::CheckLookAround { + lookaround_index: SmallIndex::new(id) + .expect("look-around index too large"), + positive, + next: sid(next), + } + } + fn s_fail() -> State { State::Fail } @@ -2059,6 +2131,28 @@ mod tests { ); } + #[test] + fn compile_yes_unanchored_prefix_with_start_anchor_in_lookaround() { + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) + .build(r"(?<=^)a") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_bin_union(2, 1), + s_range(0, 255, 0), + s_check_lookaround(0, true, 7), + s_bin_union(5, 4), + s_range(0, 255, 3), + s_look(Look::Start, 6), + s_write_lookaround(0), + s_byte(b'a', 8), + s_match(0) + ] + ); + } + #[test] fn compile_empty() { assert_eq!(build("").states(), &[s_match(0),]); @@ -2183,6 +2277,37 @@ mod tests { ); } + #[test] + fn compile_lookbehind() { + assert_eq!( + build(r"(?<=a)").states(), + &[ + s_check_lookaround(0, true, 5), + s_bin_union(3, 2), + s_range(b'\x00', b'\xFF', 1), + s_byte(b'a', 4), + s_write_lookaround(0), + s_match(0) + ] + ); + assert_eq!( + build(r"(?<=a(? BuildError { + let limit = SmallIndex::LIMIT; + BuildError { + kind: BuildErrorKind::TooManyLookArounds { given, limit }, + } + } + pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } } @@ -127,6 +149,10 @@ impl BuildError { pub(crate) fn unsupported_captures() -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedCaptures } } + + pub(crate) fn unsupported_lookarounds() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLookArounds } + } } #[cfg(feature = "std")] @@ -164,6 +190,12 @@ impl core::fmt::Display for BuildError { which exceeds the limit of {}", given, limit, ), + BuildErrorKind::TooManyLookArounds { given, limit } => write!( + f, + "attempted to compile {} look-around expressions, \ + which exceeds the limit of {}", + given, limit, + ), BuildErrorKind::ExceededSizeLimit { limit } => write!( f, "heap usage during NFA compilation exceeded limit of {}", @@ -180,6 +212,11 @@ impl core::fmt::Display for BuildError { "currently captures must be disabled when compiling \ a reverse NFA", ), + BuildErrorKind::UnsupportedLookArounds => write!( + f, + "currently look-around sub-expressions cannot be in the pattern \ + when compiling a reverse NFA or using the backtracking engine", + ), } } } diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 1f57f8ebd..1d63bd64a 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,6 +1100,18 @@ impl NFA { self.0.look_set_prefix_any } + /// Returns how many look-around sub-expressions this nfa contains. + #[inline] + pub fn lookaround_count(&self) -> usize { + self.0.lookaround_count + } + + /// Returns the starting states for initializing look-behind evaluation. + #[inline] + pub fn look_behind_starts(&self) -> &Vec { + &self.0.start_look_behind + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1260,6 +1272,12 @@ pub(super) struct Inner { /// zero-length prefix for any of the patterns in this NFA. look_set_prefix_all: LookSet, */ + /// How many look-around expression this NFA contains. + /// This is needed to initialize the table for storing the result of + /// look-around evaluation. + lookaround_count: usize, + /// Contains the start states for each of the look-behind subexpressions. + start_look_behind: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1288,7 +1306,11 @@ impl Inner { match self.states[sid] { State::ByteRange { .. } | State::Dense { .. } - | State::Fail => continue, + | State::Fail + | State::WriteLookAround { .. } => continue, + State::CheckLookAround { next, .. } => { + stack.push(next); + } State::Sparse(_) => { // This snippet below will rewrite this sparse state // as a dense state. By doing it here, we apply this @@ -1371,6 +1393,11 @@ impl Inner { State::Capture { .. } => { self.has_capture = true; } + State::CheckLookAround { lookaround_index, .. } + | State::WriteLookAround { lookaround_index } => { + self.lookaround_count = + self.lookaround_count.max(lookaround_index.as_usize() + 1); + } State::Union { .. } | State::BinaryUnion { .. } | State::Fail @@ -1400,6 +1427,13 @@ impl Inner { self.start_pattern = start_pattern.to_vec(); } + pub(super) fn set_look_behind_starts( + &mut self, + look_behind_starts: &[StateID], + ) { + self.start_look_behind = look_behind_starts.to_vec(); + } + /// Sets the UTF-8 mode of this NFA. pub(super) fn set_utf8(&mut self, yes: bool) { self.utf8 = yes; @@ -1453,6 +1487,9 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } + for id in self.start_look_behind.iter_mut() { + *id = old_to_new[*id]; + } } } @@ -1464,6 +1501,8 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' + } else if self.start_look_behind.contains(&sid) { + '<' } else { ' ' }; @@ -1545,6 +1584,26 @@ pub enum State { /// satisfied. next: StateID, }, + /// This is like a match state but for a look-around expression. + /// Executing this state will write the current haystack offset into the + /// look-around oracle at index `lookaround_index`. + WriteLookAround { + /// The index of the look-around expression that matches. + lookaround_index: SmallIndex, + }, + /// This indicates that we need to check whether look-around expression with + /// index `lookaround_index` holds at the current position in the haystack. + /// If `positive` is false, then the look-around expression is negative and + /// hence must NOT hold. + CheckLookAround { + /// The index of the look-around expression that must be satisfied. + lookaround_index: SmallIndex, + /// Whether this is a positive look-around expression. + positive: bool, + /// The next state to transition if the look-around assertion is + /// satisfied. + next: StateID, + }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. @@ -1658,11 +1717,13 @@ impl State { | State::Sparse { .. } | State::Dense { .. } | State::Fail - | State::Match { .. } => false, + | State::Match { .. } + | State::WriteLookAround { .. } => false, State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } - | State::Capture { .. } => true, + | State::Capture { .. } + | State::CheckLookAround { .. } => true, } } @@ -1674,7 +1735,9 @@ impl State { | State::BinaryUnion { .. } | State::Capture { .. } | State::Match { .. } - | State::Fail => 0, + | State::Fail + | State::WriteLookAround { .. } + | State::CheckLookAround { .. } => 0, State::Sparse(SparseTransitions { ref transitions }) => { transitions.len() * mem::size_of::() } @@ -1707,6 +1770,9 @@ impl State { } } State::Look { ref mut next, .. } => *next = remap[*next], + State::CheckLookAround { ref mut next, .. } => { + *next = remap[*next] + } State::Union { ref mut alternates } => { for alt in alternates.iter_mut() { *alt = remap[*alt]; @@ -1717,8 +1783,9 @@ impl State { *alt2 = remap[*alt2]; } State::Capture { ref mut next, .. } => *next = remap[*next], - State::Fail => {} - State::Match { .. } => {} + State::Fail + | State::Match { .. } + | State::WriteLookAround { .. } => {} } } } @@ -1748,6 +1815,18 @@ impl fmt::Debug for State { State::Look { ref look, next } => { write!(f, "{:?} => {:?}", look, next.as_usize()) } + State::WriteLookAround { lookaround_index } => { + write!(f, "write-look-around({})", lookaround_index.as_u32()) + } + State::CheckLookAround { lookaround_index, positive, next } => { + write!( + f, + "check-look-around({} is {}) => {}", + lookaround_index.as_u32(), + if positive { "matched" } else { "not matched" }, + next.as_usize() + ) + } State::Union { ref alternates } => { let alts = alternates .iter() diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 0128c151a..b18101c53 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -891,6 +891,7 @@ impl PikeVM { cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h> { + cache.keep_lookaround_state(true); let caps = Captures::matches(self.get_nfa().group_info().clone()); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, caps, it } @@ -934,6 +935,7 @@ impl PikeVM { cache: &'c mut Cache, input: I, ) -> CapturesMatches<'r, 'c, 'h> { + cache.keep_lookaround_state(true); let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } @@ -1216,6 +1218,10 @@ impl PikeVM { } impl PikeVM { + fn lookaround_count(&self) -> usize { + self.nfa.lookaround_count() + } + /// The implementation of standard leftmost search. /// /// Capturing group spans are written to `slots`, but only if requested. @@ -1254,7 +1260,51 @@ impl PikeVM { let pre = if anchored { None } else { self.get_config().get_prefilter() }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, + ref mut match_lookaround, + ref keep_lookaround_state, + } = cache; + + if let Some(active) = match_lookaround { + *curr_lookaround = active.clone(); + } else if self.lookaround_count() > 0 { + // This initializes the look-behind threads from the start of the input + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. We need to add the start states + // in reverse because more deeply nested look-behinds have a higher index + // but must be executed first, so that the result is available for the + // outer expression. + for look_behind_start in self.nfa.look_behind_starts().iter().rev() + { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + // This is necessary for look-behinds to be able to match outside of the + // input span. + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); + } + let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like @@ -1294,7 +1344,21 @@ impl PikeVM { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, - Some(ref span) => at = span.start, + Some(ref span) => { + if self.lookaround_count() > 0 { + // We are jumping ahead due to the pre-filter, thus we must bring + // the look-behind threads to the new position. + self.fast_forward_lookbehinds( + Span { start: at, end: span.start }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); + } + at = span.start + } } } } @@ -1361,11 +1425,28 @@ impl PikeVM { // transitions, and thus must be able to write offsets to the // slots given which are later copied to slot values in 'curr'. let slots = next.slot_table.all_absent(); - self.epsilon_closure(stack, slots, curr, input, at, start_id); + self.epsilon_closure( + stack, slots, curr, lookaround, input, at, start_id, + ); } - if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + // The look-behind states must be processed first, since their + // result must be available for the processing of the main states. + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); + if let Some(pid) = + self.nexts(stack, curr, next, lookaround, input, at, slots) { hm = Some(HalfMatch::new(pid, at)); + if *keep_lookaround_state { + *match_lookaround = Some(curr_lookaround.clone()); + } } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will @@ -1375,13 +1456,45 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); at += 1; } instrument!(|c| c.eprint(&self.nfa)); hm } + /// This brings the look-behind threads into the state they must be for + /// starting at [forward_span.end]. The assumption is that they are currently + /// at [forward_span.start]. + fn fast_forward_lookbehinds( + &self, + forward_span: Span, + input: &Input<'_>, + stack: &mut Vec, + curr_lookaround: &mut ActiveStates, + next_lookaround: &mut ActiveStates, + lookaround: &mut Vec>, + ) { + for lb_at in forward_span.start..forward_span.end { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + lb_at, + // Since capture groups are not allowed inside look-arounds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } + /// The implementation for the 'which_overlapping_matches' API. Basically, /// we do a single scan through the entire haystack (unless our regex /// or search is anchored) and record every pattern that matched. In @@ -1425,7 +1538,39 @@ impl PikeVM { Some(config) => config, }; - let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let Cache { + ref mut stack, + ref mut curr, + ref mut next, + ref mut lookaround, + ref mut curr_lookaround, + ref mut next_lookaround, + // It makes no sense to keep any look-behind state for this version of + // the search, since the caller receives no information about + // where the search ended. + keep_lookaround_state: _, + match_lookaround: _, + } = cache; + + for look_behind_start in self.nfa.look_behind_starts().iter().rev() { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + 0, + *look_behind_start, + ); + } + self.fast_forward_lookbehinds( + Span { start: 0, end: input.start() }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { @@ -1438,9 +1583,22 @@ impl PikeVM { } if !any_matches || allmatches { let slots = &mut []; - self.epsilon_closure(stack, slots, curr, input, at, start_id); + self.epsilon_closure( + stack, slots, curr, lookaround, input, at, start_id, + ); } - self.nexts_overlapping(stack, curr, next, input, at, patset); + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + at, + &mut [], + ); + self.nexts_overlapping( + stack, curr, next, lookaround, input, at, patset, + ); // If we found a match and filled our set, then there is no more // additional info that we can provide. Thus, we can quit. We also // quit if the caller asked us to stop at the earliest point that @@ -1449,7 +1607,9 @@ impl PikeVM { break; } core::mem::swap(curr, next); + core::mem::swap(curr_lookaround, next_lookaround); next.set.clear(); + next_lookaround.set.clear(); } instrument!(|c| c.eprint(&self.nfa)); } @@ -1469,6 +1629,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, slots: &mut [Option], @@ -1477,7 +1638,15 @@ impl PikeVM { let mut pid = None; let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { - pid = match self.next(stack, slot_table, next, input, at, sid) { + pid = match self.next( + stack, + slot_table, + next, + lookarounds, + input, + at, + sid, + ) { None => continue, Some(pid) => Some(pid), }; @@ -1497,6 +1666,7 @@ impl PikeVM { stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, patset: &mut PatternSet, @@ -1505,8 +1675,15 @@ impl PikeVM { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { - let pid = match self.next(stack, slot_table, next, input, at, sid) - { + let pid = match self.next( + stack, + slot_table, + next, + lookarounds, + input, + at, + sid, + ) { None => continue, Some(pid) => pid, }; @@ -1543,6 +1720,7 @@ impl PikeVM { stack: &mut Vec, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, sid: StateID, @@ -1553,7 +1731,9 @@ impl PikeVM { | State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } - | State::Capture { .. } => None, + | State::Capture { .. } + | State::WriteLookAround { .. } + | State::CheckLookAround { .. } => None, State::ByteRange { ref trans } => { if trans.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); @@ -1561,7 +1741,13 @@ impl PikeVM { // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, trans.next, + stack, + slots, + next, + lookarounds, + input, + at, + trans.next, ); } None @@ -1573,7 +1759,13 @@ impl PikeVM { // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, next_sid, + stack, + slots, + next, + lookarounds, + input, + at, + next_sid, ); } None @@ -1585,7 +1777,13 @@ impl PikeVM { // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( - stack, slots, next, input, at, next_sid, + stack, + slots, + next, + lookarounds, + input, + at, + next_sid, ); } None @@ -1613,6 +1811,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, sid: StateID, @@ -1629,7 +1828,13 @@ impl PikeVM { } FollowEpsilon::Explore(sid) => { self.epsilon_closure_explore( - stack, curr_slots, next, input, at, sid, + stack, + curr_slots, + next, + lookarounds, + input, + at, + sid, ); } } @@ -1666,6 +1871,7 @@ impl PikeVM { stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, + lookarounds: &mut Vec>, input: &Input<'_>, at: usize, mut sid: StateID, @@ -1705,6 +1911,25 @@ impl PikeVM { } sid = next; } + State::WriteLookAround { lookaround_index } => { + // This is ok since `at` is always less than `usize::MAX`. + lookarounds[lookaround_index] = NonMaxUsize::new(at); + return; + } + State::CheckLookAround { + lookaround_index, + positive, + next, + } => { + let state = match lookarounds[lookaround_index] { + None => usize::MAX, + Some(pos) => pos.get(), + }; + if (state == at) != positive { + return; + } + sid = next; + } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return, @@ -1813,10 +2038,14 @@ impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { *self; // 'advance' converts errors into panics, which is OK here because // the PikeVM can never return an error. - it.advance(|input| { + let result = it.advance(|input| { re.search(cache, input, caps); Ok(caps.get_match()) - }) + }); + if result.is_none() { + cache.keep_lookaround_state(false); + } + result } } @@ -1858,6 +2087,7 @@ impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { if caps.is_match() { Some(caps.clone()) } else { + cache.keep_lookaround_state(false); None } } @@ -1886,6 +2116,22 @@ pub struct Cache { /// The next set of states we're building that will be explored for the /// next byte in the haystack. next: ActiveStates, + /// This answers the question: "What is the maximum position in the + /// haystack at which look-around indexed x holds and which is <= to the + /// current position". + lookaround: Vec>, + /// The current active states for look-behind subexpressions. + curr_lookaround: ActiveStates, + /// The next set of states to be explored for look-behind subexpressions. + next_lookaround: ActiveStates, + /// The set of active threads, belonging to look-behind expressions, + /// when a match was found. This is needed to resume a search after a match + /// was found (to look for further matches), without having to re-scan the + /// beginning of the haystack. + match_lookaround: Option, + /// When true, use the states of `match_lookaround` to initialize a search, + /// otherwise recompute from the beginning of the haystack. + keep_lookaround_state: bool, } impl Cache { @@ -1902,6 +2148,11 @@ impl Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), + lookaround: vec![None; re.lookaround_count()], + curr_lookaround: ActiveStates::new(re), + next_lookaround: ActiveStates::new(re), + match_lookaround: None, + keep_lookaround_state: false, } } @@ -1945,6 +2196,28 @@ impl Cache { pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); + self.curr_lookaround.reset(re); + self.next_lookaround.reset(re); + self.lookaround = vec![None; re.lookaround_count()]; + self.match_lookaround = None; + self.keep_lookaround_state = false; + } + + /// Set this cache to store a copy of the active threads belonging + /// to look-behind assertions upon a match being found. + /// + /// This is a performance optimization and must only be called with a + /// value of `true` when intending to start a new search at the end of + /// a previously found match. Otherwise, the result of look-behind + /// sub-expressions will be out of sync with the main regex. + /// + /// Calling this function with a value of `false` will clear any previously + /// stored look-behind state. + pub fn keep_lookaround_state(&mut self, keep: bool) { + self.keep_lookaround_state = keep; + if !keep { + self.match_lookaround = None; + } } /// Returns the heap memory usage, in bytes, of this cache. @@ -1953,9 +2226,16 @@ impl Cache { /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { use core::mem::size_of; + let match_lookaround_memory = match &self.match_lookaround { + Some(ml) => ml.memory_usage(), + None => 0, + }; (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() + + self.curr_lookaround.memory_usage() + + self.next_lookaround.memory_usage() + + match_lookaround_memory } /// Clears this cache. This should be called at the start of every search @@ -1972,6 +2252,10 @@ impl Cache { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); + // capture groups are not allowed inside look-arounds, so we + // set the slot-length to zero. + self.curr_lookaround.setup_search(0); + self.next_lookaround.setup_search(0); } } diff --git a/regex-automata/src/util/determinize/mod.rs b/regex-automata/src/util/determinize/mod.rs index ba32991d0..bdcb4e025 100644 --- a/regex-automata/src/util/determinize/mod.rs +++ b/regex-automata/src/util/determinize/mod.rs @@ -251,6 +251,10 @@ pub(crate) fn next( | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. } => {} + thompson::State::CheckLookAround { .. } + | thompson::State::WriteLookAround { .. } => { + unimplemented!("look-around support in DFA") + } thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match // state if the OLD state we are transitioning from @@ -399,6 +403,10 @@ pub(crate) fn epsilon_closure( | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, + thompson::State::WriteLookAround { .. } + | thompson::State::CheckLookAround { .. } => { + unimplemented!("look-around support in DFA") + } thompson::State::Look { look, next } => { if !look_have.contains(look) { break; @@ -465,6 +473,10 @@ pub(crate) fn add_nfa_states( builder.add_nfa_state_id(nfa_id); builder.set_look_need(|need| need.insert(look)); } + thompson::State::CheckLookAround { .. } + | thompson::State::WriteLookAround { .. } => { + unimplemented!("look-around support in DFA") + } thompson::State::Union { .. } | thompson::State::BinaryUnion { .. } => { // Pure epsilon transitions don't need to be tracked as part diff --git a/regex-automata/tests/dfa/onepass/suite.rs b/regex-automata/tests/dfa/onepass/suite.rs index 20bd6965c..4c7682f7f 100644 --- a/regex-automata/tests/dfa/onepass/suite.rs +++ b/regex-automata/tests/dfa/onepass/suite.rs @@ -79,7 +79,10 @@ fn compiler( // Since our error types are all generally opaque, we just // look for an error string. Not great, but not the end of the // world. - if test.compiles() && msg.contains("not one-pass") { + if test.compiles() + && (msg.contains("not one-pass") + || msg.contains("look-around")) + { return Ok(CompiledRegex::skip()); } return Err(err.into()); diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index 8ed6dd007..aa43cc7e6 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -289,10 +289,16 @@ fn compiler( } } } + // Or look-around expressions. + for hir in hirs.iter() { + if hir.properties().contains_lookaround_expr() { + return Ok(CompiledRegex::skip()); + } + } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - create_matcher(&builder, pre, builder.build_many(®exes)?) + create_matcher(&builder, pre, builder.build_many(regexes)?) } } diff --git a/regex-automata/tests/hybrid/suite.rs b/regex-automata/tests/hybrid/suite.rs index 4aaca6698..65769f001 100644 --- a/regex-automata/tests/hybrid/suite.rs +++ b/regex-automata/tests/hybrid/suite.rs @@ -180,6 +180,12 @@ fn compiler( } } } + // Or look-around expressions. + for hir in hirs.iter() { + if hir.properties().contains_lookaround_expr() { + return Ok(CompiledRegex::skip()); + } + } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } diff --git a/regex-automata/tests/lib.rs b/regex-automata/tests/lib.rs index 67c979aa8..1ba08fe87 100644 --- a/regex-automata/tests/lib.rs +++ b/regex-automata/tests/lib.rs @@ -65,6 +65,7 @@ fn suite() -> anyhow::Result { load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); + load!("lookaround"); Ok(tests) } diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index bce0eef40..b0aa0fc6c 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -74,6 +74,10 @@ fn min_visited_capacity() -> Result<()> { .configure(config_thompson(test)) .syntax(config_syntax(test)) .build_many(®exes)?; + // The backtracker doesn't support lookarounds, so skip if there are any. + if nfa.lookaround_count() > 0 { + return Ok(CompiledRegex::skip()); + } let mut builder = BoundedBacktracker::builder(); if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); @@ -104,7 +108,17 @@ fn compiler( if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } - let re = builder.build_many(®exes)?; + let re = match builder.build_many(®exes) { + Ok(re) => re, + // Due to errors being opaque, we need to check the error message to skip tests with look-arounds + Err(err) => { + if test.compiles() && err.to_string().contains("look-around") { + return Ok(CompiledRegex::skip()); + } + + return Err(err.into()); + } + }; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) diff --git a/regex-cli/Cargo.toml b/regex-cli/Cargo.toml index d7fd44b7b..4284091ea 100644 --- a/regex-cli/Cargo.toml +++ b/regex-cli/Cargo.toml @@ -29,8 +29,8 @@ lexopt = "0.3.0" log = { version = "0.4.17", features = ["std"] } memmap2 = "0.9.4" regex = { version = "1.9.0", path = ".." } -regex-automata = { version = "0.4.8", path = "../regex-automata", features = ["logging"] } +regex-automata = { version = "0.5.0", path = "../regex-automata", features = ["logging"] } regex-lite = { version = "0.1.0", path = "../regex-lite" } -regex-syntax = { version = "0.8.5", path = "../regex-syntax" } +regex-syntax = { version = "0.9.0", path = "../regex-syntax" } tabwriter = { version = "1.2.1", features = ["ansi_formatting"] } textwrap = { version = "0.16.0", default-features = false } diff --git a/regex-cli/cmd/generate/fowler.rs b/regex-cli/cmd/generate/fowler.rs index 404c47721..052d59ef8 100644 --- a/regex-cli/cmd/generate/fowler.rs +++ b/regex-cli/cmd/generate/fowler.rs @@ -412,6 +412,9 @@ fn count_capturing_groups_ast(ast: ®ex_syntax::ast::Ast) -> usize { let this = if group.is_capturing() { 1 } else { 0 }; this + count_capturing_groups_ast(&*group.ast) } + Ast::LookAround(ref lookaround) => { + count_capturing_groups_ast(&lookaround.ast) + } Ast::Alternation(ref alt) => { alt.asts.iter().map(count_capturing_groups_ast).sum() } diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 0cbcde5e7..f6a443546 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "regex-syntax" -version = "0.8.5" #:version +version = "0.9.0" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index ce79a89ab..25f3b9280 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -144,6 +144,10 @@ pub enum ErrorKind { /// /// The span of this error corresponds to the unclosed parenthesis. GroupUnclosed, + /// An unclosed look-around, e.g., `(? write!(f, "invalid capture group character"), GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), GroupUnclosed => write!(f, "unclosed group"), + LookAroundUnclosed => write!(f, "unclosed look-around"), GroupUnopened => write!(f, "unopened group"), NestLimitExceeded(limit) => write!( f, @@ -301,11 +309,10 @@ impl core::fmt::Display for ErrorKind { UnsupportedBackreference => { write!(f, "backreferences are not supported") } - UnsupportedLookAround => write!( - f, - "look-around, including look-ahead and look-behind, \ - is not supported" - ), + UnsupportedLookAhead => write!(f, "look-aheads are not supported"), + UnsupportedCaptureInLookBehind => { + write!(f, "capture groups are not supported in look-behinds") + } } } } @@ -477,6 +484,8 @@ pub enum Ast { Dot(Box), /// A single zero-width assertion. Assertion(Box), + /// A single look-around regular expression. + LookAround(Box), /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. ClassUnicode(Box), /// A single perl character class, e.g., `\d` or `\W`. @@ -521,6 +530,11 @@ impl Ast { Ast::Assertion(Box::new(e)) } + /// Create a "look-around" AST item. + pub fn lookaround(e: LookAround) -> Ast { + Ast::LookAround(Box::new(e)) + } + /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { Ast::ClassUnicode(Box::new(e)) @@ -564,6 +578,7 @@ impl Ast { Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, + Ast::LookAround(ref x) => &x.span, Ast::ClassUnicode(ref x) => &x.span, Ast::ClassPerl(ref x) => &x.span, Ast::ClassBracketed(ref x) => &x.span, @@ -596,6 +611,7 @@ impl Ast { Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) + | Ast::LookAround(_) | Ast::Alternation(_) | Ast::Concat(_) => true, } @@ -1342,6 +1358,28 @@ pub enum AssertionKind { WordBoundaryEndHalf, } +/// A single zero-width look-around. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct LookAround { + /// The span of this look-around. + pub span: Span, + /// The look-around kind, e.g. negative/positive look-behind. + pub kind: LookAroundKind, + /// The regular expression inside the look-around. + pub ast: Box, +} + +/// A look-around kind. +#[derive(Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum LookAroundKind { + /// `(?<=...)` + PositiveLookBehind, + /// `(? return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::LookAround(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} @@ -1673,6 +1712,9 @@ impl Drop for Ast { Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } + Ast::LookAround(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 0c2a35265..5883a0dd4 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -159,6 +159,7 @@ impl ParserBuilder { stack_class: RefCell::new(vec![]), capture_names: RefCell::new(vec![]), scratch: RefCell::new(String::new()), + lookaround_depth: Cell::new(0), } } @@ -280,6 +281,9 @@ pub struct Parser { /// A scratch buffer used in various places. Mostly this is used to /// accumulate relevant characters from parts of a pattern. scratch: RefCell, + /// Whether the parser is currently in a look-around. This is used to + /// detect capture groups within look-arounds, which are not supported. + lookaround_depth: Cell, } /// ParserI is the internal parser implementation. @@ -299,9 +303,9 @@ struct ParserI<'s, P> { pattern: &'s str, } -/// GroupState represents a single stack frame while parsing nested groups -/// and alternations. Each frame records the state up to an opening parenthesis -/// or a alternating bracket `|`. +/// GroupState represents a single stack frame while parsing nested groups, +/// look-arounds and alternations. Each frame records the state up to an opening +/// parenthesis or a alternating bracket `|`. #[derive(Clone, Debug)] enum GroupState { /// This state is pushed whenever an opening group is found. @@ -313,6 +317,13 @@ enum GroupState { /// Whether this group has the `x` flag enabled or not. ignore_whitespace: bool, }, + /// This state is pushed whenever an opening look-around is found. + LookAround { + /// The concatenation immediately preceding the opening look-around. + concat: ast::Concat, + /// The look-around that has been opened. Its sub-AST is always empty. + lookaround: ast::LookAround, + }, /// This state is pushed whenever a new alternation branch is found. If /// an alternation branch is found and this state is at the top of the /// stack, then this state should be modified to include the new @@ -385,6 +396,7 @@ impl Parser { self.comments.borrow_mut().clear(); self.stack_group.borrow_mut().clear(); self.stack_class.borrow_mut().clear(); + self.lookaround_depth.set(0); } } @@ -470,6 +482,11 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.get() } + /// Return whether the parser is currently in a look-around. + fn in_lookaround(&self) -> bool { + self.parser().lookaround_depth.get() != 0 + } + /// Return the character at the current position of the parser. /// /// This panics if the current position does not point to a valid char. @@ -521,18 +538,15 @@ impl<'s, P: Borrow> ParserI<'s, P> { } } - /// Returns true if and only if the parser is positioned at a look-around + /// Returns true if and only if the parser is positioned at a look-ahead /// prefix. The conditions under which this returns true must always /// correspond to a regular expression that would otherwise be consider /// invalid. /// /// This should only be called immediately after parsing the opening of /// a group or a set of flags. - fn is_lookaround_prefix(&self) -> bool { - self.bump_if("?=") - || self.bump_if("?!") - || self.bump_if("?<=") - || self.bump_if("? bool { + self.bump_if("?=") || self.bump_if("?!") } /// Bump the parser, and if the `x` flag is enabled, bump through any @@ -686,9 +700,9 @@ impl<'s, P: Borrow> ParserI<'s, P> { })); } - /// Parse and push a group AST (and its parent concatenation) on to the - /// parser's internal stack. Return a fresh concatenation corresponding - /// to the group's sub-AST. + /// Parse and push a group or look-around AST (and its parent + /// concatenation) on to the parser's internal stack. Return a fresh + /// concatenation corresponding to the grouping's sub-AST. /// /// If a set of flags was found (with no group), then the concatenation /// is returned with that set of flags added. @@ -697,12 +711,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// parenthesis. It advances the parser to the character at the start /// of the sub-expression (or adjoining expression). /// - /// If there was a problem parsing the start of the group, then an error - /// is returned. + /// If there was a problem parsing the start of the grouping, then an + /// error is returned. #[inline(never)] - fn push_group(&self, mut concat: ast::Concat) -> Result { + fn push_grouping(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '('); - match self.parse_group()? { + match self.parse_grouping()? { Either::Left(set) => { let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); if let Some(v) = ignore { @@ -712,7 +726,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { concat.asts.push(Ast::flags(set)); Ok(concat) } - Either::Right(group) => { + Either::Right(Either::Left(group)) => { let old_ignore_whitespace = self.ignore_whitespace(); let new_ignore_whitespace = group .flags() @@ -728,61 +742,124 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.parser().ignore_whitespace.set(new_ignore_whitespace); Ok(ast::Concat { span: self.span(), asts: vec![] }) } + Either::Right(Either::Right(lookaround)) => { + self.parser() + .stack_group + .borrow_mut() + .push(GroupState::LookAround { concat, lookaround }); + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() + 1); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } } } - /// Pop a group AST from the parser's internal stack and set the group's - /// AST to the given concatenation. Return the concatenation containing - /// the group. + /// Pop a group or look-around AST from the parser's internal stack and + /// set the grouping's AST to the given concatenation. Return the + /// concatenation containing the grouping. /// /// This assumes that the parser is currently positioned on the closing /// parenthesis and advances the parser to the character following the `)`. /// - /// If no such group could be popped, then an unopened group error is + /// If no such grouping could be popped, then an unopened group error is /// returned. + /// + /// If a look-behind contains a capture group, then an error is returned. #[inline(never)] - fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + fn pop_grouping( + &self, + mut grouping_concat: ast::Concat, + ) -> Result { use self::GroupState::*; assert_eq!(self.char(), ')'); let mut stack = self.parser().stack_group.borrow_mut(); - let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack - .pop() - { - Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, None) - } - Some(Alternation(alt)) => match stack.pop() { + let (mut prior_concat, mut grouping, ignore_whitespace, alt) = + match stack.pop() { Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, Some(alt)) + (concat, Either::Left(group), ignore_whitespace, None) } - None | Some(Alternation(_)) => { + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.ignore_whitespace(), + None, + ), + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => ( + concat, + Either::Left(group), + ignore_whitespace, + Some(alt), + ), + Some(LookAround { concat, lookaround }) => ( + concat, + Either::Right(lookaround), + self.ignore_whitespace(), + Some(alt), + ), + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { return Err(self.error( self.span_char(), ast::ErrorKind::GroupUnopened, )); } - }, - None => { - return Err(self - .error(self.span_char(), ast::ErrorKind::GroupUnopened)); - } - }; + }; self.parser().ignore_whitespace.set(ignore_whitespace); - group_concat.span.end = self.pos(); + grouping_concat.span.end = self.pos(); self.bump(); - group.span.end = self.pos(); + match &mut grouping { + Either::Left(group) => group.span.end = self.pos(), + Either::Right(lookaround) => lookaround.span.end = self.pos(), + } match alt { Some(mut alt) => { - alt.span.end = group_concat.span.end; - alt.asts.push(group_concat.into_ast()); - group.ast = Box::new(alt.into_ast()); - } - None => { - group.ast = Box::new(group_concat.into_ast()); + alt.span.end = grouping_concat.span.end; + alt.asts.push(grouping_concat.into_ast()); + match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(alt.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(alt.into_ast()) + } + } } + None => match &mut grouping { + Either::Left(group) => { + group.ast = Box::new(grouping_concat.into_ast()) + } + Either::Right(lookaround) => { + lookaround.ast = Box::new(grouping_concat.into_ast()) + } + }, } - prior_concat.asts.push(Ast::group(group)); + prior_concat.asts.push(match grouping { + Either::Left(group) => { + if group.is_capturing() && self.in_lookaround() { + return Err(self.error( + group.span, + ast::ErrorKind::UnsupportedCaptureInLookBehind, + )); + } + + Ast::group(group) + } + Either::Right(lookaround) => { + self.parser() + .lookaround_depth + .set(self.parser().lookaround_depth.get() - 1); + Ast::lookaround(lookaround) + } + }); Ok(prior_concat) } @@ -793,7 +870,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// This assumes that the parser has advanced to the end. #[inline(never)] - fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + fn pop_grouping_end(&self, mut concat: ast::Concat) -> Result { concat.span.end = self.pos(); let mut stack = self.parser().stack_group.borrow_mut(); let ast = match stack.pop() { @@ -808,6 +885,12 @@ impl<'s, P: Borrow> ParserI<'s, P> { self.error(group.span, ast::ErrorKind::GroupUnclosed) ); } + Some(GroupState::LookAround { lookaround, .. }) => { + return Err(self.error( + lookaround.span, + ast::ErrorKind::LookAroundUnclosed, + )); + } }; // If we try to pop again, there should be nothing. match stack.pop() { @@ -824,6 +907,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { Some(GroupState::Group { group, .. }) => { Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) } + Some(GroupState::LookAround { lookaround, .. }) => Err(self + .error(lookaround.span, ast::ErrorKind::LookAroundUnclosed)), } } @@ -989,8 +1074,8 @@ impl<'s, P: Borrow> ParserI<'s, P> { break; } match self.char() { - '(' => concat = self.push_group(concat)?, - ')' => concat = self.pop_group(concat)?, + '(' => concat = self.push_grouping(concat)?, + ')' => concat = self.pop_grouping(concat)?, '|' => concat = self.push_alternate(concat)?, '[' => { let class = self.parse_set_class()?; @@ -1020,7 +1105,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { _ => concat.asts.push(self.parse_primitive()?.into_ast()), } } - let ast = self.pop_group_end(concat)?; + let ast = self.pop_grouping_end(concat)?; NestLimiter::new(self).check(&ast)?; Ok(ast::WithComments { ast, @@ -1205,16 +1290,17 @@ impl<'s, P: Borrow> ParserI<'s, P> { Ok(concat) } - /// Parse a group (which contains a sub-expression) or a set of flags. + /// Parse a group or look-around (which contain a sub-expression), or a + /// set of flags. /// - /// If a group was found, then it is returned with an empty AST. If a set - /// of flags is found, then that set is returned. + /// If a group or look-around was found, then it is returned with an + /// empty AST. If a set of flags is found, then that set is returned. /// /// The parser should be positioned at the opening parenthesis. /// /// This advances the parser to the character before the start of the - /// sub-expression (in the case of a group) or to the closing parenthesis - /// immediately following the set of flags. + /// sub-expression (in the case of a group or look-around) or to the + /// closing parenthesis immediately following the set of flags. /// /// # Errors /// @@ -1223,19 +1309,38 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// /// If a capture name is given and it is incorrectly specified, then a /// corresponding error is returned. + /// + /// If a look-ahead is given (which is currently unsupported), then an + /// error is returned. #[inline(never)] - fn parse_group(&self) -> Result> { + fn parse_grouping( + &self, + ) -> Result>> + { assert_eq!(self.char(), '('); let open_span = self.span_char(); self.bump(); self.bump_space(); - if self.is_lookaround_prefix() { + if self.is_lookahead_prefix() { return Err(self.error( Span::new(open_span.start, self.span().end), - ast::ErrorKind::UnsupportedLookAround, + ast::ErrorKind::UnsupportedLookAhead, )); } let inner_span = self.span(); + + let mut lookaround_kind = ast::LookAroundKind::PositiveLookBehind; + if self.bump_if("?<=") || { + lookaround_kind = ast::LookAroundKind::NegativeLookBehind; + self.bump_if("?> ParserI<'s, P> { } { let capture_index = self.next_capture_index(open_span)?; let name = self.parse_capture_name(capture_index)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::empty(self.span())), - })) + }))) } else if self.bump_if("?") { if self.is_eof() { return Err( @@ -1272,19 +1377,19 @@ impl<'s, P: Borrow> ParserI<'s, P> { })) } else { assert_eq!(char_end, ':'); - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::NonCapturing(flags), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } else { let capture_index = self.next_capture_index(open_span)?; - Ok(Either::Right(ast::Group { + Ok(Either::Right(Either::Left(ast::Group { span: open_span, kind: ast::GroupKind::CaptureIndex(capture_index), ast: Box::new(Ast::empty(self.span())), - })) + }))) } } @@ -2328,6 +2433,7 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, + Ast::LookAround(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, Ast::Concat(ref x) => &x.span, }; @@ -2349,6 +2455,7 @@ impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) + | Ast::LookAround(_) | Ast::Alternation(_) | Ast::Concat(_) => { self.decrement_depth(); @@ -3736,33 +3843,204 @@ bar } #[test] - fn parse_unsupported_lookaround() { + fn parse_unsupported_lookahead() { assert_eq!( parser(r"(?=a)").parse().unwrap_err(), TestError { span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, + kind: ast::ErrorKind::UnsupportedLookAhead, } ); assert_eq!( parser(r"(?!a)").parse().unwrap_err(), TestError { span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, + kind: ast::ErrorKind::UnsupportedLookAhead, + } + ); + } + + #[test] + fn parse_lookbehinds() { + assert_eq!( + parser(r"(?<=)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..5), + ast: Box::new(Ast::empty(span(4..4))), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=(?<=))(a)").parse(), + Ok(concat( + 0..13, + vec![ + Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::lookaround(ast::LookAround { + span: span(4..9), + ast: Box::new(Ast::empty(span(8..8))), + kind: ast::LookAroundKind::PositiveLookBehind + })), + kind: ast::LookAroundKind::PositiveLookBehind + }), + group(10..13, 1, lit('a', 11)), + ] + )) + ); + assert_eq!( + parser(r"(?<=a)").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..6), + ast: Box::new(lit('a', 4)), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?<=(?:a))").parse(), + Ok(Ast::lookaround(ast::LookAround { + span: span(0..10), + ast: Box::new(Ast::group(ast::Group { + span: span(4..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span(6..6), + items: vec![], + }), + ast: Box::new(lit('a', 7)), + })), + kind: ast::LookAroundKind::PositiveLookBehind + })) + ); + assert_eq!( + parser(r"(?a))").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, + } + ); + assert_eq!( + parser(r"(?a)|b)").parse().unwrap_err(), + TestError { + span: span(6..16), + kind: ast::ErrorKind::UnsupportedCaptureInLookBehind, } ); } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 1ceb3c7fa..0e87599d2 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -80,6 +80,7 @@ impl Visitor for Writer { fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::LookAround(ref x) => self.fmt_lookaround_pre(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } @@ -92,6 +93,7 @@ impl Visitor for Writer { Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::LookAround(ref x) => self.fmt_lookaround_post(x), Ast::ClassPerl(ref x) => self.fmt_class_perl(x), Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), @@ -174,6 +176,18 @@ impl Writer { self.wtr.write_str(")") } + fn fmt_lookaround_pre(&mut self, ast: &ast::LookAround) -> fmt::Result { + use crate::ast::LookAroundKind::*; + match ast.kind { + PositiveLookBehind => self.wtr.write_str("(?<="), + NegativeLookBehind => self.wtr.write_str("(? fmt::Result { + self.wtr.write_str(")") + } + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { use crate::ast::RepetitionKind::*; match ast.op.kind { @@ -511,6 +525,12 @@ mod tests { roundtrip("(a)"); } + #[test] + fn print_lookaround() { + roundtrip("(?<=a)"); + roundtrip("(? { /// A stack frame allocated just before descending into a group's child /// node. Group(&'a ast::Group), + /// A stack frame allocated just before descending into a look-around's + /// child node. + LookAround(&'a ast::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -270,6 +273,7 @@ impl<'a> HeapVisitor<'a> { } Ast::Repetition(ref x) => Some(Frame::Repetition(x)), Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::LookAround(ref x) => Some(Frame::LookAround(x)), Ast::Concat(ref x) if x.asts.is_empty() => None, Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) @@ -289,6 +293,7 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Group(_) => None, + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -444,6 +449,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.ast, Frame::Group(group) => &group.ast, + Frame::LookAround(look) => &look.ast, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index a5a3737f6..e09879d81 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -172,7 +172,9 @@ impl Extractor { use crate::hir::HirKind::*; match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Empty | Look(_) | LookAround(_) => { + Seq::singleton(self::Literal::exact(vec![])) + } Literal(hir::Literal(ref bytes)) => { let mut seq = Seq::singleton(self::Literal::exact(bytes.to_vec())); @@ -2453,6 +2455,21 @@ mod tests { assert_eq!(expected, e(r"^aZ*b")); } + #[test] + fn lookaround() { + assert_eq!(exact([E("ab")]), e(r"a(?<=qwa)b")); + assert_eq!(exact([E("ab")]), e(r"a(? Hir { + let props = Properties::lookaround(&lookaround); + Hir { kind: HirKind::LookAround(lookaround), props } + } + /// Creates a repetition HIR expression. #[inline] pub fn repetition(mut rep: Repetition) -> Hir { @@ -728,6 +735,8 @@ pub enum HirKind { Class(Class), /// A look-around assertion. A look-around match always has zero length. Look(Look), + /// A look-around subexpression. + LookAround(LookAround), /// A repetition operation applied to a sub-expression. Repetition(Repetition), /// A capturing group, which contains a sub-expression. @@ -761,6 +770,7 @@ impl HirKind { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => &[], + HirKind::LookAround(ref lookaround) => from_ref(lookaround.sub()), HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), HirKind::Concat(ref subs) => subs, @@ -1786,6 +1796,52 @@ impl Look { } } +/// Represents a general look-around assertion. +/// +/// Currently, only lookbehind assertions are supported. +/// Furthermore, capture groups inside assertions are not supported. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum LookAround { + /// A positive lookbehind assertion. + PositiveLookBehind(Box), + /// A negative lookbehind assertion. + NegativeLookBehind(Box), +} + +impl LookAround { + /// Returns a reference to the inner expression that must match for this + /// look-around assertion to hold. + pub fn sub(&self) -> &Hir { + match self { + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } + } + } + + /// Returns a mutable reference to the inner expression. + pub fn sub_mut(&mut self) -> &mut Hir { + match self { + Self::PositiveLookBehind(sub) | Self::NegativeLookBehind(sub) => { + sub + } + } + } + + /// Returns a new look-around of the same kind, but with its + /// sub-expression replaced with the one given. + pub fn with(&self, sub: Hir) -> LookAround { + match self { + Self::PositiveLookBehind(_) => { + Self::PositiveLookBehind(Box::new(sub)) + } + Self::NegativeLookBehind(_) => { + Self::NegativeLookBehind(Box::new(sub)) + } + } + } +} + /// The high-level intermediate representation for a capturing group. /// /// A capturing group always has an index and a child expression. It may @@ -1920,6 +1976,9 @@ impl Drop for Hir { | HirKind::Class(_) | HirKind::Look(_) => return, HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::LookAround(ref x) if x.sub().kind.subs().is_empty() => { + return + } HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { return } @@ -1935,6 +1994,9 @@ impl Drop for Hir { | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} + HirKind::LookAround(ref mut x) => { + stack.push(mem::replace(x.sub_mut(), Hir::empty())); + } HirKind::Capture(ref mut x) => { stack.push(mem::replace(&mut x.sub, Hir::empty())); } @@ -1979,6 +2041,7 @@ struct PropertiesI { look_set_suffix: LookSet, look_set_prefix_any: LookSet, look_set_suffix_any: LookSet, + contains_lookaround_expr: bool, utf8: bool, explicit_captures_len: usize, static_explicit_captures_len: Option, @@ -2072,6 +2135,15 @@ impl Properties { self.0.look_set_suffix_any } + /// Returns whether there are any look-around expressions in this HIR value. + /// + /// Only returns true for [`HirKind::LookAround`] and not for + /// [`HirKind::Look`], which can be queried by [`look_set`](Properties::look_set) instead. + #[inline] + pub fn contains_lookaround_expr(&self) -> bool { + self.0.contains_lookaround_expr + } + /// Return true if and only if the corresponding HIR will always match /// valid UTF-8. /// @@ -2341,6 +2413,7 @@ impl Properties { look_set_suffix: fix, look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len, @@ -2356,6 +2429,8 @@ impl Properties { props.look_set_suffix.set_intersect(p.look_set_suffix()); props.look_set_prefix_any.set_union(p.look_set_prefix_any()); props.look_set_suffix_any.set_union(p.look_set_suffix_any()); + props.contains_lookaround_expr = + props.contains_lookaround_expr || p.contains_lookaround_expr(); props.utf8 = props.utf8 && p.is_utf8(); props.explicit_captures_len = props .explicit_captures_len @@ -2403,6 +2478,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, // It is debatable whether an empty regex always matches at valid // UTF-8 boundaries. Strictly speaking, at a byte oriented view, // it is clearly false. There are, for example, many empty strings @@ -2439,6 +2515,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: core::str::from_utf8(&lit.0).is_ok(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2458,6 +2535,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: class.is_utf8(), explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2477,6 +2555,9 @@ impl Properties { look_set_suffix: LookSet::singleton(look), look_set_prefix_any: LookSet::singleton(look), look_set_suffix_any: LookSet::singleton(look), + // Note, this field represents _general_ lookarounds (ones using + // LookAround) and not assertions (using Look). + contains_lookaround_expr: false, // This requires a little explanation. Basically, we don't consider // matching an empty string to be equivalent to matching invalid // UTF-8, even though technically matching every empty string will @@ -2499,6 +2580,24 @@ impl Properties { Properties(Box::new(inner)) } + /// Create a new set of HIR properties for a look-around. + fn lookaround(lookaround: &LookAround) -> Properties { + let sub_p = lookaround.sub().properties(); + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + literal: false, + alternation_literal: false, + contains_lookaround_expr: true, + // We do not want look-around subexpressions to influence matching + // of the main expression when they contain anchors, so we clear the set. + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + ..*sub_p.0.clone() + }; + Properties(Box::new(inner)) + } + /// Create a new set of HIR properties for a repetition. fn repetition(rep: &Repetition) -> Properties { let p = rep.sub.properties(); @@ -2520,6 +2619,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: p.look_set_prefix_any(), look_set_suffix_any: p.look_set_suffix_any(), + contains_lookaround_expr: p.contains_lookaround_expr(), utf8: p.is_utf8(), explicit_captures_len: p.explicit_captures_len(), static_explicit_captures_len: p.static_explicit_captures_len(), @@ -2581,6 +2681,7 @@ impl Properties { look_set_suffix: LookSet::empty(), look_set_prefix_any: LookSet::empty(), look_set_suffix_any: LookSet::empty(), + contains_lookaround_expr: false, utf8: true, explicit_captures_len: 0, static_explicit_captures_len: Some(0), @@ -2592,6 +2693,8 @@ impl Properties { let p = x.properties(); props.look_set.set_union(p.look_set()); props.utf8 = props.utf8 && p.is_utf8(); + props.contains_lookaround_expr = + props.contains_lookaround_expr || p.contains_lookaround_expr(); props.explicit_captures_len = props .explicit_captures_len .saturating_add(p.explicit_captures_len()); diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index dfa6d4032..4b032fae4 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -227,6 +227,12 @@ impl Visitor for Writer { self.wtr.write_str(r"\b{end-half}")?; } }, + HirKind::LookAround(hir::LookAround::PositiveLookBehind(_)) => { + self.wtr.write_str(r"(?<=")?; + } + HirKind::LookAround(hir::LookAround::NegativeLookBehind(_)) => { + self.wtr.write_str(r"(? { self.wtr.write_str("(")?; if let Some(ref name) = *name { @@ -293,7 +299,8 @@ impl Visitor for Writer { } HirKind::Capture(_) | HirKind::Concat(_) - | HirKind::Alternation(_) => { + | HirKind::Alternation(_) + | HirKind::LookAround(_) => { self.wtr.write_str(r")")?; } } @@ -477,6 +484,17 @@ mod tests { roundtrip("((((a))))", "((((a))))"); } + #[test] + fn print_look_around() { + roundtrip("(?<=)", "(?<=(?:))"); + roundtrip("(? {} + _ => { + panic!( + "tried to unwrap look-around from HirFrame, got: {:?}", + self + ) + } + } + } + /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). @@ -363,6 +382,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::AlternationBranch); } } + Ast::LookAround(_) => self.push(HirFrame::LookAround), _ => {} } Ok(()) @@ -446,6 +466,18 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } + Ast::LookAround(ref x) => { + let expr = Box::new(self.pop().unwrap().unwrap_expr()); + self.pop().unwrap().unwrap_lookaround(); + self.push(HirFrame::Expr(Hir::lookaround(match x.kind { + ast::LookAroundKind::PositiveLookBehind => { + hir::LookAround::PositiveLookBehind(expr) + } + ast::LookAroundKind::NegativeLookBehind => { + hir::LookAround::NegativeLookBehind(expr) + } + }))); + } Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { @@ -759,6 +791,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or concat, got alt branch marker") } + HirFrame::LookAround => { + unreachable!("expected expr or concat, got look-around") + } } } @@ -790,6 +825,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { HirFrame::AlternationBranch => { unreachable!("expected expr or alt, got alt branch marker") } + HirFrame::LookAround => { + unreachable!("expected expr or alt, got look-around") + } } } @@ -1601,6 +1639,15 @@ mod tests { Hir::look(look) } + fn hir_lookbehind(expr: Hir, positive: bool) -> Hir { + let lookaround = if positive { + hir::LookAround::PositiveLookBehind(Box::new(expr)) + } else { + hir::LookAround::NegativeLookBehind(Box::new(expr)) + }; + Hir::lookaround(lookaround) + } + #[test] fn empty() { assert_eq!(t(""), Hir::empty()); @@ -1824,6 +1871,44 @@ mod tests { assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); } + #[test] + fn lookarounds() { + assert_eq!(t("(?<=a)"), hir_lookbehind(hir_lit("a"), true)); + assert_eq!(t("(? { /// A stack frame allocated just before descending into a capture's child /// node. Capture(&'a hir::Capture), + /// A stack frame allocated just before descending into a look-around's + /// child node. + LookAround(&'a hir::LookAround), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -162,6 +165,7 @@ impl<'a> HeapVisitor<'a> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), HirKind::Capture(ref x) => Some(Frame::Capture(x)), + HirKind::LookAround(ref x) => Some(Frame::LookAround(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) @@ -180,6 +184,7 @@ impl<'a> HeapVisitor<'a> { match induct { Frame::Repetition(_) => None, Frame::Capture(_) => None, + Frame::LookAround(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -208,6 +213,7 @@ impl<'a> Frame<'a> { match *self { Frame::Repetition(rep) => &rep.sub, Frame::Capture(capture) => &capture.sub, + Frame::LookAround(lookaround) => lookaround.sub(), Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } diff --git a/testdata/lookaround.toml b/testdata/lookaround.toml new file mode 100644 index 000000000..91fab56a0 --- /dev/null +++ b/testdata/lookaround.toml @@ -0,0 +1,92 @@ +[[test]] +name = "basic lookbehind positive" +regex = "(?<=b)a" +haystack = "ba" +matches = [[1, 2]] + +[[test]] +name = "basic lookbehind negative" +regex = "(?