Skip to content

Commit 83fec9a

Browse files
authored
Merge pull request #1827 from rust-lang/TC/parse-without-regex
Parse grammar without regexes
2 parents c703c8d + 1a7304b commit 83fec9a

File tree

1 file changed

+68
-48
lines changed

1 file changed

+68
-48
lines changed

mdbook-spec/src/grammar/parser.rs

Lines changed: 68 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
11
//! A parser of the EBNF-like grammar.
22
33
use super::{Characters, Expression, ExpressionKind, Grammar, Production};
4-
use regex::{Captures, Regex};
54
use std::fmt;
65
use std::fmt::Display;
76
use std::path::Path;
8-
use std::sync::LazyLock;
97

108
struct Parser<'a> {
119
input: &'a str,
@@ -76,18 +74,6 @@ impl Parser<'_> {
7674
&self.input[i..i + upper]
7775
}
7876

79-
/// If the input matches the given regex, it is returned and the head is moved forward.
80-
///
81-
/// Note that regexes must start with `^`.
82-
fn take_re(&mut self, re: &Regex) -> Option<Captures<'_>> {
83-
if let Some(cap) = re.captures(&self.input[self.index..]) {
84-
self.index += cap[0].len();
85-
Some(cap)
86-
} else {
87-
None
88-
}
89-
}
90-
9177
/// Returns whether or not the given string is next, and advances the head if it is.
9278
fn take_str(&mut self, s: &str) -> bool {
9379
if self.input[self.index..].starts_with(s) {
@@ -168,13 +154,12 @@ impl Parser<'_> {
168154
}
169155

170156
fn parse_expression(&mut self) -> Result<Option<Expression>> {
171-
static ALT_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^ *\| *").unwrap());
172-
173157
let mut es = Vec::new();
174158
loop {
175159
let Some(e) = self.parse_seq()? else { break };
176160
es.push(e);
177-
if self.take_re(&ALT_RE).is_none() {
161+
_ = self.space0();
162+
if !self.take_str("|") {
178163
break;
179164
}
180165
}
@@ -268,21 +253,28 @@ impl Parser<'_> {
268253
Some(ExpressionKind::Nt(nt))
269254
}
270255

256+
/// Parse terminal within backticks.
271257
fn parse_terminal(&mut self) -> Result<ExpressionKind> {
272-
static TERMINAL_RE: LazyLock<Regex> =
273-
LazyLock::new(|| Regex::new(r"^`([^`\n]+)`").unwrap());
274-
match self.take_re(&TERMINAL_RE) {
275-
Some(cap) => Ok(ExpressionKind::Terminal(cap[1].to_string())),
276-
None => bail!(self, "unterminated terminal, expected closing backtick"),
258+
Ok(ExpressionKind::Terminal(self.parse_terminal_str()?))
259+
}
260+
261+
/// Parse string within backticks.
262+
fn parse_terminal_str(&mut self) -> Result<String> {
263+
self.expect("`", "expected opening backtick")?;
264+
let term = self.take_while(&|x| !['\n', '`'].contains(&x)).to_string();
265+
if term.is_empty() {
266+
bail!(self, "expected terminal");
277267
}
268+
self.expect("`", "expected closing backtick")?;
269+
Ok(term)
278270
}
279271

280272
fn parse_charset(&mut self) -> Result<ExpressionKind> {
281273
self.expect("[", "expected opening [")?;
282274
let mut characters = Vec::new();
283275
loop {
284276
self.space0();
285-
let Some(ch) = self.parse_characters() else {
277+
let Some(ch) = self.parse_characters()? else {
286278
break;
287279
};
288280
characters.push(ch);
@@ -295,27 +287,48 @@ impl Parser<'_> {
295287
Ok(ExpressionKind::Charset(characters))
296288
}
297289

298-
fn parse_characters(&mut self) -> Option<Characters> {
299-
static RANGE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^`(.)`-`(.)`").unwrap());
300-
static TERMINAL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new("^`([^`\n]+)`").unwrap());
301-
if let Some(cap) = self.take_re(&RANGE_RE) {
302-
let a = cap[1].chars().next().unwrap();
303-
let b = cap[2].chars().next().unwrap();
304-
Some(Characters::Range(a, b))
305-
} else if let Some(cap) = self.take_re(&TERMINAL_RE) {
306-
Some(Characters::Terminal(cap[1].to_string()))
290+
/// Parse an element of a character class, e.g.
291+
/// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
292+
fn parse_characters(&mut self) -> Result<Option<Characters>> {
293+
if let Some(b'`') = self.peek() {
294+
let recov = self.index;
295+
let a = self.parse_terminal_str()?;
296+
if self.take_str("-") {
297+
//~^ Parse `` `a`-`b` `` character range.
298+
if a.len() > 1 {
299+
self.index = recov + 1;
300+
bail!(self, "invalid start terminal in range");
301+
}
302+
let recov = self.index;
303+
let b = self.parse_terminal_str()?;
304+
if b.len() > 1 {
305+
self.index = recov + 1;
306+
bail!(self, "invalid end terminal in range");
307+
}
308+
let a = a.chars().next().unwrap();
309+
let b = b.chars().next().unwrap();
310+
Ok(Some(Characters::Range(a, b)))
311+
} else {
312+
//~^ Parse terminal in backticks.
313+
Ok(Some(Characters::Terminal(a)))
314+
}
315+
} else if let Some(name) = self.parse_name() {
316+
//~^ Parse nonterminal identifier.
317+
Ok(Some(Characters::Named(name)))
307318
} else {
308-
let name = self.parse_name()?;
309-
Some(Characters::Named(name))
319+
Ok(None)
310320
}
311321
}
312322

323+
/// Parse e.g. `<prose text>`.
313324
fn parse_prose(&mut self) -> Result<ExpressionKind> {
314-
static PROSE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^<([^>\n]+)>").unwrap());
315-
match self.take_re(&PROSE_RE) {
316-
Some(cap) => Ok(ExpressionKind::Prose(cap[1].to_string())),
317-
None => bail!(self, "unterminated prose, expected closing `>`"),
325+
self.expect("<", "expected opening `<`")?;
326+
let text = self.take_while(&|x| !['\n', '>'].contains(&x)).to_string();
327+
if text.is_empty() {
328+
bail!(self, "expected prose text");
318329
}
330+
self.expect(">", "expected closing `>`")?;
331+
Ok(ExpressionKind::Prose(text))
319332
}
320333

321334
fn parse_grouped(&mut self) -> Result<ExpressionKind> {
@@ -344,13 +357,19 @@ impl Parser<'_> {
344357
Ok(ExpressionKind::NegExpression(box_kind(kind)))
345358
}
346359

360+
/// Parse e.g. `F00F` after `U+`.
347361
fn parse_unicode(&mut self) -> Result<ExpressionKind> {
348-
static UNICODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[A-Z0-9]{4}").unwrap());
349-
350-
match self.take_re(&UNICODE_RE) {
351-
Some(s) => Ok(ExpressionKind::Unicode(s[0].to_string())),
352-
None => bail!(self, "expected 4 hexadecimal uppercase digits after U+"),
362+
let mut xs = Vec::with_capacity(4);
363+
for _ in 0..4 {
364+
match self.peek() {
365+
Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => {
366+
xs.push(x);
367+
self.index += 1;
368+
}
369+
_ => bail!(self, "expected 4 uppercase hexidecimal digits after `U+`"),
370+
}
353371
}
372+
Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap()))
354373
}
355374

356375
/// Parse `?` after expression.
@@ -428,16 +447,17 @@ impl Parser<'_> {
428447
Ok(Some(self.input[start..self.index - 1].to_string()))
429448
}
430449

450+
/// Parse footnote reference, e.g. `[^id]`.
431451
fn parse_footnote(&mut self) -> Result<Option<String>> {
432-
static FOOTNOTE_RE: LazyLock<Regex> =
433-
LazyLock::new(|| Regex::new(r"^([^\]\n]+)]").unwrap());
434452
if !self.take_str("[^") {
435453
return Ok(None);
436454
}
437-
match self.take_re(&FOOTNOTE_RE) {
438-
Some(cap) => Ok(Some(cap[1].to_string())),
439-
None => bail!(self, "unterminated footnote, expected closing `]`"),
455+
let id = self.take_while(&|x| !['\n', ']'].contains(&x)).to_string();
456+
if id.is_empty() {
457+
bail!(self, "expected footnote id");
440458
}
459+
self.expect("]", "expected closing `]`")?;
460+
Ok(Some(id))
441461
}
442462
}
443463

0 commit comments

Comments
 (0)