From cff2201c30bda7b346e3b47875d95a2cf9cafaa3 Mon Sep 17 00:00:00 2001 From: Edwin Cheng Date: Tue, 2 Feb 2021 04:42:37 +0800 Subject: NFA parser for mbe matcher --- crates/mbe/src/expander/matcher.rs | 559 ++++++++++++++++++++++++++++--------- 1 file changed, 432 insertions(+), 127 deletions(-) (limited to 'crates/mbe/src/expander/matcher.rs') diff --git a/crates/mbe/src/expander/matcher.rs b/crates/mbe/src/expander/matcher.rs index e3bd4c09a..9d3d28055 100644 --- a/crates/mbe/src/expander/matcher.rs +++ b/crates/mbe/src/expander/matcher.rs @@ -1,14 +1,74 @@ -//! FIXME: write short doc here +//! An NFA-based parser, which is porting from rustc mbe parsing code +//! +//! See https://github.com/rust-lang/rust/blob/70b18bc2cbac4712020019f5bf57c00905373205/compiler/rustc_expand/src/mbe/macro_parser.rs +//! Here is a quick intro to how the parser works, copied from rustc: +//! +//! A 'position' is a dot in the middle of a matcher, usually represented as a +//! dot. For example `· a $( a )* a b` is a position, as is `a $( · a )* a b`. +//! +//! The parser walks through the input a character at a time, maintaining a list +//! of threads consistent with the current position in the input string: `cur_items`. +//! +//! As it processes them, it fills up `eof_items` with threads that would be valid if +//! the macro invocation is now over, `bb_items` with threads that are waiting on +//! a Rust non-terminal like `$e:expr`, and `next_items` with threads that are waiting +//! on a particular token. Most of the logic concerns moving the · through the +//! repetitions indicated by Kleene stars. The rules for moving the · without +//! consuming any input are called epsilon transitions. It only advances or calls +//! out to the real Rust parser when no `cur_items` threads remain. +//! +//! Example: +//! +//! ```text, ignore +//! Start parsing a a a a b against [· a $( a )* a b]. +//! +//! Remaining input: a a a a b +//! next: [· a $( a )* a b] +//! +//! - - - Advance over an a. - - - +//! +//! Remaining input: a a a b +//! cur: [a · $( a )* a b] +//! Descend/Skip (first item). +//! next: [a $( · a )* a b] [a $( a )* · a b]. +//! +//! - - - Advance over an a. - - - +//! +//! Remaining input: a a b +//! cur: [a $( a · )* a b] [a $( a )* a · b] +//! Follow epsilon transition: Finish/Repeat (first item) +//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b] +//! +//! - - - Advance over an a. - - - (this looks exactly like the last step) +//! +//! Remaining input: a b +//! cur: [a $( a · )* a b] [a $( a )* a · b] +//! Follow epsilon transition: Finish/Repeat (first item) +//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b] +//! +//! - - - Advance over an a. - - - (this looks exactly like the last step) +//! +//! Remaining input: b +//! cur: [a $( a · )* a b] [a $( a )* a · b] +//! Follow epsilon transition: Finish/Repeat (first item) +//! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b] +//! +//! - - - Advance over a b. - - - +//! +//! Remaining input: '' +//! eof: [a $( a )* a b ·] +//! ``` use crate::{ expander::{Binding, Bindings, Fragment}, - parser::{Op, RepeatKind, Separator}, + parser::{Op, OpDelimited, OpDelimitedIter, RepeatKind, Separator}, tt_iter::TtIter, ExpandError, MetaTemplate, }; use super::ExpandResult; use parser::FragmentKind::*; +use smallvec::{smallvec, SmallVec}; use syntax::SmolStr; impl Bindings { @@ -16,19 +76,19 @@ impl Bindings { // FIXME: Do we have a better way to represent an empty token ? // Insert an empty subtree for empty token let tt = tt::Subtree::default().into(); - self.inner.insert(name.clone(), Binding::Fragment(Fragment::Tokens(tt))); + self.inner.push((name.clone(), Binding::Fragment(Fragment::Tokens(tt)))); } fn push_empty(&mut self, name: &SmolStr) { - self.inner.insert(name.clone(), Binding::Empty); + self.inner.push((name.clone(), Binding::Empty)); } fn push_nested(&mut self, idx: usize, nested: Bindings) -> Result<(), ExpandError> { for (key, value) in nested.inner { - if !self.inner.contains_key(&key) { - self.inner.insert(key.clone(), Binding::Nested(Vec::new())); + if self.get_mut(&key).is_none() { + self.inner.push((key.clone(), Binding::Nested(Vec::new()))); } - match self.inner.get_mut(&key) { + match self.get_mut(&key) { Some(Binding::Nested(it)) => { // insert empty nested bindings before this one while it.len() < idx { @@ -46,6 +106,14 @@ impl Bindings { } Ok(()) } + + fn get_mut(&mut self, name: &str) -> Option<&mut Binding> { + self.inner.iter_mut().find_map(|(n, b)| if n == name { Some(b) } else { None }) + } + + fn bindings(&self) -> impl Iterator { + self.inner.iter().map(|(_, b)| b) + } } macro_rules! err { @@ -57,7 +125,7 @@ macro_rules! err { }; } -#[derive(Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub(super) struct Match { pub(super) bindings: Bindings, /// We currently just keep the first error and count the rest to compare matches. @@ -65,6 +133,8 @@ pub(super) struct Match { pub(super) err_count: usize, /// How many top-level token trees were left to match. pub(super) unmatched_tts: usize, + /// The number of bound variables + pub(super) bound_count: usize, } impl Match { @@ -76,72 +146,373 @@ impl Match { } /// Matching errors are added to the `Match`. -pub(super) fn match_(pattern: &MetaTemplate, src: &tt::Subtree) -> Match { - let mut res = Match::default(); - let mut src = TtIter::new(src); +pub(super) fn match_(pattern: &MetaTemplate, input: &tt::Subtree) -> Match { + let mut res = match_loop(pattern, &input); + res.bound_count = count(res.bindings.bindings()); + return res; + + fn count<'a>(bindings: impl Iterator) -> usize { + bindings + .map(|it| match it { + Binding::Fragment(_) => 1, + Binding::Empty => 1, + Binding::Nested(it) => count(it.iter()), + }) + .sum() + } +} - match_tokens(&mut res, pattern, &mut src); +#[derive(Debug, Clone)] +struct MatchState<'t> { + /// The position of the "dot" in this matcher + dot: OpDelimitedIter<'t>, - if src.len() > 0 { - res.unmatched_tts += src.len(); - res.add_err(err!("leftover tokens")); - } + /// Token subtree stack + /// When matching against matchers with nested delimited submatchers (e.g., `pat ( pat ( .. ) + /// pat ) pat`), we need to keep track of the matchers we are descending into. This stack does + /// that where the bottom of the stack is the outermost matcher. + stack: SmallVec<[OpDelimitedIter<'t>; 4]>, + + /// The "parent" matcher position if we are in a repetition. That is, the matcher position just + /// before we enter the repetition. + up: Option>>, + + /// The separator if we are in a repetition. + sep: Option, + + /// The KleeneOp of this sequence if we are in a repetition. + sep_kind: Option, - res + /// Number of tokens of seperator parsed + sep_parsed: Option, + + /// Matched meta variables bindings + bindings: SmallVec<[Bindings; 4]>, + + /// Cached result of meta variable parsing + meta_result: Option<(TtIter<'t>, ExpandResult>)>, + + /// Is error occuried in this state, will `poised` to "parent" + is_error: bool, } -fn match_tokens(res: &mut Match, pattern: &MetaTemplate, src: &mut TtIter) { - for op in pattern.iter() { - match op { - Op::Leaf(lhs) => { - if let Err(err) = match_leaf(lhs, src) { - res.add_err(err); - continue; +/// Process the matcher positions of `cur_items` until it is empty. In the process, this will +/// produce more items in `next_items`, `eof_items`, and `bb_items`. +/// +/// For more info about the how this happens, see the module-level doc comments and the inline +/// comments of this function. +/// +/// # Parameters +/// +/// - `src`: the current token of the parser. +/// - `stack`: the "parent" frames of the token tree +/// - `res`: the match result to store errors +/// - `cur_items`: the set of current items to be processed. This should be empty by the end of a +/// successful execution of this function. +/// - `next_items`: the set of newly generated items. These are used to replenish `cur_items` in +/// the function `parse`. +/// - `eof_items`: the set of items that would be valid if this was the EOF. +/// - `bb_items`: the set of items that are waiting for the black-box parser. +/// - `error_items`: the set of items in errors, used for error-resilient parsing +fn match_loop_inner<'t>( + src: TtIter<'t>, + stack: &[TtIter<'t>], + res: &mut Match, + cur_items: &mut SmallVec<[MatchState<'t>; 1]>, + bb_items: &mut SmallVec<[MatchState<'t>; 1]>, + next_items: &mut Vec>, + eof_items: &mut SmallVec<[MatchState<'t>; 1]>, + error_items: &mut SmallVec<[MatchState<'t>; 1]>, +) { + macro_rules! try_push { + ($items: expr, $it:expr) => { + if $it.is_error { + error_items.push($it); + } else { + $items.push($it); + } + }; + } + + while let Some(mut item) = cur_items.pop() { + while item.dot.is_eof() { + match item.stack.pop() { + Some(frame) => { + item.dot = frame; + item.dot.next(); } + None => break, } - Op::Subtree { tokens, delimiter: delim } => { - let rhs = match src.expect_subtree() { - Ok(s) => s, - Err(()) => { - res.add_err(err!("expected subtree")); - continue; + } + let op = match item.dot.peek() { + None => { + // We are at or past the end of the matcher of `item`. + if item.up.is_some() { + if item.sep_parsed.is_none() { + // Get the `up` matcher + let mut new_pos = *item.up.clone().unwrap(); + // Add matches from this repetition to the `matches` of `up` + if let Some(bindings) = new_pos.bindings.last_mut() { + for (i, b) in item.bindings.iter_mut().enumerate() { + bindings.push_nested(i, b.clone()).unwrap(); + } + } + // Move the "dot" past the repetition in `up` + new_pos.dot.next(); + new_pos.is_error = new_pos.is_error || item.is_error; + cur_items.push(new_pos); + } + + // Check if we need a separator. + // We check the separator one by one + let sep_idx = *item.sep_parsed.as_ref().unwrap_or(&0); + let sep_len = item.sep.as_ref().map_or(0, Separator::tt_count); + if item.sep.is_some() && sep_idx != sep_len { + let sep = item.sep.as_ref().unwrap(); + if src.clone().expect_separator(&sep, sep_idx) { + item.dot.next(); + item.sep_parsed = Some(sep_idx + 1); + try_push!(next_items, item); + } + } + // We don't need a separator. Move the "dot" back to the beginning of the matcher + // and try to match again UNLESS we are only allowed to have _one_ repetition. + else if item.sep_kind != Some(RepeatKind::ZeroOrOne) { + item.dot = item.dot.reset(); + item.sep_parsed = None; + item.bindings.push(Bindings::default()); + cur_items.push(item); + } + } else { + // If we are not in a repetition, then being at the end of a matcher means that we have + // reached the potential end of the input. + try_push!(eof_items, item); + } + continue; + } + Some(it) => it, + }; + + // We are in the middle of a matcher. + match op { + OpDelimited::Op(Op::Repeat { tokens, kind, separator }) => { + if matches!(kind, RepeatKind::ZeroOrMore | RepeatKind::ZeroOrOne) { + let mut new_item = item.clone(); + new_item.dot.next(); + let mut vars = Vec::new(); + let bindings = new_item.bindings.last_mut().unwrap(); + collect_vars(&mut vars, tokens); + for var in vars { + bindings.push_empty(&var); } - }; - if delim.map(|it| it.kind) != rhs.delimiter_kind() { - res.add_err(err!("mismatched delimiter")); - continue; + cur_items.push(new_item); } - let mut src = TtIter::new(rhs); - match_tokens(res, tokens, &mut src); - if src.len() > 0 { - res.add_err(err!("leftover tokens")); + cur_items.push(MatchState { + dot: tokens.iter_delimited(None), + stack: Default::default(), + up: Some(Box::new(item)), + sep: separator.clone(), + sep_kind: Some(*kind), + sep_parsed: None, + bindings: smallvec![Bindings::default()], + meta_result: None, + is_error: false, + }) + } + OpDelimited::Op(Op::Subtree { tokens, delimiter }) => { + if let Ok(subtree) = src.clone().expect_subtree() { + if subtree.delimiter_kind() == delimiter.map(|it| it.kind) { + item.stack.push(item.dot); + item.dot = tokens.iter_delimited(delimiter.as_ref()); + cur_items.push(item); + } } } - Op::Var { name, kind, .. } => { - let kind = match kind { - Some(k) => k, - None => { - res.add_err(ExpandError::UnexpectedToken); - continue; + OpDelimited::Op(Op::Var { kind, name, .. }) => { + if let Some(kind) = kind { + let mut fork = src.clone(); + let match_res = match_meta_var(kind.as_str(), &mut fork); + match match_res.err { + None => { + // Some meta variables are optional (e.g. vis) + if match_res.value.is_some() { + item.meta_result = Some((fork, match_res)); + try_push!(bb_items, item); + } else { + item.bindings.last_mut().unwrap().push_optional(name); + item.dot.next(); + cur_items.push(item); + } + } + Some(err) => { + res.add_err(err); + match match_res.value { + Some(fragment) => { + item.bindings + .last_mut() + .unwrap() + .inner + .push((name.clone(), Binding::Fragment(fragment))); + } + _ => {} + } + item.is_error = true; + error_items.push(item); + } } - }; - let ExpandResult { value: matched, err: match_err } = - match_meta_var(kind.as_str(), src); - match matched { + } + } + OpDelimited::Op(Op::Leaf(leaf)) => { + if let Err(err) = match_leaf(&leaf, &mut src.clone()) { + res.add_err(err); + item.is_error = true; + } else { + item.dot.next(); + } + try_push!(next_items, item); + } + OpDelimited::Open => { + if matches!(src.clone().next(), Some(tt::TokenTree::Subtree(..))) { + item.dot.next(); + try_push!(next_items, item); + } + } + OpDelimited::Close => { + let is_delim_closed = src.peek_n(0).is_none() && !stack.is_empty(); + if is_delim_closed { + item.dot.next(); + try_push!(next_items, item); + } + } + } + } +} + +fn match_loop(pattern: &MetaTemplate, src: &tt::Subtree) -> Match { + let mut src = TtIter::new(src); + let mut stack: SmallVec<[TtIter; 1]> = SmallVec::new(); + let mut res = Match::default(); + let mut error_reover_item = None; + + let mut cur_items = smallvec![MatchState { + dot: pattern.iter_delimited(None), + stack: Default::default(), + up: None, + sep: None, + sep_kind: None, + sep_parsed: None, + bindings: smallvec![Bindings::default()], + is_error: false, + meta_result: None, + }]; + + let mut next_items = vec![]; + + loop { + let mut bb_items = SmallVec::new(); + let mut eof_items = SmallVec::new(); + let mut error_items = SmallVec::new(); + + stdx::always!(next_items.is_empty()); + + match_loop_inner( + src.clone(), + &stack, + &mut res, + &mut cur_items, + &mut bb_items, + &mut next_items, + &mut eof_items, + &mut error_items, + ); + stdx::always!(cur_items.is_empty()); + + if error_items.len() > 0 { + error_reover_item = error_items.pop(); + } else if eof_items.len() > 0 { + error_reover_item = Some(eof_items[0].clone()); + } + + // We need to do some post processing after the `match_loop_inner`. + // If we reached the EOF, check that there is EXACTLY ONE possible matcher. Otherwise, + // either the parse is ambiguous (which should never happen) or there is a syntax error. + if src.peek_n(0).is_none() && stack.is_empty() { + if eof_items.len() == 1 { + // remove all errors, because it is the correct answer ! + res = Match::default(); + res.bindings = eof_items[0].bindings[0].clone(); + } else { + // Error recovery + if error_reover_item.is_some() { + res.bindings = error_reover_item.unwrap().bindings[0].clone(); + } + res.add_err(ExpandError::UnexpectedToken); + } + return res; + } + + // If there are no possible next positions AND we aren't waiting for the black-box parser, + // then there is a syntax error. + // + // Another possibility is that we need to call out to parse some rust nonterminal + // (black-box) parser. However, if there is not EXACTLY ONE of these, something is wrong. + if (bb_items.is_empty() && next_items.is_empty()) + || (!bb_items.is_empty() && !next_items.is_empty()) + || bb_items.len() > 1 + { + res.unmatched_tts += src.len(); + while let Some(it) = stack.pop() { + src = it; + res.unmatched_tts += src.len(); + } + res.add_err(err!("leftover tokens")); + + if let Some(mut error_reover_item) = error_reover_item { + res.bindings = error_reover_item.bindings.remove(0); + } + return res; + } + // Dump all possible `next_items` into `cur_items` for the next iteration. + else if !next_items.is_empty() { + // Now process the next token + cur_items.extend(next_items.drain(..)); + + match src.next() { + Some(tt::TokenTree::Subtree(subtree)) => { + stack.push(src.clone()); + src = TtIter::new(subtree); + } + None if !stack.is_empty() => src = stack.pop().unwrap(), + _ => (), + } + } + // Finally, we have the case where we need to call the black-box parser to get some + // nonterminal. + else { + stdx::always!(bb_items.len() == 1); + let mut item = bb_items.pop().unwrap(); + + if let Some(OpDelimited::Op(Op::Var { name, .. })) = item.dot.peek() { + let (iter, match_res) = item.meta_result.take().unwrap(); + let bindings = item.bindings.last_mut().unwrap(); + match match_res.value { Some(fragment) => { - res.bindings.inner.insert(name.clone(), Binding::Fragment(fragment)); + bindings.inner.push((name.clone(), Binding::Fragment(fragment))); } - None if match_err.is_none() => res.bindings.push_optional(name), + None if match_res.err.is_none() => bindings.push_optional(name), _ => {} } - if let Some(err) = match_err { + if let Some(err) = match_res.err { res.add_err(err); } + src = iter.clone(); + item.dot.next(); + } else { + unreachable!() } - Op::Repeat { tokens: subtree, kind, separator } => { - match_repeat(res, subtree, *kind, separator, src); - } + cur_items.push(item); } + stdx::always!(!cur_items.is_empty()); } } @@ -173,73 +544,6 @@ fn match_leaf(lhs: &tt::Leaf, src: &mut TtIter) -> Result<(), ExpandError> { Ok(()) } -fn match_repeat( - res: &mut Match, - pattern: &MetaTemplate, - kind: RepeatKind, - separator: &Option, - src: &mut TtIter, -) { - // Dirty hack to make macro-expansion terminate. - // This should be replaced by a proper macro-by-example implementation - let mut limit = 65536; - let mut counter = 0; - - for i in 0.. { - let mut fork = src.clone(); - - if let Some(separator) = &separator { - if i != 0 && !fork.eat_separator(separator) { - break; - } - } - - let mut nested = Match::default(); - match_tokens(&mut nested, pattern, &mut fork); - if nested.err.is_none() { - limit -= 1; - if limit == 0 { - log::warn!( - "match_lhs exceeded repeat pattern limit => {:#?}\n{:#?}\n{:#?}\n{:#?}", - pattern, - src, - kind, - separator - ); - break; - } - *src = fork; - - if let Err(err) = res.bindings.push_nested(counter, nested.bindings) { - res.add_err(err); - } - counter += 1; - if counter == 1 { - if let RepeatKind::ZeroOrOne = kind { - break; - } - } - } else { - break; - } - } - - match (kind, counter) { - (RepeatKind::OneOrMore, 0) => { - res.add_err(ExpandError::UnexpectedToken); - } - (_, 0) => { - // Collect all empty variables in subtrees - let mut vars = Vec::new(); - collect_vars(&mut vars, pattern); - for var in vars { - res.bindings.push_empty(&var) - } - } - _ => (), - } -} - fn match_meta_var(kind: &str, input: &mut TtIter) -> ExpandResult> { let fragment = match kind { "path" => Path, @@ -303,14 +607,14 @@ fn collect_vars(buf: &mut Vec, pattern: &MetaTemplate) { } impl<'a> TtIter<'a> { - fn eat_separator(&mut self, separator: &Separator) -> bool { + fn expect_separator(&mut self, separator: &Separator, idx: usize) -> bool { let mut fork = self.clone(); let ok = match separator { - Separator::Ident(lhs) => match fork.expect_ident() { + Separator::Ident(lhs) if idx == 0 => match fork.expect_ident() { Ok(rhs) => rhs.text == lhs.text, _ => false, }, - Separator::Literal(lhs) => match fork.expect_literal() { + Separator::Literal(lhs) if idx == 0 => match fork.expect_literal() { Ok(rhs) => match rhs { tt::Leaf::Literal(rhs) => rhs.text == lhs.text, tt::Leaf::Ident(rhs) => rhs.text == lhs.text, @@ -318,10 +622,11 @@ impl<'a> TtIter<'a> { }, _ => false, }, - Separator::Puncts(lhss) => lhss.iter().all(|lhs| match fork.expect_punct() { - Ok(rhs) => rhs.char == lhs.char, + Separator::Puncts(lhss) if idx < lhss.len() => match fork.expect_punct() { + Ok(rhs) => rhs.char == lhss[idx].char, _ => false, - }), + }, + _ => false, }; if ok { *self = fork; -- cgit v1.2.3