From a1c187eef3ba08076aedb5154929f7eda8d1b424 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Wed, 12 Aug 2020 18:26:51 +0200
Subject: Rename ra_syntax -> syntax

---
 crates/syntax/src/parsing/lexer.rs             | 244 +++++++++++++
 crates/syntax/src/parsing/reparsing.rs         | 455 +++++++++++++++++++++++++
 crates/syntax/src/parsing/text_token_source.rs |  84 +++++
 crates/syntax/src/parsing/text_tree_sink.rs    | 183 ++++++++++
 4 files changed, 966 insertions(+)
 create mode 100644 crates/syntax/src/parsing/lexer.rs
 create mode 100644 crates/syntax/src/parsing/reparsing.rs
 create mode 100644 crates/syntax/src/parsing/text_token_source.rs
 create mode 100644 crates/syntax/src/parsing/text_tree_sink.rs

(limited to 'crates/syntax/src/parsing')

diff --git a/crates/syntax/src/parsing/lexer.rs b/crates/syntax/src/parsing/lexer.rs
new file mode 100644
index 000000000..fa3be1016
--- /dev/null
+++ b/crates/syntax/src/parsing/lexer.rs
@@ -0,0 +1,244 @@
+//! The lexer analyzes the raw input string and produces lexemes (tokens).
+//! It is just a bridge to `rustc_lexer`.
+
+use rustc_lexer::{LiteralKind as LK, RawStrError};
+
+use std::convert::TryInto;
+
+use crate::{
+    SyntaxError,
+    SyntaxKind::{self, *},
+    TextRange, TextSize, T,
+};
+
+/// A token of Rust source.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Token {
+    /// The kind of token.
+    pub kind: SyntaxKind,
+    /// The length of the token.
+    pub len: TextSize,
+}
+
+/// Breaks a string up into its component tokens.
+/// Beware that it checks for a shebang first; the shebang's length contributes
+/// to the resulting tokens' offsets.
+pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
+    // A non-empty string is a precondition of `rustc_lexer::strip_shebang()`.
+    if text.is_empty() {
+        return Default::default();
+    }
+
+    let mut tokens = Vec::new();
+    let mut errors = Vec::new();
+
+    let mut offset = match rustc_lexer::strip_shebang(text) {
+        Some(shebang_len) => {
+            tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() });
+            shebang_len
+        }
+        None => 0,
+    };
+
+    let text_without_shebang = &text[offset..];
+
+    for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
+        let token_len: TextSize = rustc_token.len.try_into().unwrap();
+        let token_range = TextRange::at(offset.try_into().unwrap(), token_len);
+
+        let (syntax_kind, err_message) =
+            rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
+
+        tokens.push(Token { kind: syntax_kind, len: token_len });
+
+        if let Some(err_message) = err_message {
+            errors.push(SyntaxError::new(err_message, token_range));
+        }
+
+        offset += rustc_token.len;
+    }
+
+    (tokens, errors)
+}
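+
+// NOTE: An illustrative sketch added by the editor, not part of the original
+// commit. Assuming the definitions above, `tokenize` might be exercised like
+// this (`LET_KW` and `WHITESPACE` are `SyntaxKind` variants, in scope via the
+// glob import):
+//
+//     let (tokens, errors) = tokenize("let x = 92;");
+//     assert!(errors.is_empty());
+//     let kinds: Vec<SyntaxKind> = tokens.iter().map(|t| t.kind).collect();
+//     assert_eq!(kinds[0], LET_KW);     // keyword recognized via `from_keyword`
+//     assert_eq!(kinds[1], WHITESPACE); // trivia is kept, not skipped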
+
+/// Returns the `SyntaxKind` and an `Option<SyntaxError>` for the first token
+/// encountered at the beginning of the string.
+///
+/// Returns `None` if the string contains zero *or two or more* tokens.
+/// The token is malformed if the returned error is not `None`.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
+    lex_first_token(text)
+        .filter(|(token, _)| token.len == TextSize::of(text))
+        .map(|(token, error)| (token.kind, error))
+}
+
+/// The same as `lex_single_syntax_kind()` but returns only the `SyntaxKind`,
+/// and returns `None` if any tokenization error occurred.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> {
+    lex_first_token(text)
+        .filter(|(token, error)| error.is_none() && token.len == TextSize::of(text))
+        .map(|(token, _error)| token.kind)
+}
+
+/// Returns the `Token` and an `Option<SyntaxError>` for the first token
+/// encountered at the beginning of the string.
+///
+/// Returns `None` if the string contains zero tokens.
+/// The token is malformed if the returned error is not `None`.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
+    // A non-empty string is a precondition of `rustc_lexer::first_token()`.
+    if text.is_empty() {
+        return None;
+    }
+
+    let rustc_token = rustc_lexer::first_token(text);
+    let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
+
+    let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() };
+    let optional_error = err_message
+        .map(|err_message| SyntaxError::new(err_message, TextRange::up_to(TextSize::of(text))));
+
+    Some((token, optional_error))
+}
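+
+// NOTE: An illustrative sketch added by the editor, not part of the original
+// commit. The difference between the two single-token entry points:
+//
+//     // One complete, well-formed token: a kind plus no error.
+//     assert_eq!(lex_single_syntax_kind("ident"), Some((IDENT, None)));
+//     // Two tokens in the input: both helpers reject it.
+//     assert_eq!(lex_single_syntax_kind("a b"), None);
+//     // A malformed single token: the kind is returned alongside an error,
+//     // so `lex_single_valid_syntax_kind` returns `None` instead.
+//     assert!(matches!(lex_single_syntax_kind("\"open"), Some((STRING, Some(_)))));
+//     assert_eq!(lex_single_valid_syntax_kind("\"open"), None);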
+
+/// Returns the `SyntaxKind` and an optional tokenization error message.
+fn rustc_token_kind_to_syntax_kind(
+    rustc_token_kind: &rustc_lexer::TokenKind,
+    token_text: &str,
+) -> (SyntaxKind, Option<&'static str>) {
+    // A note on an intended tradeoff:
+    // We drop some useful information here (see patterns with double dots `..`).
+    // Storing that info in `SyntaxKind` is not possible due to its layout
+    // requirement of being a `u16`, which comes from `rowan::SyntaxKind`.
+
+    let syntax_kind = {
+        match rustc_token_kind {
+            rustc_lexer::TokenKind::LineComment => COMMENT,
+
+            rustc_lexer::TokenKind::BlockComment { terminated: true } => COMMENT,
+            rustc_lexer::TokenKind::BlockComment { terminated: false } => {
+                return (
+                    COMMENT,
+                    Some("Missing trailing `*/` symbols to terminate the block comment"),
+                );
+            }
+
+            rustc_lexer::TokenKind::Whitespace => WHITESPACE,
+
+            rustc_lexer::TokenKind::Ident => {
+                if token_text == "_" {
+                    UNDERSCORE
+                } else {
+                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
+                }
+            }
+
+            rustc_lexer::TokenKind::RawIdent => IDENT,
+            rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(&kind),
+
+            rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME,
+            rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => {
+                return (LIFETIME, Some("Lifetime name cannot start with a number"))
+            }
+
+            rustc_lexer::TokenKind::Semi => T![;],
+            rustc_lexer::TokenKind::Comma => T![,],
+            rustc_lexer::TokenKind::Dot => T![.],
+            rustc_lexer::TokenKind::OpenParen => T!['('],
+            rustc_lexer::TokenKind::CloseParen => T![')'],
+            rustc_lexer::TokenKind::OpenBrace => T!['{'],
+            rustc_lexer::TokenKind::CloseBrace => T!['}'],
+            rustc_lexer::TokenKind::OpenBracket => T!['['],
+            rustc_lexer::TokenKind::CloseBracket => T![']'],
+            rustc_lexer::TokenKind::At => T![@],
+            rustc_lexer::TokenKind::Pound => T![#],
+            rustc_lexer::TokenKind::Tilde => T![~],
+            rustc_lexer::TokenKind::Question => T![?],
+            rustc_lexer::TokenKind::Colon => T![:],
+            rustc_lexer::TokenKind::Dollar => T![$],
+            rustc_lexer::TokenKind::Eq => T![=],
+            rustc_lexer::TokenKind::Not => T![!],
+            rustc_lexer::TokenKind::Lt => T![<],
+            rustc_lexer::TokenKind::Gt => T![>],
+            rustc_lexer::TokenKind::Minus => T![-],
+            rustc_lexer::TokenKind::And => T![&],
+            rustc_lexer::TokenKind::Or => T![|],
+            rustc_lexer::TokenKind::Plus => T![+],
+            rustc_lexer::TokenKind::Star => T![*],
+            rustc_lexer::TokenKind::Slash => T![/],
+            rustc_lexer::TokenKind::Caret => T![^],
+            rustc_lexer::TokenKind::Percent => T![%],
+            rustc_lexer::TokenKind::Unknown => ERROR,
+        }
+    };
+
+    return (syntax_kind, None);
+
+    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
+        #[rustfmt::skip]
+        let syntax_kind = match *kind {
+            LK::Int { empty_int: false, .. } => INT_NUMBER,
+            LK::Int { empty_int: true, .. } => {
+                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
+            }
+
+            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
+            LK::Float { empty_exponent: true, .. } => {
+                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
+            }
+
+            LK::Char { terminated: true } => CHAR,
+            LK::Char { terminated: false } => {
+                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
+            }
+
+            LK::Byte { terminated: true } => BYTE,
+            LK::Byte { terminated: false } => {
+                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
+            }
+
+            LK::Str { terminated: true } => STRING,
+            LK::Str { terminated: false } => {
+                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
+            }
+
+            LK::ByteStr { terminated: true } => BYTE_STRING,
+            LK::ByteStr { terminated: false } => {
+                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
+            }
+
+            LK::RawStr { err, .. } => match err {
+                None => RAW_STRING,
+                Some(RawStrError::InvalidStarter { .. }) => return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal")),
+                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
+                    return (RAW_STRING, Some("Missing trailing `\"` to terminate the raw string literal"))
+                } else {
+                    return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))
+                },
+                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_STRING, Some("Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols")),
+            },
+            LK::RawByteStr { err, .. } => match err {
+                None => RAW_BYTE_STRING,
+                Some(RawStrError::InvalidStarter { .. }) => return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal")),
+                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
+                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` to terminate the raw byte string literal"))
+                } else {
+                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))
+                },
+                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_BYTE_STRING, Some("Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols")),
+            },
+        };
+
+        (syntax_kind, None)
+    }
+}
diff --git a/crates/syntax/src/parsing/reparsing.rs b/crates/syntax/src/parsing/reparsing.rs
new file mode 100644
index 000000000..4149f856a
--- /dev/null
+++ b/crates/syntax/src/parsing/reparsing.rs
@@ -0,0 +1,455 @@
+//! Implementation of incremental re-parsing.
+//!
+//! We use two simple strategies for this:
+//! - if the edit modifies only a single token (like changing an identifier's
+//!   letter), we replace only this token.
+//! - otherwise, we search for the nearest `{}` block which contains the edit
+//!   and try to parse only this block.
+
+use parser::Reparser;
+use text_edit::Indel;
+
+use crate::{
+    algo,
+    parsing::{
+        lexer::{lex_single_syntax_kind, tokenize, Token},
+        text_token_source::TextTokenSource,
+        text_tree_sink::TextTreeSink,
+    },
+    syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode},
+    SyntaxError,
+    SyntaxKind::*,
+    TextRange, TextSize, T,
+};
+
+pub(crate) fn incremental_reparse(
+    node: &SyntaxNode,
+    edit: &Indel,
+    errors: Vec<SyntaxError>,
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
+    if let Some((green, new_errors, old_range)) = reparse_token(node, &edit) {
+        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
+    }
+
+    if let Some((green, new_errors, old_range)) = reparse_block(node, &edit) {
+        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
+    }
+    None
+}
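+
+// NOTE: An illustrative sketch added by the editor, not part of the original
+// commit. A caller holding a parsed file might drive incremental reparsing
+// like this (mirroring the `do_check` test helper below):
+//
+//     let before = crate::SourceFile::parse("fn foo() { 1 }");
+//     // Replace the `1` (bytes 11..12) with `92`.
+//     let edit = Indel::replace(TextRange::new(11.into(), 12.into()), "92".to_string());
+//     if let Some((green, errors, reparsed_range)) =
+//         incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec())
+//     {
+//         // `green` is the new tree root; `reparsed_range` is the part of the
+//         // *old* text that was relexed or reparsed instead of the whole file.
+//     }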
+
+fn reparse_token<'node>(
+    root: &'node SyntaxNode,
+    edit: &Indel,
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
+    let prev_token = algo::find_covering_element(root, edit.delete).as_token()?.clone();
+    let prev_token_kind = prev_token.kind();
+    match prev_token_kind {
+        WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => {
+            if prev_token_kind == WHITESPACE || prev_token_kind == COMMENT {
+                // removing a newline may extend the previous token
+                let deleted_range = edit.delete - prev_token.text_range().start();
+                if prev_token.text()[deleted_range].contains('\n') {
+                    return None;
+                }
+            }
+
+            let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit);
+            let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;
+
+            if new_token_kind != prev_token_kind
+                || (new_token_kind == IDENT && is_contextual_kw(&new_text))
+            {
+                return None;
+            }
+
+            // Check that the edited token is not a part of a bigger token.
+            // E.g. if for the source code `bruh"str"` the user removed `ruh`,
+            // then `b` no longer remains an identifier, but becomes part of a
+            // byte string literal.
+            if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) {
+                new_text.push(next_char);
+                let token_with_next_char = lex_single_syntax_kind(&new_text);
+                if let Some((_kind, _error)) = token_with_next_char {
+                    return None;
+                }
+                new_text.pop();
+            }
+
+            let new_token =
+                GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), new_text.into());
+            Some((
+                prev_token.replace_with(new_token),
+                new_err.into_iter().collect(),
+                prev_token.text_range(),
+            ))
+        }
+        _ => None,
+    }
+}
+
+fn reparse_block<'node>(
+    root: &'node SyntaxNode,
+    edit: &Indel,
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
+    let (node, reparser) = find_reparsable_node(root, edit.delete)?;
+    let text = get_text_after_edit(node.clone().into(), edit);
+
+    let (tokens, new_lexer_errors) = tokenize(&text);
+    if !is_balanced(&tokens) {
+        return None;
+    }
+
+    let mut token_source = TextTokenSource::new(&text, &tokens);
+    let mut tree_sink = TextTreeSink::new(&text, &tokens);
+    reparser.parse(&mut token_source, &mut tree_sink);
+
+    let (green, mut new_parser_errors) = tree_sink.finish();
+    new_parser_errors.extend(new_lexer_errors);
+
+    Some((node.replace_with(green), new_parser_errors, node.text_range()))
+}
+
+fn get_text_after_edit(element: SyntaxElement, edit: &Indel) -> String {
+    let edit = Indel::replace(edit.delete - element.text_range().start(), edit.insert.clone());
+
+    let mut text = match element {
+        NodeOrToken::Token(token) => token.text().to_string(),
+        NodeOrToken::Node(node) => node.text().to_string(),
+    };
+    edit.apply(&mut text);
+    text
+}
+
+fn is_contextual_kw(text: &str) -> bool {
+    matches!(text, "auto" | "default" | "union")
+}
+
+fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNode, Reparser)> {
+    let node = algo::find_covering_element(node, range);
+
+    let mut ancestors = match node {
+        NodeOrToken::Token(it) => it.parent().ancestors(),
+        NodeOrToken::Node(it) => it.ancestors(),
+    };
+    ancestors.find_map(|node| {
+        let first_child = node.first_child_or_token().map(|it| it.kind());
+        let parent = node.parent().map(|it| it.kind());
+        Reparser::for_node(node.kind(), first_child, parent).map(|r| (node, r))
+    })
+}
+
+fn is_balanced(tokens: &[Token]) -> bool {
+    if tokens.is_empty()
+        || tokens.first().unwrap().kind != T!['{']
+        || tokens.last().unwrap().kind != T!['}']
+    {
+        return false;
+    }
+    let mut balance = 0usize;
+    for t in &tokens[1..tokens.len() - 1] {
+        match t.kind {
+            T!['{'] => balance += 1,
+            T!['}'] => {
+                balance = match balance.checked_sub(1) {
+                    Some(b) => b,
+                    None => return false,
+                }
+            }
+            _ => (),
+        }
+    }
+    balance == 0
+}
+
+fn merge_errors(
+    old_errors: Vec<SyntaxError>,
+    new_errors: Vec<SyntaxError>,
+    range_before_reparse: TextRange,
+    edit: &Indel,
+) -> Vec<SyntaxError> {
+    let mut res = Vec::new();
+
+    for old_err in old_errors {
+        let old_err_range = old_err.range();
+        if old_err_range.end() <= range_before_reparse.start() {
+            res.push(old_err);
+        } else if old_err_range.start() >= range_before_reparse.end() {
+            let inserted_len = TextSize::of(&edit.insert);
+            res.push(old_err.with_range((old_err_range + inserted_len) - edit.delete.len()));
+            // Note: extra parens are intentional to prevent uint underflow, HWAB (here was a bug)
+        }
+    }
+    res.extend(new_errors.into_iter().map(|new_err| {
+        // fighting borrow checker with a variable ;)
+        let offseted_range = new_err.range() + range_before_reparse.start();
+        new_err.with_range(offseted_range)
+    }));
+    res
+}
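+
+// NOTE: An illustrative sketch added by the editor, not part of the original
+// commit. `merge_errors` shifts old errors that sit *after* the reparsed range
+// by the change in length:
+//
+//     new_range = (old_range + inserted_len) - deleted_len
+//
+// E.g. replacing a 1-byte range with "92" (2 bytes) moves a later error
+// forward by 2 - 1 = 1 byte; errors before the range are kept as-is, and
+// errors inside it are replaced by the freshly produced ones.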
+
+#[cfg(test)]
+mod tests {
+    use test_utils::{assert_eq_text, extract_range};
+
+    use super::*;
+    use crate::{AstNode, Parse, SourceFile};
+
+    fn do_check(before: &str, replace_with: &str, reparsed_len: u32) {
+        let (range, before) = extract_range(before);
+        let edit = Indel::replace(range, replace_with.to_owned());
+        let after = {
+            let mut after = before.clone();
+            edit.apply(&mut after);
+            after
+        };
+
+        let fully_reparsed = SourceFile::parse(&after);
+        let incrementally_reparsed: Parse<SourceFile> = {
+            let before = SourceFile::parse(&before);
+            let (green, new_errors, range) =
+                incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec()).unwrap();
+            assert_eq!(range.len(), reparsed_len.into(), "reparsed fragment has wrong length");
+            Parse::new(green, new_errors)
+        };
+
+        assert_eq_text!(
+            &format!("{:#?}", fully_reparsed.tree().syntax()),
+            &format!("{:#?}", incrementally_reparsed.tree().syntax()),
+        );
+        assert_eq!(fully_reparsed.errors(), incrementally_reparsed.errors());
+    }
+
+    #[test] // FIXME: some tests here actually test token reparsing
+    fn reparse_block_tests() {
+        do_check(
+            r"
+fn foo() {
+    let x = foo + <|>bar<|>
+}
+",
+            "baz",
+            3,
+        );
+        do_check(
+            r"
+fn foo() {
+    let x = foo<|> + bar<|>
+}
+",
+            "baz",
+            25,
+        );
+        do_check(
+            r"
+struct Foo {
+    f: foo<|><|>
+}
+",
+            ",\n    g: (),",
+            14,
+        );
+        do_check(
+            r"
+fn foo {
+    let;
+    1 + 1;
+    <|>92<|>;
+}
+",
+            "62",
+            31, // FIXME: reparse only int literal here
+        );
+        do_check(
+            r"
+mod foo {
+    fn <|><|>
+}
+",
+            "bar",
+            11,
+        );
+
+        do_check(
+            r"
+trait Foo {
+    type <|>Foo<|>;
+}
+",
+            "Output",
+            3,
+        );
+        do_check(
+            r"
+impl IntoIterator for Foo {
+    f<|><|>
+}
+",
+            "n next(",
+            9,
+        );
+        do_check(r"use a::b::{foo,<|>,bar<|>};", "baz", 10);
+        do_check(
+            r"
+pub enum A {
+    Foo<|><|>
+}
+",
+            "\nBar;\n",
+            11,
+        );
+        do_check(
+            r"
+foo!{a, b<|><|> d}
+",
+            ", c[3]",
+            8,
+        );
+        do_check(
+            r"
+fn foo() {
+    vec![<|><|>]
+}
+",
+            "123",
+            14,
+        );
+        do_check(
+            r"
+extern {
+    fn<|>;<|>
+}
+",
+            " exit(code: c_int)",
+            11,
+        );
+    }
+
+    #[test]
+    fn reparse_token_tests() {
+        do_check(
+            r"<|><|>
+fn foo() -> i32 { 1 }
+",
+            "\n\n\n   \n",
+            1,
+        );
+        do_check(
+            r"
+fn foo() -> <|><|> {}
+",
+            "  \n",
+            2,
+        );
+        do_check(
+            r"
+fn <|>foo<|>() -> i32 { 1 }
+",
+            "bar",
+            3,
+        );
+        do_check(
+            r"
+fn foo<|><|>foo() {  }
+",
+            "bar",
+            6,
+        );
+        do_check(
+            r"
+fn foo /* <|><|> */ () {}
+",
+            "some comment",
+            6,
+        );
+        do_check(
+            r"
+fn baz <|><|> () {}
+",
+            "    \t\t\n\n",
+            2,
+        );
+        do_check(
+            r"
+fn baz <|><|> () {}
+",
+            "    \t\t\n\n",
+            2,
+        );
+        do_check(
+            r"
+/// foo <|><|>omment
+mod { }
+",
+            "c",
+            14,
+        );
+        do_check(
+            r#"
+fn -> &str { "Hello<|><|>" }
+"#,
+            ", world",
+            7,
+        );
+        do_check(
+            r#"
+fn -> &str { // "Hello<|><|>"
+"#,
+            ", world",
+            10,
+        );
+        do_check(
+            r##"
+fn -> &str { r#"Hello<|><|>"#
+"##,
+            ", world",
+            10,
+        );
+        do_check(
+            r"
+#[derive(<|>Copy<|>)]
+enum Foo {
+
+}
+",
+            "Clone",
+            4,
+        );
+    }
+
+    #[test]
+    fn reparse_str_token_with_error_unchanged() {
+        do_check(r#""<|>Unclosed<|> string literal"#, "Still unclosed", 24);
+    }
+
+    #[test]
+    fn reparse_str_token_with_error_fixed() {
+        do_check(r#""unterinated<|><|>"#, "\"", 12);
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_unchanged() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            "23",
+            105,
+        )
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_fixed() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
";", + 105, + ) + } +} diff --git a/crates/syntax/src/parsing/text_token_source.rs b/crates/syntax/src/parsing/text_token_source.rs new file mode 100644 index 000000000..df866dc2b --- /dev/null +++ b/crates/syntax/src/parsing/text_token_source.rs @@ -0,0 +1,84 @@ +//! See `TextTokenSource` docs. + +use parser::TokenSource; + +use crate::{parsing::lexer::Token, SyntaxKind::EOF, TextRange, TextSize}; + +/// Implementation of `parser::TokenSource` that takes tokens from source code text. +pub(crate) struct TextTokenSource<'t> { + text: &'t str, + /// token and its start position (non-whitespace/comment tokens) + /// ```non-rust + /// struct Foo; + /// ^------^--^- + /// | | \________ + /// | \____ \ + /// | \ | + /// (struct, 0) (Foo, 7) (;, 10) + /// ``` + /// `[(struct, 0), (Foo, 7), (;, 10)]` + token_offset_pairs: Vec<(Token, TextSize)>, + + /// Current token and position + curr: (parser::Token, usize), +} + +impl<'t> TokenSource for TextTokenSource<'t> { + fn current(&self) -> parser::Token { + self.curr.0 + } + + fn lookahead_nth(&self, n: usize) -> parser::Token { + mk_token(self.curr.1 + n, &self.token_offset_pairs) + } + + fn bump(&mut self) { + if self.curr.0.kind == EOF { + return; + } + + let pos = self.curr.1 + 1; + self.curr = (mk_token(pos, &self.token_offset_pairs), pos); + } + + fn is_keyword(&self, kw: &str) -> bool { + self.token_offset_pairs + .get(self.curr.1) + .map(|(token, offset)| &self.text[TextRange::at(*offset, token.len)] == kw) + .unwrap_or(false) + } +} + +fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token { + let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) { + Some((token, offset)) => ( + token.kind, + token_offset_pairs + .get(pos + 1) + .map(|(_, next_offset)| offset + token.len == *next_offset) + .unwrap_or(false), + ), + None => (EOF, false), + }; + parser::Token { kind, is_jointed_to_next } +} + +impl<'t> TextTokenSource<'t> { + /// Generate input from tokens(expect comment and whitespace). + pub fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> { + let token_offset_pairs: Vec<_> = raw_tokens + .iter() + .filter_map({ + let mut len = 0.into(); + move |token| { + let pair = if token.kind.is_trivia() { None } else { Some((*token, len)) }; + len += token.len; + pair + } + }) + .collect(); + + let first = mk_token(0, &token_offset_pairs); + TextTokenSource { text, token_offset_pairs, curr: (first, 0) } + } +} diff --git a/crates/syntax/src/parsing/text_tree_sink.rs b/crates/syntax/src/parsing/text_tree_sink.rs new file mode 100644 index 000000000..c1b5f246d --- /dev/null +++ b/crates/syntax/src/parsing/text_tree_sink.rs @@ -0,0 +1,183 @@ +//! FIXME: write short doc here + +use std::mem; + +use parser::{ParseError, TreeSink}; + +use crate::{ + parsing::Token, + syntax_node::GreenNode, + SmolStr, SyntaxError, + SyntaxKind::{self, *}, + SyntaxTreeBuilder, TextRange, TextSize, +}; + +/// Bridges the parser with our specific syntax tree representation. +/// +/// `TextTreeSink` also handles attachment of trivia (whitespace) to nodes. 
diff --git a/crates/syntax/src/parsing/text_tree_sink.rs b/crates/syntax/src/parsing/text_tree_sink.rs
new file mode 100644
index 000000000..c1b5f246d
--- /dev/null
+++ b/crates/syntax/src/parsing/text_tree_sink.rs
@@ -0,0 +1,183 @@
+//! FIXME: write short doc here
+
+use std::mem;
+
+use parser::{ParseError, TreeSink};
+
+use crate::{
+    parsing::Token,
+    syntax_node::GreenNode,
+    SmolStr, SyntaxError,
+    SyntaxKind::{self, *},
+    SyntaxTreeBuilder, TextRange, TextSize,
+};
+
+/// Bridges the parser with our specific syntax tree representation.
+///
+/// `TextTreeSink` also handles attachment of trivia (whitespace) to nodes.
+pub(crate) struct TextTreeSink<'a> {
+    text: &'a str,
+    tokens: &'a [Token],
+    text_pos: TextSize,
+    token_pos: usize,
+    state: State,
+    inner: SyntaxTreeBuilder,
+}
+
+enum State {
+    PendingStart,
+    Normal,
+    PendingFinish,
+}
+
+impl<'a> TreeSink for TextTreeSink<'a> {
+    fn token(&mut self, kind: SyntaxKind, n_tokens: u8) {
+        match mem::replace(&mut self.state, State::Normal) {
+            State::PendingStart => unreachable!(),
+            State::PendingFinish => self.inner.finish_node(),
+            State::Normal => (),
+        }
+        self.eat_trivias();
+        let n_tokens = n_tokens as usize;
+        let len = self.tokens[self.token_pos..self.token_pos + n_tokens]
+            .iter()
+            .map(|it| it.len)
+            .sum::<TextSize>();
+        self.do_token(kind, len, n_tokens);
+    }
+
+    fn start_node(&mut self, kind: SyntaxKind) {
+        match mem::replace(&mut self.state, State::Normal) {
+            State::PendingStart => {
+                self.inner.start_node(kind);
+                // No need to attach trivias to previous node: there is no
+                // previous node.
+                return;
+            }
+            State::PendingFinish => self.inner.finish_node(),
+            State::Normal => (),
+        }
+
+        let n_trivias =
+            self.tokens[self.token_pos..].iter().take_while(|it| it.kind.is_trivia()).count();
+        let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias];
+        let mut trivia_end =
+            self.text_pos + leading_trivias.iter().map(|it| it.len).sum::<TextSize>();
+
+        let n_attached_trivias = {
+            let leading_trivias = leading_trivias.iter().rev().map(|it| {
+                let next_end = trivia_end - it.len;
+                let range = TextRange::new(next_end, trivia_end);
+                trivia_end = next_end;
+                (it.kind, &self.text[range])
+            });
+            n_attached_trivias(kind, leading_trivias)
+        };
+        self.eat_n_trivias(n_trivias - n_attached_trivias);
+        self.inner.start_node(kind);
+        self.eat_n_trivias(n_attached_trivias);
+    }
+
+    fn finish_node(&mut self) {
+        match mem::replace(&mut self.state, State::PendingFinish) {
+            State::PendingStart => unreachable!(),
+            State::PendingFinish => self.inner.finish_node(),
+            State::Normal => (),
+        }
+    }
+
+    fn error(&mut self, error: ParseError) {
+        self.inner.error(error, self.text_pos)
+    }
+}
+
+impl<'a> TextTreeSink<'a> {
+    pub(super) fn new(text: &'a str, tokens: &'a [Token]) -> Self {
+        Self {
+            text,
+            tokens,
+            text_pos: 0.into(),
+            token_pos: 0,
+            state: State::PendingStart,
+            inner: SyntaxTreeBuilder::default(),
+        }
+    }
+
+    pub(super) fn finish(mut self) -> (GreenNode, Vec<SyntaxError>) {
+        match mem::replace(&mut self.state, State::Normal) {
+            State::PendingFinish => {
+                self.eat_trivias();
+                self.inner.finish_node()
+            }
+            State::PendingStart | State::Normal => unreachable!(),
+        }
+
+        self.inner.finish_raw()
+    }
+
+    fn eat_trivias(&mut self) {
+        while let Some(&token) = self.tokens.get(self.token_pos) {
+            if !token.kind.is_trivia() {
+                break;
+            }
+            self.do_token(token.kind, token.len, 1);
+        }
+    }
+
+    fn eat_n_trivias(&mut self, n: usize) {
+        for _ in 0..n {
+            let token = self.tokens[self.token_pos];
+            assert!(token.kind.is_trivia());
+            self.do_token(token.kind, token.len, 1);
+        }
+    }
+
+    fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) {
+        let range = TextRange::at(self.text_pos, len);
+        let text: SmolStr = self.text[range].into();
+        self.text_pos += len;
+        self.token_pos += n_tokens;
+        self.inner.token(kind, text);
+    }
+}
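+
+// NOTE: An editorial comment, not part of the original commit. The function
+// below decides how many trailing trivia tokens "belong" to the next node, so
+// that for
+//
+//     /// Docs.
+//     fn f() {}
+//
+// the doc COMMENT becomes a child of the FN node rather than of its parent,
+// while a blank line (`\n\n`) between a plain comment and the item breaks the
+// attachment.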
+
+fn n_attached_trivias<'a>(
+    kind: SyntaxKind,
+    trivias: impl Iterator<Item = (SyntaxKind, &'a str)>,
+) -> usize {
+    match kind {
+        MACRO_CALL | CONST | TYPE_ALIAS | STRUCT | ENUM | VARIANT | FN | TRAIT | MODULE
+        | RECORD_FIELD | STATIC => {
+            let mut res = 0;
+            let mut trivias = trivias.enumerate().peekable();
+
+            while let Some((i, (kind, text))) = trivias.next() {
+                match kind {
+                    WHITESPACE => {
+                        if text.contains("\n\n") {
+                            // we check whether the next token is a doc-comment
+                            // and skip the whitespace in this case
+                            if let Some((peek_kind, peek_text)) =
+                                trivias.peek().map(|(_, pair)| pair)
+                            {
+                                if *peek_kind == COMMENT
+                                    && peek_text.starts_with("///")
+                                    && !peek_text.starts_with("////")
+                                {
+                                    continue;
+                                }
+                            }
+                            break;
+                        }
+                    }
+                    COMMENT => {
+                        res = i + 1;
+                    }
+                    _ => (),
+                }
+            }
+            res
+        }
+        _ => 0,
+    }
+}