Rename ra_syntax -> syntax

author: Aleksey Kladov <[email protected]> 2020-08-12 17:26:51 +0100
committer: Aleksey Kladov <[email protected]> 2020-08-12 17:30:53 +0100
commit: a1c187eef3ba08076aedb5154929f7eda8d1b424 (patch)
tree: 9d898eb9600b0c36a74e4f95238f679c683fa566 /crates/syntax/src/parsing
parent: 3d6889cba72a9d02199f7adaa2ecc69bc30af834 (diff)
4 files changed, 966 insertions, 0 deletions
diff --git a/crates/syntax/src/parsing/lexer.rs b/crates/syntax/src/parsing/lexer.rs
new file mode 100644
index 000000000..fa3be1016
--- /dev/null
+++ b/crates/syntax/src/parsing/lexer.rs
@@ -0,0 +1,244 @@
+//! Lexer analyzes raw input string and produces lexemes (tokens).
+//! It is just a bridge to `rustc_lexer`.
+use rustc_lexer::{LiteralKind as LK, RawStrError};
+use std::convert::TryInto;
+use crate::{
+    SyntaxError,
+    SyntaxKind::{self, *},
+    TextRange, TextSize, T,
+};
+/// A token of Rust source.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Token {
+    /// The kind of token.
+    pub kind: SyntaxKind,
+    /// The length of the token.
+    pub len: TextSize,
+}
+/// Break a string up into its component tokens.
+/// Beware that it checks for shebang first and its length contributes to resulting
+/// tokens offsets.
+pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
+    // non-empty string is a precondtion of `rustc_lexer::strip_shebang()`.
+    if text.is_empty() {
+        return Default::default();
+    }
+    let mut tokens = Vec::new();
+    let mut errors = Vec::new();
+    let mut offset = match rustc_lexer::strip_shebang(text) {
+        Some(shebang_len) => {
+            tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() });
+            shebang_len
+        }
+        None => 0,
+    };
+    let text_without_shebang = &text[offset..];
+    for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
+        let token_len: TextSize = rustc_token.len.try_into().unwrap();
+        let token_range = TextRange::at(offset.try_into().unwrap(), token_len);
+        let (syntax_kind, err_message) =
+            rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
+        tokens.push(Token { kind: syntax_kind, len: token_len });
+        if let Some(err_message) = err_message {
+            errors.push(SyntaxError::new(err_message, token_range));
+        }
+        offset += rustc_token.len;
+    }
+    (tokens, errors)
+}
+/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
+/// encountered at the beginning of the string.
+///
+/// Returns `None` if the string contains zero *or two or more* tokens.
+/// The token is malformed if the returned error is not `None`.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
+    lex_first_token(text)
+        .filter(|(token, _)| token.len == TextSize::of(text))
+        .map(|(token, error)| (token.kind, error))
+}
+/// The same as `lex_single_syntax_kind()` but returns only `SyntaxKind` and
+/// returns `None` if any tokenization error occured.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> {
+    lex_first_token(text)
+        .filter(|(token, error)| !error.is_some() && token.len == TextSize::of(text))
+        .map(|(token, _error)| token.kind)
+}
+/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
+/// encountered at the beginning of the string.
+///
+/// Returns `None` if the string contains zero tokens or if the token was parsed
+/// with an error.
+/// The token is malformed if the returned error is not `None`.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
+    // non-empty string is a precondtion of `rustc_lexer::first_token()`.
+    if text.is_empty() {
+        return None;
+    }
+    let rustc_token = rustc_lexer::first_token(text);
+    let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
+    let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() };
+    let optional_error = err_message
+        .map(|err_message| SyntaxError::new(err_message, TextRange::up_to(TextSize::of(text))));
+    Some((token, optional_error))
+}
+/// Returns `SyntaxKind` and an optional tokenize error message.
+fn rustc_token_kind_to_syntax_kind(
+    rustc_token_kind: &rustc_lexer::TokenKind,
+    token_text: &str,
+) -> (SyntaxKind, Option<&'static str>) {
+    // A note on an intended tradeoff:
+    // We drop some useful infromation here (see patterns with double dots `..`)
+    // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
+    // being `u16` that come from `rowan::SyntaxKind`.
+    let syntax_kind = {
+        match rustc_token_kind {
+            rustc_lexer::TokenKind::LineComment => COMMENT,
+            rustc_lexer::TokenKind::BlockComment { terminated: true } => COMMENT,
+            rustc_lexer::TokenKind::BlockComment { terminated: false } => {
+                return (
+                    COMMENT,
+                    Some("Missing trailing `*/` symbols to terminate the block comment"),
+                );
+            }
+            rustc_lexer::TokenKind::Whitespace => WHITESPACE,
+            rustc_lexer::TokenKind::Ident => {
+                if token_text == "_" {
+                    UNDERSCORE
+                } else {
+                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
+                }
+            }
+            rustc_lexer::TokenKind::RawIdent => IDENT,
+            rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(&kind),
+            rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME,
+            rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => {
+                return (LIFETIME, Some("Lifetime name cannot start with a number"))
+            }
+            rustc_lexer::TokenKind::Semi => T![;],
+            rustc_lexer::TokenKind::Comma => T![,],
+            rustc_lexer::TokenKind::Dot => T![.],
+            rustc_lexer::TokenKind::OpenParen => T!['('],
+            rustc_lexer::TokenKind::CloseParen => T![')'],
+            rustc_lexer::TokenKind::OpenBrace => T!['{'],
+            rustc_lexer::TokenKind::CloseBrace => T!['}'],
+            rustc_lexer::TokenKind::OpenBracket => T!['['],
+            rustc_lexer::TokenKind::CloseBracket => T![']'],
+            rustc_lexer::TokenKind::At => T![@],
+            rustc_lexer::TokenKind::Pound => T![#],
+            rustc_lexer::TokenKind::Tilde => T![~],
+            rustc_lexer::TokenKind::Question => T![?],
+            rustc_lexer::TokenKind::Colon => T![:],
+            rustc_lexer::TokenKind::Dollar => T![$],
+            rustc_lexer::TokenKind::Eq => T![=],
+            rustc_lexer::TokenKind::Not => T![!],
+            rustc_lexer::TokenKind::Lt => T![<],
+            rustc_lexer::TokenKind::Gt => T![>],
+            rustc_lexer::TokenKind::Minus => T![-],
+            rustc_lexer::TokenKind::And => T![&],
+            rustc_lexer::TokenKind::Or => T![|],
+            rustc_lexer::TokenKind::Plus => T![+],
+            rustc_lexer::TokenKind::Star => T![*],
+            rustc_lexer::TokenKind::Slash => T![/],
+            rustc_lexer::TokenKind::Caret => T![^],
+            rustc_lexer::TokenKind::Percent => T![%],
+            rustc_lexer::TokenKind::Unknown => ERROR,
+        }
+    };
+    return (syntax_kind, None);
+    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
+        #[rustfmt::skip]
+        let syntax_kind = match *kind {
+            LK::Int { empty_int: false, .. } => INT_NUMBER,
+            LK::Int { empty_int: true, .. } => {
+                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
+            }
+            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
+            LK::Float { empty_exponent: true, .. } => {
+                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
+            }
+            LK::Char { terminated: true } => CHAR,
+            LK::Char { terminated: false } => {
+                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
+            }
+            LK::Byte { terminated: true } => BYTE,
+            LK::Byte { terminated: false } => {
+                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
+            }
+            LK::Str { terminated: true } => STRING,
+            LK::Str { terminated: false } => {
+                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
+            }
+            LK::ByteStr { terminated: true } => BYTE_STRING,
+            LK::ByteStr { terminated: false } => {
+                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
+            }
+            LK::RawStr { err, .. } => match err {
+                None => RAW_STRING,
+                Some(RawStrError::InvalidStarter { .. }) => return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal")),
+                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
+                    return (RAW_STRING, Some("Missing trailing `\"` to terminate the raw string literal"))
+                } else {
+                    return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))
+                },
+                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_STRING, Some("Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols")),
+            },
+            LK::RawByteStr { err, .. } => match err {
+                None => RAW_BYTE_STRING,
+                Some(RawStrError::InvalidStarter { .. }) => return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal")),
+                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
+                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` to terminate the raw byte string literal"))
+                } else {
+                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))
+                },
+                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_BYTE_STRING, Some("Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols")),
+            },
+        };
+        (syntax_kind, None)
+    }
+}
diff --git a/crates/syntax/src/parsing/reparsing.rs b/crates/syntax/src/parsing/reparsing.rs
new file mode 100644
index 000000000..4149f856a
--- /dev/null
+++ b/crates/syntax/src/parsing/reparsing.rs
@@ -0,0 +1,455 @@
+//! Implementation of incremental re-parsing.
+//!
+//! We use two simple strategies for this:
+//!   - if the edit modifies only a single token (like changing an identifier's
+//!     letter), we replace only this token.
+//!   - otherwise, we search for the nearest `{}` block which contains the edit
+//!     and try to parse only this block.
+use parser::Reparser;
+use text_edit::Indel;
+use crate::{
+    algo,
+    parsing::{
+        lexer::{lex_single_syntax_kind, tokenize, Token},
+        text_token_source::TextTokenSource,
+        text_tree_sink::TextTreeSink,
+    },
+    syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode},
+    SyntaxError,
+    SyntaxKind::*,
+    TextRange, TextSize, T,
+};
+pub(crate) fn incremental_reparse(
+    node: &SyntaxNode,
+    edit: &Indel,
+    errors: Vec<SyntaxError>,
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
+    if let Some((green, new_errors, old_range)) = reparse_token(node, &edit) {
+        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
+    }
+    if let Some((green, new_errors, old_range)) = reparse_block(node, &edit) {
+        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
+    }
+    None
+}
+fn reparse_token<'node>(
+    root: &'node SyntaxNode,
+    edit: &Indel,
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
+    let prev_token = algo::find_covering_element(root, edit.delete).as_token()?.clone();
+    let prev_token_kind = prev_token.kind();
+    match prev_token_kind {
+        WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => {
+            if prev_token_kind == WHITESPACE || prev_token_kind == COMMENT {
+                // removing a new line may extends previous token
+                let deleted_range = edit.delete - prev_token.text_range().start();
+                if prev_token.text()[deleted_range].contains('\n') {
+                    return None;
+                }
+            }
+            let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit);
+            let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;
+            if new_token_kind != prev_token_kind
+                || (new_token_kind == IDENT && is_contextual_kw(&new_text))
+            {
+                return None;
+            }
+            // Check that edited token is not a part of the bigger token.
+            // E.g. if for source code `bruh"str"` the user removed `ruh`, then
+            // `b` no longer remains an identifier, but becomes a part of byte string literal
+            if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) {
+                new_text.push(next_char);
+                let token_with_next_char = lex_single_syntax_kind(&new_text);
+                if let Some((_kind, _error)) = token_with_next_char {
+                    return None;
+                }
+                new_text.pop();
+            }
+            let new_token =
+                GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), new_text.into());
+            Some((
+                prev_token.replace_with(new_token),
+                new_err.into_iter().collect(),
+                prev_token.text_range(),
+            ))
+        }
+        _ => None,
+    }
+}
+fn reparse_block<'node>(
+    root: &'node SyntaxNode,
+    edit: &Indel,
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
+    let (node, reparser) = find_reparsable_node(root, edit.delete)?;
+    let text = get_text_after_edit(node.clone().into(), edit);
+    let (tokens, new_lexer_errors) = tokenize(&text);
+    if !is_balanced(&tokens) {
+        return None;
+    }
+    let mut token_source = TextTokenSource::new(&text, &tokens);
+    let mut tree_sink = TextTreeSink::new(&text, &tokens);
+    reparser.parse(&mut token_source, &mut tree_sink);
+    let (green, mut new_parser_errors) = tree_sink.finish();
+    new_parser_errors.extend(new_lexer_errors);
+    Some((node.replace_with(green), new_parser_errors, node.text_range()))
+}
+fn get_text_after_edit(element: SyntaxElement, edit: &Indel) -> String {
+    let edit = Indel::replace(edit.delete - element.text_range().start(), edit.insert.clone());
+    let mut text = match element {
+        NodeOrToken::Token(token) => token.text().to_string(),
+        NodeOrToken::Node(node) => node.text().to_string(),
+    };
+    edit.apply(&mut text);
+    text
+}
+fn is_contextual_kw(text: &str) -> bool {
+    matches!(text, "auto" | "default" | "union")
+}
+fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNode, Reparser)> {
+    let node = algo::find_covering_element(node, range);
+    let mut ancestors = match node {
+        NodeOrToken::Token(it) => it.parent().ancestors(),
+        NodeOrToken::Node(it) => it.ancestors(),
+    };
+    ancestors.find_map(|node| {
+        let first_child = node.first_child_or_token().map(|it| it.kind());
+        let parent = node.parent().map(|it| it.kind());
+        Reparser::for_node(node.kind(), first_child, parent).map(|r| (node, r))
+    })
+}
+fn is_balanced(tokens: &[Token]) -> bool {
+    if tokens.is_empty()
+        || tokens.first().unwrap().kind != T!['{']
+        || tokens.last().unwrap().kind != T!['}']
+    {
+        return false;
+    }
+    let mut balance = 0usize;
+    for t in &tokens[1..tokens.len() - 1] {
+        match t.kind {
+            T!['{'] => balance += 1,
+            T!['}'] => {
+                balance = match balance.checked_sub(1) {
+                    Some(b) => b,
+                    None => return false,
+                }
+            }
+            _ => (),
+        }
+    }
+    balance == 0
+}
+fn merge_errors(
+    old_errors: Vec<SyntaxError>,
+    new_errors: Vec<SyntaxError>,
+    range_before_reparse: TextRange,
+    edit: &Indel,
+) -> Vec<SyntaxError> {
+    let mut res = Vec::new();
+    for old_err in old_errors {
+        let old_err_range = old_err.range();
+        if old_err_range.end() <= range_before_reparse.start() {
+            res.push(old_err);
+        } else if old_err_range.start() >= range_before_reparse.end() {
+            let inserted_len = TextSize::of(&edit.insert);
+            res.push(old_err.with_range((old_err_range + inserted_len) - edit.delete.len()));
+            // Note: extra parens are intentional to prevent uint underflow, HWAB (here was a bug)
+        }
+    }
+    res.extend(new_errors.into_iter().map(|new_err| {
+        // fighting borrow checker with a variable ;)
+        let offseted_range = new_err.range() + range_before_reparse.start();
+        new_err.with_range(offseted_range)
+    }));
+    res
+}
+#[cfg(test)]
+mod tests {
+    use test_utils::{assert_eq_text, extract_range};
+    use super::*;
+    use crate::{AstNode, Parse, SourceFile};
+    fn do_check(before: &str, replace_with: &str, reparsed_len: u32) {
+        let (range, before) = extract_range(before);
+        let edit = Indel::replace(range, replace_with.to_owned());
+        let after = {
+            let mut after = before.clone();
+            edit.apply(&mut after);
+            after
+        };
+        let fully_reparsed = SourceFile::parse(&after);
+        let incrementally_reparsed: Parse<SourceFile> = {
+            let before = SourceFile::parse(&before);
+            let (green, new_errors, range) =
+                incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec()).unwrap();
+            assert_eq!(range.len(), reparsed_len.into(), "reparsed fragment has wrong length");
+            Parse::new(green, new_errors)
+        };
+        assert_eq_text!(
+            &format!("{:#?}", fully_reparsed.tree().syntax()),
+            &format!("{:#?}", incrementally_reparsed.tree().syntax()),
+        );
+        assert_eq!(fully_reparsed.errors(), incrementally_reparsed.errors());
+    }
+    #[test] // FIXME: some test here actually test token reparsing
+    fn reparse_block_tests() {
+        do_check(
+            r"
+fn foo() {
+    let x = foo + <|>bar<|>
+}
+",
+            "baz",
+            3,
+        );
+        do_check(
+            r"
+fn foo() {
+    let x = foo<|> + bar<|>
+}
+",
+            "baz",
+            25,
+        );
+        do_check(
+            r"
+struct Foo {
+    f: foo<|><|>
+}
+",
+            ",\n    g: (),",
+            14,
+        );
+        do_check(
+            r"
+fn foo {
+    let;
+    1 + 1;
+    <|>92<|>;
+}
+",
+            "62",
+            31, // FIXME: reparse only int literal here
+        );
+        do_check(
+            r"
+mod foo {
+    fn <|><|>
+}
+",
+            "bar",
+            11,
+        );
+        do_check(
+            r"
+trait Foo {
+    type <|>Foo<|>;
+}
+",
+            "Output",
+            3,
+        );
+        do_check(
+            r"
+impl IntoIterator<Item=i32> for Foo {
+    f<|><|>
+}
+",
+            "n next(",
+            9,
+        );
+        do_check(r"use a::b::{foo,<|>,bar<|>};", "baz", 10);
+        do_check(
+            r"
+pub enum A {
+    Foo<|><|>
+}
+",
+            "\nBar;\n",
+            11,
+        );
+        do_check(
+            r"
+foo!{a, b<|><|> d}
+",
+            ", c[3]",
+            8,
+        );
+        do_check(
+            r"
+fn foo() {
+    vec![<|><|>]
+}
+",
+            "123",
+            14,
+        );
+        do_check(
+            r"
+extern {
+    fn<|>;<|>
+}
+",
+            " exit(code: c_int)",
+            11,
+        );
+    }
+    #[test]
+    fn reparse_token_tests() {
+        do_check(
+            r"<|><|>
+fn foo() -> i32 { 1 }
+",
+            "\n\n\n   \n",
+            1,
+        );
+        do_check(
+            r"
+fn foo() -> <|><|> {}
+",
+            "  \n",
+            2,
+        );
+        do_check(
+            r"
+fn <|>foo<|>() -> i32 { 1 }
+",
+            "bar",
+            3,
+        );
+        do_check(
+            r"
+fn foo<|><|>foo() {  }
+",
+            "bar",
+            6,
+        );
+        do_check(
+            r"
+fn foo /* <|><|> */ () {}
+",
+            "some comment",
+            6,
+        );
+        do_check(
+            r"
+fn baz <|><|> () {}
+",
+            "    \t\t\n\n",
+            2,
+        );
+        do_check(
+            r"
+fn baz <|><|> () {}
+",
+            "    \t\t\n\n",
+            2,
+        );
+        do_check(
+            r"
+/// foo <|><|>omment
+mod { }
+",
+            "c",
+            14,
+        );
+        do_check(
+            r#"
+fn -> &str { "Hello<|><|>" }
+"#,
+            ", world",
+            7,
+        );
+        do_check(
+            r#"
+fn -> &str { // "Hello<|><|>"
+"#,
+            ", world",
+            10,
+        );
+        do_check(
+            r##"
+fn -> &str { r#"Hello<|><|>"#
+"##,
+            ", world",
+            10,
+        );
+        do_check(
+            r"
+#[derive(<|>Copy<|>)]
+enum Foo {
+}
+",
+            "Clone",
+            4,
+        );
+    }
+    #[test]
+    fn reparse_str_token_with_error_unchanged() {
+        do_check(r#""<|>Unclosed<|> string literal"#, "Still unclosed", 24);
+    }
+    #[test]
+    fn reparse_str_token_with_error_fixed() {
+        do_check(r#""unterinated<|><|>"#, "\"", 12);
+    }
+    #[test]
+    fn reparse_block_with_error_in_middle_unchanged() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            "23",
+            105,
+        )
+    }
+    #[test]
+    fn reparse_block_with_error_in_middle_fixed() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            ";",
+            105,
+        )
+    }
+}
diff --git a/crates/syntax/src/parsing/text_token_source.rs b/crates/syntax/src/parsing/text_token_source.rs
new file mode 100644
index 000000000..df866dc2b
--- /dev/null
+++ b/crates/syntax/src/parsing/text_token_source.rs
@@ -0,0 +1,84 @@
+//! See `TextTokenSource` docs.
+use parser::TokenSource;
+use crate::{parsing::lexer::Token, SyntaxKind::EOF, TextRange, TextSize};
+/// Implementation of `parser::TokenSource` that takes tokens from source code text.
+pub(crate) struct TextTokenSource<'t> {
+    text: &'t str,
+    /// token and its start position (non-whitespace/comment tokens)
+    /// ```non-rust
+    ///  struct Foo;
+    ///  ^------^--^-
+    ///  |      |    \________
+    ///  |      \____         \
+    ///  |           \         |
+    ///  (struct, 0) (Foo, 7) (;, 10)
+    /// ```
+    /// `[(struct, 0), (Foo, 7), (;, 10)]`
+    token_offset_pairs: Vec<(Token, TextSize)>,
+    /// Current token and position
+    curr: (parser::Token, usize),
+}
+impl<'t> TokenSource for TextTokenSource<'t> {
+    fn current(&self) -> parser::Token {
+        self.curr.0
+    }
+    fn lookahead_nth(&self, n: usize) -> parser::Token {
+        mk_token(self.curr.1 + n, &self.token_offset_pairs)
+    }
+    fn bump(&mut self) {
+        if self.curr.0.kind == EOF {
+            return;
+        }
+        let pos = self.curr.1 + 1;
+        self.curr = (mk_token(pos, &self.token_offset_pairs), pos);
+    }
+    fn is_keyword(&self, kw: &str) -> bool {
+        self.token_offset_pairs
+            .get(self.curr.1)
+            .map(|(token, offset)| &self.text[TextRange::at(*offset, token.len)] == kw)
+            .unwrap_or(false)
+    }
+}
+fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token {
+    let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) {
+        Some((token, offset)) => (
+            token.kind,
+            token_offset_pairs
+                .get(pos + 1)
+                .map(|(_, next_offset)| offset + token.len == *next_offset)
+                .unwrap_or(false),
+        ),
+        None => (EOF, false),
+    };
+    parser::Token { kind, is_jointed_to_next }
+}
+impl<'t> TextTokenSource<'t> {
+    /// Generate input from tokens(expect comment and whitespace).
+    pub fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> {
+        let token_offset_pairs: Vec<_> = raw_tokens
+            .iter()
+            .filter_map({
+                let mut len = 0.into();
+                move |token| {
+                    let pair = if token.kind.is_trivia() { None } else { Some((*token, len)) };
+                    len += token.len;
+                    pair
+                }
+            })
+            .collect();
+        let first = mk_token(0, &token_offset_pairs);
+        TextTokenSource { text, token_offset_pairs, curr: (first, 0) }
+    }
+}
diff --git a/crates/syntax/src/parsing/text_tree_sink.rs b/crates/syntax/src/parsing/text_tree_sink.rs
new file mode 100644
index 000000000..c1b5f246d
--- /dev/null
+++ b/crates/syntax/src/parsing/text_tree_sink.rs
@@ -0,0 +1,183 @@
+//! FIXME: write short doc here
+use std::mem;
+use parser::{ParseError, TreeSink};
+use crate::{
+    parsing::Token,
+    syntax_node::GreenNode,
+    SmolStr, SyntaxError,
+    SyntaxKind::{self, *},
+    SyntaxTreeBuilder, TextRange, TextSize,
+};
+/// Bridges the parser with our specific syntax tree representation.
+///
+/// `TextTreeSink` also handles attachment of trivia (whitespace) to nodes.
+pub(crate) struct TextTreeSink<'a> {
+    text: &'a str,
+    tokens: &'a [Token],
+    text_pos: TextSize,
+    token_pos: usize,
+    state: State,
+    inner: SyntaxTreeBuilder,
+}
+enum State {
+    PendingStart,
+    Normal,
+    PendingFinish,
+}
+impl<'a> TreeSink for TextTreeSink<'a> {
+    fn token(&mut self, kind: SyntaxKind, n_tokens: u8) {
+        match mem::replace(&mut self.state, State::Normal) {
+            State::PendingStart => unreachable!(),
+            State::PendingFinish => self.inner.finish_node(),
+            State::Normal => (),
+        }
+        self.eat_trivias();
+        let n_tokens = n_tokens as usize;
+        let len = self.tokens[self.token_pos..self.token_pos + n_tokens]
+            .iter()
+            .map(|it| it.len)
+            .sum::<TextSize>();
+        self.do_token(kind, len, n_tokens);
+    }
+    fn start_node(&mut self, kind: SyntaxKind) {
+        match mem::replace(&mut self.state, State::Normal) {
+            State::PendingStart => {
+                self.inner.start_node(kind);
+                // No need to attach trivias to previous node: there is no
+                // previous node.
+                return;
+            }
+            State::PendingFinish => self.inner.finish_node(),
+            State::Normal => (),
+        }
+        let n_trivias =
+            self.tokens[self.token_pos..].iter().take_while(|it| it.kind.is_trivia()).count();
+        let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias];
+        let mut trivia_end =
+            self.text_pos + leading_trivias.iter().map(|it| it.len).sum::<TextSize>();
+        let n_attached_trivias = {
+            let leading_trivias = leading_trivias.iter().rev().map(|it| {
+                let next_end = trivia_end - it.len;
+                let range = TextRange::new(next_end, trivia_end);
+                trivia_end = next_end;
+                (it.kind, &self.text[range])
+            });
+            n_attached_trivias(kind, leading_trivias)
+        };
+        self.eat_n_trivias(n_trivias - n_attached_trivias);
+        self.inner.start_node(kind);
+        self.eat_n_trivias(n_attached_trivias);
+    }
+    fn finish_node(&mut self) {
+        match mem::replace(&mut self.state, State::PendingFinish) {
+            State::PendingStart => unreachable!(),
+            State::PendingFinish => self.inner.finish_node(),
+            State::Normal => (),
+        }
+    }
+    fn error(&mut self, error: ParseError) {
+        self.inner.error(error, self.text_pos)
+    }
+}
+impl<'a> TextTreeSink<'a> {
+    pub(super) fn new(text: &'a str, tokens: &'a [Token]) -> Self {
+        Self {
+            text,
+            tokens,
+            text_pos: 0.into(),
+            token_pos: 0,
+            state: State::PendingStart,
+            inner: SyntaxTreeBuilder::default(),
+        }
+    }
+    pub(super) fn finish(mut self) -> (GreenNode, Vec<SyntaxError>) {
+        match mem::replace(&mut self.state, State::Normal) {
+            State::PendingFinish => {
+                self.eat_trivias();
+                self.inner.finish_node()
+            }
+            State::PendingStart | State::Normal => unreachable!(),
+        }
+        self.inner.finish_raw()
+    }
+    fn eat_trivias(&mut self) {
+        while let Some(&token) = self.tokens.get(self.token_pos) {
+            if !token.kind.is_trivia() {
+                break;
+            }
+            self.do_token(token.kind, token.len, 1);
+        }
+    }
+    fn eat_n_trivias(&mut self, n: usize) {
+        for _ in 0..n {
+            let token = self.tokens[self.token_pos];
+            assert!(token.kind.is_trivia());
+            self.do_token(token.kind, token.len, 1);
+        }
+    }
+    fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) {
+        let range = TextRange::at(self.text_pos, len);
+        let text: SmolStr = self.text[range].into();
+        self.text_pos += len;
+        self.token_pos += n_tokens;
+        self.inner.token(kind, text);
+    }
+}
+fn n_attached_trivias<'a>(
+    kind: SyntaxKind,
+    trivias: impl Iterator<Item = (SyntaxKind, &'a str)>,
+) -> usize {
+    match kind {
+        MACRO_CALL | CONST | TYPE_ALIAS | STRUCT | ENUM | VARIANT | FN | TRAIT | MODULE
+        | RECORD_FIELD | STATIC => {
+            let mut res = 0;
+            let mut trivias = trivias.enumerate().peekable();
+            while let Some((i, (kind, text))) = trivias.next() {
+                match kind {
+                    WHITESPACE => {
+                        if text.contains("\n\n") {
+                            // we check whether the next token is a doc-comment
+                            // and skip the whitespace in this case
+                            if let Some((peek_kind, peek_text)) =
+                                trivias.peek().map(|(_, pair)| pair)
+                            {
+                                if *peek_kind == COMMENT
+                                    && peek_text.starts_with("///")
+                                    && !peek_text.starts_with("////")
+                                {
+                                    continue;
+                                }
+                            }
+                            break;
+                        }
+                    }
+                    COMMENT => {
+                        res = i + 1;
+                    }
+                    _ => (),
+                }
+            }
+            res
+        }
+        _ => 0,
+    }
+}
author	Aleksey Kladov <[email protected]>	2020-08-12 17:26:51 +0100
committer	Aleksey Kladov <[email protected]>	2020-08-12 17:30:53 +0100
commit	a1c187eef3ba08076aedb5154929f7eda8d1b424 (patch)
tree	9d898eb9600b0c36a74e4f95238f679c683fa566 /crates/syntax/src/parsing
parent	3d6889cba72a9d02199f7adaa2ecc69bc30af834 (diff)