author    Veetaha <[email protected]>  2020-01-28 05:09:13 +0000
committer Veetaha <[email protected]>  2020-02-03 22:00:55 +0000
commit    9e7eaa959f9dc368a55f1a80b35651b78b3d0883
tree      4b1f4af14d9898301949fa937219006d671a72ef /crates/ra_syntax/src/parsing
parent    bf60661aa3e2a77fedb3e1627675842d05538860
ra_syntax: refactored the lexer design as per @matklad and @kiljacken PR review
Diffstat (limited to 'crates/ra_syntax/src/parsing')
-rw-r--r--  crates/ra_syntax/src/parsing/lexer.rs          | 313
-rw-r--r--  crates/ra_syntax/src/parsing/reparsing.rs      |  25
-rw-r--r--  crates/ra_syntax/src/parsing/text_tree_sink.rs |   4
3 files changed, 180 insertions, 162 deletions
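In short, the refactor replaces the `ParsedToken`/`ParsedTokens` wrapper structs with plain tuples and splits the single-token helpers into more task-specific entry points. A signature-level summary of the lexer API before and after, reconstructed from the hunks below (signatures only, not a compilable listing):

    // Before:
    pub struct ParsedToken { pub token: Token, pub error: Option<SyntaxError> }
    pub struct ParsedTokens { pub tokens: Vec<Token>, pub errors: Vec<SyntaxError> }
    pub fn tokenize(text: &str) -> ParsedTokens
    pub fn tokenize_append(text: &str, parsed: &mut ParsedTokens)
    pub fn first_token(text: &str) -> Option<ParsedToken>
    pub fn single_token(text: &str) -> Option<ParsedToken>

    // After:
    pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>)
    pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)>
    pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind>
    fn first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> // now private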
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index bf6b4d637..55755be18 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -16,55 +16,21 @@ pub struct Token {
16 | pub len: TextUnit, | 16 | pub len: TextUnit, |
17 | } | 17 | } |
18 | 18 | ||
19 | /// Represents the result of parsing one token. Beware that the token may be malformed. | ||
20 | #[derive(Debug)] | ||
21 | pub struct ParsedToken { | ||
22 | /// Parsed token. | ||
23 | pub token: Token, | ||
24 | /// If error is present then parsed token is malformed. | ||
25 | pub error: Option<SyntaxError>, | ||
26 | } | ||
27 | |||
28 | #[derive(Debug, Default)] | ||
29 | /// Represents the result of parsing source code of the Rust language. | ||
30 | pub struct ParsedTokens { | ||
31 | /// Parsed tokens in order they appear in source code. | ||
32 | pub tokens: Vec<Token>, | ||
33 | /// Collection of all the tokenization errors that occurred. | ||
34 | /// In general `self.errors.len() <= self.tokens.len()` | ||
35 | pub errors: Vec<SyntaxError>, | ||
36 | } | ||
37 | impl ParsedTokens { | ||
38 | /// Append `token` and `error` (if present) to the result. | ||
39 | pub fn push(&mut self, ParsedToken { token, error }: ParsedToken) { | ||
40 | self.tokens.push(token); | ||
41 | if let Some(error) = error { | ||
42 | self.errors.push(error) | ||
43 | } | ||
44 | } | ||
45 | } | ||
46 | |||
47 | /// Same as `tokenize_append()`, just a shortcut for creating `ParsedTokens` | ||
48 | /// and returning the result the usual way. | ||
49 | pub fn tokenize(text: &str) -> ParsedTokens { | ||
50 | let mut parsed = ParsedTokens::default(); | ||
51 | tokenize_append(text, &mut parsed); | ||
52 | parsed | ||
53 | } | ||
54 | |||
55 | /// Break a string up into its component tokens. | 19 | /// Break a string up into its component tokens. |
56 | /// Writes to `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<SyntaxError>)`. | ||
57 | /// Beware that it checks for a shebang first and its length contributes to the resulting | 20 | /// Beware that it checks for a shebang first and its length contributes to the resulting |
58 | /// tokens' offsets. | 21 | /// tokens' offsets. |
59 | pub fn tokenize_append(text: &str, parsed: &mut ParsedTokens) { | 22 | pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) { |
60 | // non-empty string is a precondition of `rustc_lexer::strip_shebang()`. | 23 | // non-empty string is a precondition of `rustc_lexer::strip_shebang()`. |
61 | if text.is_empty() { | 24 | if text.is_empty() { |
62 | return; | 25 | return Default::default(); |
63 | } | 26 | } |
64 | 27 | ||
28 | let mut tokens = Vec::new(); | ||
29 | let mut errors = Vec::new(); | ||
30 | |||
65 | let mut offset: usize = rustc_lexer::strip_shebang(text) | 31 | let mut offset: usize = rustc_lexer::strip_shebang(text) |
66 | .map(|shebang_len| { | 32 | .map(|shebang_len| { |
67 | parsed.tokens.push(Token { kind: SHEBANG, len: TextUnit::from_usize(shebang_len) }); | 33 | tokens.push(Token { kind: SHEBANG, len: TextUnit::from_usize(shebang_len) }); |
68 | shebang_len | 34 | shebang_len |
69 | }) | 35 | }) |
70 | .unwrap_or(0); | 36 | .unwrap_or(0); |
@@ -72,35 +38,76 @@ pub fn tokenize_append(text: &str, parsed: &mut ParsedTokens) {
72 | let text_without_shebang = &text[offset..]; | 38 | let text_without_shebang = &text[offset..]; |
73 | 39 | ||
74 | for rustc_token in rustc_lexer::tokenize(text_without_shebang) { | 40 | for rustc_token in rustc_lexer::tokenize(text_without_shebang) { |
75 | parsed.push(rustc_token_to_parsed_token(&rustc_token, text, TextUnit::from_usize(offset))); | 41 | let token_len = TextUnit::from_usize(rustc_token.len); |
42 | let token_range = TextRange::offset_len(TextUnit::from_usize(offset), token_len); | ||
43 | |||
44 | let (syntax_kind, error) = | ||
45 | rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]); | ||
46 | |||
47 | tokens.push(Token { kind: syntax_kind, len: token_len }); | ||
48 | |||
49 | if let Some(error) = error { | ||
50 | errors.push(SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range)); | ||
51 | } | ||
52 | |||
76 | offset += rustc_token.len; | 53 | offset += rustc_token.len; |
77 | } | 54 | } |
55 | |||
56 | (tokens, errors) | ||
78 | } | 57 | } |
79 | 58 | ||
80 | /// Returns the first encountered token at the beginning of the string. | 59 | /// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token |
81 | /// If the string contains zero or *two or more tokens* returns `None`. | 60 | /// encountered at the beginning of the string. |
61 | /// | ||
62 | /// Returns `None` if the string contains zero *or two or more* tokens. | ||
63 | /// The token is malformed if the returned error is not `None`. | ||
64 | /// | ||
65 | /// Beware that unescape errors are not checked at tokenization time. | ||
66 | pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> { | ||
67 | first_token(text) | ||
68 | .filter(|(token, _)| token.len.to_usize() == text.len()) | ||
69 | .map(|(token, error)| (token.kind, error)) | ||
70 | } | ||
71 | |||
72 | /// The same as `lex_single_syntax_kind()` but returns only `SyntaxKind` and | ||
73 | /// returns `None` if any tokenization error occurred. | ||
82 | /// | 74 | /// |
83 | /// The main difference between `first_token()` and `single_token()` is that | 75 | /// Beware that unescape errors are not checked at tokenization time. |
84 | /// the latter returns `None` if the string contains more than one token. | 76 | pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> { |
85 | pub fn single_token(text: &str) -> Option<ParsedToken> { | 77 | first_token(text) |
86 | first_token(text).filter(|parsed| parsed.token.len.to_usize() == text.len()) | 78 | .filter(|(token, error)| error.is_none() && token.len.to_usize() == text.len()) |
79 | .map(|(token, _error)| token.kind) | ||
87 | } | 80 | } |
88 | 81 | ||
89 | /// Returns the first encountered token at the beginning of the string. | 82 | /// Returns the first encountered token at the beginning of the string. |
90 | /// If the string contains zero tokens returns `None`. | ||
91 | /// | 83 | /// |
92 | /// The main difference between `first_token()` and `single_token()` is that | 84 | /// Returns `None` if the string contains zero tokens or if the token was parsed |
93 | /// the latter returns `None` if the string contains more than one token. | 85 | /// with an error. |
94 | pub fn first_token(text: &str) -> Option<ParsedToken> { | 86 | /// |
87 | /// Beware that unescape errors are not checked at tokenization time. | ||
88 | fn first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> { | ||
95 | // non-empty string is a precondition of `rustc_lexer::first_token()`. | 89 | // non-empty string is a precondition of `rustc_lexer::first_token()`. |
96 | if text.is_empty() { | 90 | if text.is_empty() { |
97 | None | 91 | return None; |
98 | } else { | ||
99 | let rustc_token = rustc_lexer::first_token(text); | ||
100 | Some(rustc_token_to_parsed_token(&rustc_token, text, TextUnit::from(0))) | ||
101 | } | 92 | } |
93 | |||
94 | let rustc_token = rustc_lexer::first_token(text); | ||
95 | let (syntax_kind, error) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text); | ||
96 | |||
97 | let token = Token { kind: syntax_kind, len: TextUnit::from_usize(rustc_token.len) }; | ||
98 | let error = error.map(|error| { | ||
99 | SyntaxError::new( | ||
100 | SyntaxErrorKind::TokenizeError(error), | ||
101 | TextRange::from_to(TextUnit::from(0), TextUnit::of_str(text)), | ||
102 | ) | ||
103 | }); | ||
104 | |||
105 | Some((token, error)) | ||
102 | } | 106 | } |
103 | 107 | ||
108 | // FIXME: simplify TokenizeError to `SyntaxError(String, TextRange)` as per @matklad advice: | ||
109 | // https://github.com/rust-analyzer/rust-analyzer/pull/2911/files#r371175067 | ||
110 | |||
104 | /// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant. | 111 | /// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant. |
105 | /// It describes all the types of errors that may happen during the tokenization | 112 | /// It describes all the types of errors that may happen during the tokenization |
106 | /// of Rust source. | 113 | /// of Rust source. |
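A minimal sketch of how a caller might drive the new API. The import path is an assumption for illustration — the functions are defined in `crates/ra_syntax/src/parsing/lexer.rs`, and this diff does not show how (or whether) they are re-exported:

    // Hypothetical usage sketch; assumes the lexer functions are reachable
    // from outside the `parsing` module.
    use ra_syntax::parsing::lexer::{lex_single_valid_syntax_kind, tokenize};

    fn demo(text: &str) {
        // Tokens and errors now come back as a plain tuple.
        let (tokens, errors) = tokenize(text);
        for token in &tokens {
            println!("{:?} of length {:?}", token.kind, token.len);
        }
        for error in &errors {
            println!("lexer error: {:?}", error);
        }

        // Single-token classification: `Some` only if the whole input is
        // exactly one token that lexed without errors.
        if let Some(kind) = lex_single_valid_syntax_kind("match") {
            println!("keyword kind: {:?}", kind); // presumably MATCH_KW
        }
    }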
@@ -136,122 +143,132 @@ pub enum TokenizeError {
136 | LifetimeStartsWithNumber, | 143 | LifetimeStartsWithNumber, |
137 | } | 144 | } |
138 | 145 | ||
139 | /// Mapper function that converts `rustc_lexer::Token` with some additional context | 146 | fn rustc_token_kind_to_syntax_kind( |
140 | /// to `ParsedToken` | 147 | rustc_token_kind: &rustc_lexer::TokenKind, |
141 | fn rustc_token_to_parsed_token( | 148 | token_text: &str, |
142 | rustc_token: &rustc_lexer::Token, | 149 | ) -> (SyntaxKind, Option<TokenizeError>) { |
143 | text: &str, | 150 | // A note on an intended tradeoff: |
144 | token_start_offset: TextUnit, | ||
145 | ) -> ParsedToken { | ||
146 | // We drop some useful information here (see patterns with double dots `..`) | 151 | // We drop some useful information here (see patterns with double dots `..`) |
147 | // Storing that info in `SyntaxKind` is not possible due to its layout requirements of | 152 | // Storing that info in `SyntaxKind` is not possible due to its layout requirements of |
148 | // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind` | 153 | // being `u16` that come from `rowan::SyntaxKind`. |
149 | // would mean hell of a rewrite | ||
150 | 154 | ||
151 | let token_range = | 155 | let syntax_kind = { |
152 | TextRange::offset_len(token_start_offset, TextUnit::from_usize(rustc_token.len)); | ||
153 | |||
154 | let token_text = &text[token_range]; | ||
155 | |||
156 | let (syntax_kind, error) = { | ||
157 | use rustc_lexer::TokenKind as TK; | 156 | use rustc_lexer::TokenKind as TK; |
158 | use TokenizeError as TE; | 157 | use TokenizeError as TE; |
159 | 158 | ||
160 | match rustc_token.kind { | 159 | match rustc_token_kind { |
161 | TK::LineComment => ok(COMMENT), | 160 | TK::LineComment => COMMENT, |
162 | TK::BlockComment { terminated } => { | 161 | |
163 | ok_if(terminated, COMMENT, TE::UnterminatedBlockComment) | 162 | TK::BlockComment { terminated: true } => COMMENT, |
163 | TK::BlockComment { terminated: false } => { | ||
164 | return (COMMENT, Some(TE::UnterminatedBlockComment)); | ||
164 | } | 165 | } |
165 | TK::Whitespace => ok(WHITESPACE), | 166 | |
166 | TK::Ident => ok(if token_text == "_" { | 167 | TK::Whitespace => WHITESPACE, |
167 | UNDERSCORE | 168 | |
168 | } else { | 169 | TK::Ident => { |
169 | SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) | 170 | if token_text == "_" { |
170 | }), | 171 | UNDERSCORE |
171 | TK::RawIdent => ok(IDENT), | 172 | } else { |
172 | TK::Literal { kind, .. } => match_literal_kind(&kind), | 173 | SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) |
173 | TK::Lifetime { starts_with_number } => { | 174 | } |
174 | ok_if(!starts_with_number, LIFETIME, TE::LifetimeStartsWithNumber) | ||
175 | } | 175 | } |
176 | TK::Semi => ok(SEMI), | ||
177 | TK::Comma => ok(COMMA), | ||
178 | TK::Dot => ok(DOT), | ||
179 | TK::OpenParen => ok(L_PAREN), | ||
180 | TK::CloseParen => ok(R_PAREN), | ||
181 | TK::OpenBrace => ok(L_CURLY), | ||
182 | TK::CloseBrace => ok(R_CURLY), | ||
183 | TK::OpenBracket => ok(L_BRACK), | ||
184 | TK::CloseBracket => ok(R_BRACK), | ||
185 | TK::At => ok(AT), | ||
186 | TK::Pound => ok(POUND), | ||
187 | TK::Tilde => ok(TILDE), | ||
188 | TK::Question => ok(QUESTION), | ||
189 | TK::Colon => ok(COLON), | ||
190 | TK::Dollar => ok(DOLLAR), | ||
191 | TK::Eq => ok(EQ), | ||
192 | TK::Not => ok(EXCL), | ||
193 | TK::Lt => ok(L_ANGLE), | ||
194 | TK::Gt => ok(R_ANGLE), | ||
195 | TK::Minus => ok(MINUS), | ||
196 | TK::And => ok(AMP), | ||
197 | TK::Or => ok(PIPE), | ||
198 | TK::Plus => ok(PLUS), | ||
199 | TK::Star => ok(STAR), | ||
200 | TK::Slash => ok(SLASH), | ||
201 | TK::Caret => ok(CARET), | ||
202 | TK::Percent => ok(PERCENT), | ||
203 | TK::Unknown => ok(ERROR), | ||
204 | } | ||
205 | }; | ||
206 | 176 | ||
207 | return ParsedToken { | 177 | TK::RawIdent => IDENT, |
208 | token: Token { kind: syntax_kind, len: token_range.len() }, | 178 | TK::Literal { kind, .. } => return match_literal_kind(&kind), |
209 | error: error | 179 | |
210 | .map(|error| SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range)), | 180 | TK::Lifetime { starts_with_number: false } => LIFETIME, |
181 | TK::Lifetime { starts_with_number: true } => { | ||
182 | return (LIFETIME, Some(TE::LifetimeStartsWithNumber)) | ||
183 | } | ||
184 | |||
185 | TK::Semi => SEMI, | ||
186 | TK::Comma => COMMA, | ||
187 | TK::Dot => DOT, | ||
188 | TK::OpenParen => L_PAREN, | ||
189 | TK::CloseParen => R_PAREN, | ||
190 | TK::OpenBrace => L_CURLY, | ||
191 | TK::CloseBrace => R_CURLY, | ||
192 | TK::OpenBracket => L_BRACK, | ||
193 | TK::CloseBracket => R_BRACK, | ||
194 | TK::At => AT, | ||
195 | TK::Pound => POUND, | ||
196 | TK::Tilde => TILDE, | ||
197 | TK::Question => QUESTION, | ||
198 | TK::Colon => COLON, | ||
199 | TK::Dollar => DOLLAR, | ||
200 | TK::Eq => EQ, | ||
201 | TK::Not => EXCL, | ||
202 | TK::Lt => L_ANGLE, | ||
203 | TK::Gt => R_ANGLE, | ||
204 | TK::Minus => MINUS, | ||
205 | TK::And => AMP, | ||
206 | TK::Or => PIPE, | ||
207 | TK::Plus => PLUS, | ||
208 | TK::Star => STAR, | ||
209 | TK::Slash => SLASH, | ||
210 | TK::Caret => CARET, | ||
211 | TK::Percent => PERCENT, | ||
212 | TK::Unknown => ERROR, | ||
213 | } | ||
211 | }; | 214 | }; |
212 | 215 | ||
213 | type ParsedSyntaxKind = (SyntaxKind, Option<TokenizeError>); | 216 | return (syntax_kind, None); |
214 | 217 | ||
215 | fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> ParsedSyntaxKind { | 218 | fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<TokenizeError>) { |
216 | use rustc_lexer::LiteralKind as LK; | 219 | use rustc_lexer::LiteralKind as LK; |
217 | use TokenizeError as TE; | 220 | use TokenizeError as TE; |
218 | 221 | ||
219 | match *kind { | 222 | #[rustfmt::skip] |
220 | LK::Int { empty_int, .. } => ok_if(!empty_int, INT_NUMBER, TE::EmptyInt), | 223 | let syntax_kind = match *kind { |
221 | LK::Float { empty_exponent, .. } => { | 224 | LK::Int { empty_int: false, .. } => INT_NUMBER, |
222 | ok_if(!empty_exponent, FLOAT_NUMBER, TE::EmptyExponent) | 225 | LK::Int { empty_int: true, .. } => { |
226 | return (INT_NUMBER, Some(TE::EmptyInt)) | ||
227 | } | ||
228 | |||
229 | LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER, | ||
230 | LK::Float { empty_exponent: true, .. } => { | ||
231 | return (FLOAT_NUMBER, Some(TE::EmptyExponent)) | ||
232 | } | ||
233 | |||
234 | LK::Char { terminated: true } => CHAR, | ||
235 | LK::Char { terminated: false } => { | ||
236 | return (CHAR, Some(TE::UnterminatedChar)) | ||
237 | } | ||
238 | |||
239 | LK::Byte { terminated: true } => BYTE, | ||
240 | LK::Byte { terminated: false } => { | ||
241 | return (BYTE, Some(TE::UnterminatedByte)) | ||
223 | } | 242 | } |
224 | LK::Char { terminated } => ok_if(terminated, CHAR, TE::UnterminatedChar), | 243 | |
225 | LK::Byte { terminated } => ok_if(terminated, BYTE, TE::UnterminatedByte), | 244 | LK::Str { terminated: true } => STRING, |
226 | LK::Str { terminated } => ok_if(terminated, STRING, TE::UnterminatedString), | 245 | LK::Str { terminated: false } => { |
227 | LK::ByteStr { terminated } => { | 246 | return (STRING, Some(TE::UnterminatedString)) |
228 | ok_if(terminated, BYTE_STRING, TE::UnterminatedByteString) | 247 | } |
248 | |||
249 | |||
250 | LK::ByteStr { terminated: true } => BYTE_STRING, | ||
251 | LK::ByteStr { terminated: false } => { | ||
252 | return (BYTE_STRING, Some(TE::UnterminatedByteString)) | ||
229 | } | 253 | } |
230 | 254 | ||
231 | LK::RawStr { started: true, terminated, .. } => { | 255 | LK::RawStr { started: true, terminated: true, .. } => RAW_STRING, |
232 | ok_if(terminated, RAW_STRING, TE::UnterminatedRawString) | 256 | LK::RawStr { started: true, terminated: false, .. } => { |
257 | return (RAW_STRING, Some(TE::UnterminatedRawString)) | ||
258 | } | ||
259 | LK::RawStr { started: false, .. } => { | ||
260 | return (RAW_STRING, Some(TE::UnstartedRawString)) | ||
233 | } | 261 | } |
234 | LK::RawStr { started: false, .. } => err(RAW_STRING, TE::UnstartedRawString), | ||
235 | 262 | ||
236 | LK::RawByteStr { started: true, terminated, .. } => { | 263 | LK::RawByteStr { started: true, terminated: true, .. } => RAW_BYTE_STRING, |
237 | ok_if(terminated, RAW_BYTE_STRING, TE::UnterminatedRawByteString) | 264 | LK::RawByteStr { started: true, terminated: false, .. } => { |
265 | return (RAW_BYTE_STRING, Some(TE::UnterminatedRawByteString)) | ||
238 | } | 266 | } |
239 | LK::RawByteStr { started: false, .. } => { | 267 | LK::RawByteStr { started: false, .. } => { |
240 | err(RAW_BYTE_STRING, TE::UnstartedRawByteString) | 268 | return (RAW_BYTE_STRING, Some(TE::UnstartedRawByteString)) |
241 | } | 269 | } |
242 | } | 270 | }; |
243 | } | 271 | |
244 | const fn ok(syntax_kind: SyntaxKind) -> ParsedSyntaxKind { | ||
245 | (syntax_kind, None) | 272 | (syntax_kind, None) |
246 | } | 273 | } |
247 | const fn err(syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind { | ||
248 | (syntax_kind, Some(error)) | ||
249 | } | ||
250 | fn ok_if(cond: bool, syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind { | ||
251 | if cond { | ||
252 | ok(syntax_kind) | ||
253 | } else { | ||
254 | err(syntax_kind, error) | ||
255 | } | ||
256 | } | ||
257 | } | 274 | } |
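The heart of the review feedback is visible in the last hunk: the `ok`/`ok_if`/`err` helpers are gone, replaced by matching directly on boolean struct fields, with each error case getting its own arm and an early `return`. A self-contained toy distilling that pattern (all names here are hypothetical, chosen only to mirror the shape of the real code):

    #[derive(Debug)]
    enum Kind { Comment, Char }

    #[derive(Debug)]
    enum Error { UnterminatedBlockComment, UnterminatedChar }

    enum Raw {
        BlockComment { terminated: bool },
        Char { terminated: bool },
    }

    // Happy paths fall through to the final `(kind, None)`; error cases
    // return early from their own match arm, as in the hunk above.
    fn classify(raw: &Raw) -> (Kind, Option<Error>) {
        let kind = match raw {
            Raw::BlockComment { terminated: true } => Kind::Comment,
            Raw::BlockComment { terminated: false } => {
                return (Kind::Comment, Some(Error::UnterminatedBlockComment))
            }
            Raw::Char { terminated: true } => Kind::Char,
            Raw::Char { terminated: false } => {
                return (Kind::Char, Some(Error::UnterminatedChar))
            }
        };
        (kind, None)
    }

    fn main() {
        let (kind, error) = classify(&Raw::Char { terminated: false });
        println!("{:?}, {:?}", kind, error); // Char, Some(UnterminatedChar)
    }

The tradeoff is verbosity for explicitness: every `(kind, error)` pairing is spelled out at its own arm instead of being funneled through a conditional helper.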
diff --git a/crates/ra_syntax/src/parsing/reparsing.rs b/crates/ra_syntax/src/parsing/reparsing.rs
index ad1a7c855..1f351e9fc 100644
--- a/crates/ra_syntax/src/parsing/reparsing.rs
+++ b/crates/ra_syntax/src/parsing/reparsing.rs
@@ -12,7 +12,7 @@ use ra_text_edit::AtomTextEdit;
12 | use crate::{ | 12 | use crate::{ |
13 | algo, | 13 | algo, |
14 | parsing::{ | 14 | parsing::{ |
15 | lexer::{single_token, tokenize, ParsedTokens, Token}, | 15 | lexer::{lex_single_syntax_kind, tokenize, Token}, |
16 | text_token_source::TextTokenSource, | 16 | text_token_source::TextTokenSource, |
17 | text_tree_sink::TextTreeSink, | 17 | text_tree_sink::TextTreeSink, |
18 | }, | 18 | }, |
@@ -54,7 +54,7 @@ fn reparse_token<'node>(
54 | } | 54 | } |
55 | 55 | ||
56 | let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit); | 56 | let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit); |
57 | let new_token_kind = single_token(&new_text)?.token.kind; | 57 | let (new_token_kind, _error) = lex_single_syntax_kind(&new_text)?; |
58 | 58 | ||
59 | if new_token_kind != prev_token_kind | 59 | if new_token_kind != prev_token_kind |
60 | || (new_token_kind == IDENT && is_contextual_kw(&new_text)) | 60 | || (new_token_kind == IDENT && is_contextual_kw(&new_text)) |
@@ -67,8 +67,8 @@ fn reparse_token<'node>(
67 | // `b` no longer remains an identifier, but becomes a part of byte string literal | 67 | // `b` no longer remains an identifier, but becomes a part of byte string literal |
68 | if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) { | 68 | if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) { |
69 | new_text.push(next_char); | 69 | new_text.push(next_char); |
70 | let token_with_next_char = single_token(&new_text); | 70 | let token_with_next_char = lex_single_syntax_kind(&new_text); |
71 | if token_with_next_char.is_some() { | 71 | if let Some((_kind, _error)) = token_with_next_char { |
72 | return None; | 72 | return None; |
73 | } | 73 | } |
74 | new_text.pop(); | 74 | new_text.pop(); |
@@ -88,23 +88,26 @@ fn reparse_block<'node>(
88 | ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> { | 88 | ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> { |
89 | let (node, reparser) = find_reparsable_node(root, edit.delete)?; | 89 | let (node, reparser) = find_reparsable_node(root, edit.delete)?; |
90 | let text = get_text_after_edit(node.clone().into(), &edit); | 90 | let text = get_text_after_edit(node.clone().into(), &edit); |
91 | let ParsedTokens { tokens, errors } = tokenize(&text); | 91 | |
92 | let (tokens, new_lexer_errors) = tokenize(&text); | ||
92 | if !is_balanced(&tokens) { | 93 | if !is_balanced(&tokens) { |
93 | return None; | 94 | return None; |
94 | } | 95 | } |
96 | |||
95 | let mut token_source = TextTokenSource::new(&text, &tokens); | 97 | let mut token_source = TextTokenSource::new(&text, &tokens); |
96 | let mut tree_sink = TextTreeSink::new(&text, &tokens, errors); | 98 | let mut tree_sink = TextTreeSink::new(&text, &tokens); |
97 | reparser.parse(&mut token_source, &mut tree_sink); | 99 | reparser.parse(&mut token_source, &mut tree_sink); |
98 | let (green, new_errors) = tree_sink.finish(); | 100 | |
99 | Some((node.replace_with(green), new_errors, node.text_range())) | 101 | let (green, mut new_parser_errors) = tree_sink.finish(); |
102 | new_parser_errors.extend(new_lexer_errors); | ||
103 | |||
104 | Some((node.replace_with(green), new_parser_errors, node.text_range())) | ||
100 | } | 105 | } |
101 | 106 | ||
102 | fn get_text_after_edit(element: SyntaxElement, edit: &AtomTextEdit) -> String { | 107 | fn get_text_after_edit(element: SyntaxElement, edit: &AtomTextEdit) -> String { |
103 | let edit = | 108 | let edit = |
104 | AtomTextEdit::replace(edit.delete - element.text_range().start(), edit.insert.clone()); | 109 | AtomTextEdit::replace(edit.delete - element.text_range().start(), edit.insert.clone()); |
105 | 110 | ||
106 | // Note: we could move this match to a method or even further: use enum_dispatch crate | ||
107 | // https://crates.io/crates/enum_dispatch | ||
108 | let text = match element { | 111 | let text = match element { |
109 | NodeOrToken::Token(token) => token.text().to_string(), | 112 | NodeOrToken::Token(token) => token.text().to_string(), |
110 | NodeOrToken::Node(node) => node.text().to_string(), | 113 | NodeOrToken::Node(node) => node.text().to_string(), |
@@ -122,8 +125,6 @@ fn is_contextual_kw(text: &str) -> bool {
122 | fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNode, Reparser)> { | 125 | fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNode, Reparser)> { |
123 | let node = algo::find_covering_element(node, range); | 126 | let node = algo::find_covering_element(node, range); |
124 | 127 | ||
125 | // Note: we could move this match to a method or even further: use enum_dispatch crate | ||
126 | // https://crates.io/crates/enum_dispatch | ||
127 | let mut ancestors = match node { | 128 | let mut ancestors = match node { |
128 | NodeOrToken::Token(it) => it.parent().ancestors(), | 129 | NodeOrToken::Token(it) => it.parent().ancestors(), |
129 | NodeOrToken::Node(it) => it.ancestors(), | 130 | NodeOrToken::Node(it) => it.ancestors(), |
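A consequence of the `TextTreeSink` change in the next file is that `reparse_block` must now merge the two error streams itself: lexer errors come straight out of `tokenize`, parser errors out of `tree_sink.finish()`, and the caller concatenates them. A trivial runnable sketch of that merge, with `SyntaxError` stubbed out:

    // Stub standing in for ra_syntax's SyntaxError, for illustration only.
    #[derive(Debug)]
    struct SyntaxError(&'static str);

    fn main() {
        let new_lexer_errors = vec![SyntaxError("unterminated string")];
        let mut new_parser_errors = vec![SyntaxError("expected expression")];

        // Same shape as `new_parser_errors.extend(new_lexer_errors)` in the
        // hunk above: parser errors first, lexer errors appended after.
        new_parser_errors.extend(new_lexer_errors);
        assert_eq!(new_parser_errors.len(), 2);
        println!("{:?}", new_parser_errors);
    }

Note that by construction the combined list is ordered parser-errors-then-lexer-errors rather than by text position.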
diff --git a/crates/ra_syntax/src/parsing/text_tree_sink.rs b/crates/ra_syntax/src/parsing/text_tree_sink.rs
index 5faac588b..dd202601d 100644
--- a/crates/ra_syntax/src/parsing/text_tree_sink.rs
+++ b/crates/ra_syntax/src/parsing/text_tree_sink.rs
@@ -92,14 +92,14 @@ impl<'a> TreeSink for TextTreeSink<'a> {
92 | } | 92 | } |
93 | 93 | ||
94 | impl<'a> TextTreeSink<'a> { | 94 | impl<'a> TextTreeSink<'a> { |
95 | pub(super) fn new(text: &'a str, tokens: &'a [Token], errors: Vec<SyntaxError>) -> Self { | 95 | pub(super) fn new(text: &'a str, tokens: &'a [Token]) -> Self { |
96 | Self { | 96 | Self { |
97 | text, | 97 | text, |
98 | tokens, | 98 | tokens, |
99 | text_pos: 0.into(), | 99 | text_pos: 0.into(), |
100 | token_pos: 0, | 100 | token_pos: 0, |
101 | state: State::PendingStart, | 101 | state: State::PendingStart, |
102 | inner: SyntaxTreeBuilder::new(errors), | 102 | inner: SyntaxTreeBuilder::default(), |
103 | } | 103 | } |
104 | } | 104 | } |
105 | 105 | ||
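Correspondingly, `TextTreeSink::new` no longer threads the lexer errors into the tree builder. A call-site fragment (not a standalone program) contrasting the two constructors shown in this hunk:

    // Before: lexer errors were handed to the sink up front.
    let mut tree_sink = TextTreeSink::new(&text, &tokens, errors);

    // After: the sink starts from a default builder; the caller merges
    // lexer errors into the parse errors after `finish()` (see reparse_block).
    let mut tree_sink = TextTreeSink::new(&text, &tokens);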