Diffstat (limited to 'crates/syntax/src/parsing')
-rw-r--r--   crates/syntax/src/parsing/lexer.rs               244
-rw-r--r--   crates/syntax/src/parsing/reparsing.rs           455
-rw-r--r--   crates/syntax/src/parsing/text_token_source.rs    84
-rw-r--r--   crates/syntax/src/parsing/text_tree_sink.rs      183
4 files changed, 966 insertions, 0 deletions
diff --git a/crates/syntax/src/parsing/lexer.rs b/crates/syntax/src/parsing/lexer.rs
new file mode 100644
index 000000000..fa3be1016
--- /dev/null
+++ b/crates/syntax/src/parsing/lexer.rs
@@ -0,0 +1,244 @@
//! The lexer analyzes a raw input string and produces lexemes (tokens).
//! It is just a bridge to `rustc_lexer`.

use rustc_lexer::{LiteralKind as LK, RawStrError};

use std::convert::TryInto;

use crate::{
    SyntaxError,
    SyntaxKind::{self, *},
    TextRange, TextSize, T,
};

/// A token of Rust source.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token {
    /// The kind of token.
    pub kind: SyntaxKind,
    /// The length of the token.
    pub len: TextSize,
}

/// Breaks a string up into its component tokens.
/// Beware that it checks for a shebang first, and the shebang's length
/// contributes to the resulting tokens' offsets.
pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
    // A non-empty string is a precondition of `rustc_lexer::strip_shebang()`.
    if text.is_empty() {
        return Default::default();
    }

    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    let mut offset = match rustc_lexer::strip_shebang(text) {
        Some(shebang_len) => {
            tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() });
            shebang_len
        }
        None => 0,
    };

    let text_without_shebang = &text[offset..];

    for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
        let token_len: TextSize = rustc_token.len.try_into().unwrap();
        let token_range = TextRange::at(offset.try_into().unwrap(), token_len);

        let (syntax_kind, err_message) =
            rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);

        tokens.push(Token { kind: syntax_kind, len: token_len });

        if let Some(err_message) = err_message {
            errors.push(SyntaxError::new(err_message, token_range));
        }

        offset += rustc_token.len;
    }

    (tokens, errors)
}
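
// For instance (an illustrative sketch, not part of the code above): a shebang
// becomes the first token, so every subsequent token offset already includes
// its length:
//
//     let (tokens, _) = tokenize("#!/usr/bin/env rust\nfn main() {}");
//     assert_eq!(tokens[0].kind, SHEBANG); // spans the whole first line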

/// Returns the `SyntaxKind` and `Option<SyntaxError>` of the first token
/// encountered at the beginning of the string.
///
/// Returns `None` if the string contains zero *or two or more* tokens.
/// The token is malformed if the returned error is not `None`.
///
/// Beware that unescape errors are not checked at tokenization time.
pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
    lex_first_token(text)
        .filter(|(token, _)| token.len == TextSize::of(text))
        .map(|(token, error)| (token.kind, error))
}

/// The same as `lex_single_syntax_kind()`, but returns only the `SyntaxKind` and
/// returns `None` if any tokenization error occurred.
///
/// Beware that unescape errors are not checked at tokenization time.
pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> {
    lex_first_token(text)
        .filter(|(token, error)| error.is_none() && token.len == TextSize::of(text))
        .map(|(token, _error)| token.kind)
}

/// Returns the `SyntaxKind` and `Option<SyntaxError>` of the first token
/// encountered at the beginning of the string.
///
/// Returns `None` if the string contains zero tokens.
/// The token is malformed if the returned error is not `None`.
///
/// Beware that unescape errors are not checked at tokenization time.
fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
    // A non-empty string is a precondition of `rustc_lexer::first_token()`.
    if text.is_empty() {
        return None;
    }

    let rustc_token = rustc_lexer::first_token(text);
    let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);

    let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() };
    let optional_error = err_message
        .map(|err_message| SyntaxError::new(err_message, TextRange::up_to(TextSize::of(text))));

    Some((token, optional_error))
}

/// Returns a `SyntaxKind` and an optional tokenization error message.
fn rustc_token_kind_to_syntax_kind(
    rustc_token_kind: &rustc_lexer::TokenKind,
    token_text: &str,
) -> (SyntaxKind, Option<&'static str>) {
    // A note on an intended tradeoff:
    // We drop some useful information here (see patterns with double dots `..`).
    // Storing that info in `SyntaxKind` is not possible due to its layout
    // requirement of being a `u16`, which comes from `rowan::SyntaxKind`.

    let syntax_kind = {
        match rustc_token_kind {
            rustc_lexer::TokenKind::LineComment => COMMENT,

            rustc_lexer::TokenKind::BlockComment { terminated: true } => COMMENT,
            rustc_lexer::TokenKind::BlockComment { terminated: false } => {
                return (
                    COMMENT,
                    Some("Missing trailing `*/` symbols to terminate the block comment"),
                );
            }

            rustc_lexer::TokenKind::Whitespace => WHITESPACE,

            rustc_lexer::TokenKind::Ident => {
                if token_text == "_" {
                    UNDERSCORE
                } else {
                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
                }
            }

            rustc_lexer::TokenKind::RawIdent => IDENT,
            rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(&kind),

            rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME,
            rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => {
                return (LIFETIME, Some("Lifetime name cannot start with a number"))
            }

            rustc_lexer::TokenKind::Semi => T![;],
            rustc_lexer::TokenKind::Comma => T![,],
            rustc_lexer::TokenKind::Dot => T![.],
            rustc_lexer::TokenKind::OpenParen => T!['('],
            rustc_lexer::TokenKind::CloseParen => T![')'],
            rustc_lexer::TokenKind::OpenBrace => T!['{'],
            rustc_lexer::TokenKind::CloseBrace => T!['}'],
            rustc_lexer::TokenKind::OpenBracket => T!['['],
            rustc_lexer::TokenKind::CloseBracket => T![']'],
            rustc_lexer::TokenKind::At => T![@],
            rustc_lexer::TokenKind::Pound => T![#],
            rustc_lexer::TokenKind::Tilde => T![~],
            rustc_lexer::TokenKind::Question => T![?],
            rustc_lexer::TokenKind::Colon => T![:],
            rustc_lexer::TokenKind::Dollar => T![$],
            rustc_lexer::TokenKind::Eq => T![=],
            rustc_lexer::TokenKind::Not => T![!],
            rustc_lexer::TokenKind::Lt => T![<],
            rustc_lexer::TokenKind::Gt => T![>],
            rustc_lexer::TokenKind::Minus => T![-],
            rustc_lexer::TokenKind::And => T![&],
            rustc_lexer::TokenKind::Or => T![|],
            rustc_lexer::TokenKind::Plus => T![+],
            rustc_lexer::TokenKind::Star => T![*],
            rustc_lexer::TokenKind::Slash => T![/],
            rustc_lexer::TokenKind::Caret => T![^],
            rustc_lexer::TokenKind::Percent => T![%],
            rustc_lexer::TokenKind::Unknown => ERROR,
        }
    };

    return (syntax_kind, None);

    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
        #[rustfmt::skip]
        let syntax_kind = match *kind {
            LK::Int { empty_int: false, .. } => INT_NUMBER,
            LK::Int { empty_int: true, .. } => {
                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
            }

            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
            LK::Float { empty_exponent: true, .. } => {
                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
            }

            LK::Char { terminated: true } => CHAR,
            LK::Char { terminated: false } => {
                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
            }

            LK::Byte { terminated: true } => BYTE,
            LK::Byte { terminated: false } => {
                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
            }

            LK::Str { terminated: true } => STRING,
            LK::Str { terminated: false } => {
                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
            }


            LK::ByteStr { terminated: true } => BYTE_STRING,
            LK::ByteStr { terminated: false } => {
                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
            }

            LK::RawStr { err, .. } => match err {
                None => RAW_STRING,
                Some(RawStrError::InvalidStarter { .. }) => return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal")),
                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
                    return (RAW_STRING, Some("Missing trailing `\"` to terminate the raw string literal"))
                } else {
                    return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))

                },
                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_STRING, Some("Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols")),
            },
            LK::RawByteStr { err, .. } => match err {
                None => RAW_BYTE_STRING,
                Some(RawStrError::InvalidStarter { .. }) => return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal")),
                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` to terminate the raw byte string literal"))
                } else {
                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))

                },
                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_BYTE_STRING, Some("Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols")),
            },
        };

        (syntax_kind, None)
    }
}
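
A minimal usage sketch of the lexer's entry points (illustrative only; these are crate-internal functions, so the snippet assumes it runs inside the `syntax` crate):

    // `tokenize` yields one token per lexeme plus any lexer errors.
    let (tokens, errors) = tokenize("let x = 92;");
    assert!(errors.is_empty());

    // `lex_single_syntax_kind` succeeds only if the whole input is one token:
    // `==` lexes as two `=` tokens at this stage, so it is rejected.
    assert!(lex_single_syntax_kind("=").is_some());
    assert!(lex_single_syntax_kind("==").is_none());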
diff --git a/crates/syntax/src/parsing/reparsing.rs b/crates/syntax/src/parsing/reparsing.rs
new file mode 100644
index 000000000..4149f856a
--- /dev/null
+++ b/crates/syntax/src/parsing/reparsing.rs
@@ -0,0 +1,455 @@
//! Implementation of incremental re-parsing.
//!
//! We use two simple strategies for this:
//! - if the edit modifies only a single token (like changing an identifier's
//!   letter), we replace only this token.
//! - otherwise, we search for the nearest `{}` block which contains the edit
//!   and try to parse only this block.

use parser::Reparser;
use text_edit::Indel;

use crate::{
    algo,
    parsing::{
        lexer::{lex_single_syntax_kind, tokenize, Token},
        text_token_source::TextTokenSource,
        text_tree_sink::TextTreeSink,
    },
    syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode},
    SyntaxError,
    SyntaxKind::*,
    TextRange, TextSize, T,
};

pub(crate) fn incremental_reparse(
    node: &SyntaxNode,
    edit: &Indel,
    errors: Vec<SyntaxError>,
) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
    if let Some((green, new_errors, old_range)) = reparse_token(node, &edit) {
        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
    }

    if let Some((green, new_errors, old_range)) = reparse_block(node, &edit) {
        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
    }
    None
}

fn reparse_token<'node>(
    root: &'node SyntaxNode,
    edit: &Indel,
) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
    let prev_token = algo::find_covering_element(root, edit.delete).as_token()?.clone();
    let prev_token_kind = prev_token.kind();
    match prev_token_kind {
        WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => {
            if prev_token_kind == WHITESPACE || prev_token_kind == COMMENT {
                // Removing a newline may extend the previous token.
                let deleted_range = edit.delete - prev_token.text_range().start();
                if prev_token.text()[deleted_range].contains('\n') {
                    return None;
                }
            }

            let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit);
            let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;

            if new_token_kind != prev_token_kind
                || (new_token_kind == IDENT && is_contextual_kw(&new_text))
            {
                return None;
            }

            // Check that the edited token is not a part of a bigger token.
            // E.g. if for the source code `bruh"str"` the user removed `ruh`,
            // then `b` no longer remains an identifier, but becomes a part of
            // a byte string literal.
            if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) {
                new_text.push(next_char);
                let token_with_next_char = lex_single_syntax_kind(&new_text);
                if let Some((_kind, _error)) = token_with_next_char {
                    return None;
                }
                new_text.pop();
            }

            let new_token =
                GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), new_text.into());
            Some((
                prev_token.replace_with(new_token),
                new_err.into_iter().collect(),
                prev_token.text_range(),
            ))
        }
        _ => None,
    }
}

fn reparse_block<'node>(
    root: &'node SyntaxNode,
    edit: &Indel,
) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
    let (node, reparser) = find_reparsable_node(root, edit.delete)?;
    let text = get_text_after_edit(node.clone().into(), edit);

    let (tokens, new_lexer_errors) = tokenize(&text);
    if !is_balanced(&tokens) {
        return None;
    }

    let mut token_source = TextTokenSource::new(&text, &tokens);
    let mut tree_sink = TextTreeSink::new(&text, &tokens);
    reparser.parse(&mut token_source, &mut tree_sink);

    let (green, mut new_parser_errors) = tree_sink.finish();
    new_parser_errors.extend(new_lexer_errors);

    Some((node.replace_with(green), new_parser_errors, node.text_range()))
}

fn get_text_after_edit(element: SyntaxElement, edit: &Indel) -> String {
    let edit = Indel::replace(edit.delete - element.text_range().start(), edit.insert.clone());

    let mut text = match element {
        NodeOrToken::Token(token) => token.text().to_string(),
        NodeOrToken::Node(node) => node.text().to_string(),
    };
    edit.apply(&mut text);
    text
}

fn is_contextual_kw(text: &str) -> bool {
    matches!(text, "auto" | "default" | "union")
}

fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNode, Reparser)> {
    let node = algo::find_covering_element(node, range);

    let mut ancestors = match node {
        NodeOrToken::Token(it) => it.parent().ancestors(),
        NodeOrToken::Node(it) => it.ancestors(),
    };
    ancestors.find_map(|node| {
        let first_child = node.first_child_or_token().map(|it| it.kind());
        let parent = node.parent().map(|it| it.kind());
        Reparser::for_node(node.kind(), first_child, parent).map(|r| (node, r))
    })
}

fn is_balanced(tokens: &[Token]) -> bool {
    if tokens.is_empty()
        || tokens.first().unwrap().kind != T!['{']
        || tokens.last().unwrap().kind != T!['}']
    {
        return false;
    }
    let mut balance = 0usize;
    for t in &tokens[1..tokens.len() - 1] {
        match t.kind {
            T!['{'] => balance += 1,
            T!['}'] => {
                balance = match balance.checked_sub(1) {
                    Some(b) => b,
                    None => return false,
                }
            }
            _ => (),
        }
    }
    balance == 0
}

fn merge_errors(
    old_errors: Vec<SyntaxError>,
    new_errors: Vec<SyntaxError>,
    range_before_reparse: TextRange,
    edit: &Indel,
) -> Vec<SyntaxError> {
    let mut res = Vec::new();

    for old_err in old_errors {
        let old_err_range = old_err.range();
        if old_err_range.end() <= range_before_reparse.start() {
            res.push(old_err);
        } else if old_err_range.start() >= range_before_reparse.end() {
            let inserted_len = TextSize::of(&edit.insert);
            res.push(old_err.with_range((old_err_range + inserted_len) - edit.delete.len()));
            // Note: extra parens are intentional to prevent uint underflow, HWAB (here was a bug)
        }
    }
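    // A worked example of the shift above (illustrative): if the edit deleted
    // 5 chars and inserted 2, an old error at 100..103 lying entirely after
    // the reparsed range moves to 97..100 (+2 for the insertion, -5 for the
    // deletion). Old errors overlapping the reparsed range are dropped in
    // favor of `new_errors`.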
    res.extend(new_errors.into_iter().map(|new_err| {
        // fighting borrow checker with a variable ;)
        let offseted_range = new_err.range() + range_before_reparse.start();
        new_err.with_range(offseted_range)
    }));
    res
}

#[cfg(test)]
mod tests {
    use test_utils::{assert_eq_text, extract_range};

    use super::*;
    use crate::{AstNode, Parse, SourceFile};

    fn do_check(before: &str, replace_with: &str, reparsed_len: u32) {
        let (range, before) = extract_range(before);
        let edit = Indel::replace(range, replace_with.to_owned());
        let after = {
            let mut after = before.clone();
            edit.apply(&mut after);
            after
        };

        let fully_reparsed = SourceFile::parse(&after);
        let incrementally_reparsed: Parse<SourceFile> = {
            let before = SourceFile::parse(&before);
            let (green, new_errors, range) =
                incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec()).unwrap();
            assert_eq!(range.len(), reparsed_len.into(), "reparsed fragment has wrong length");
            Parse::new(green, new_errors)
        };

        assert_eq_text!(
            &format!("{:#?}", fully_reparsed.tree().syntax()),
            &format!("{:#?}", incrementally_reparsed.tree().syntax()),
        );
        assert_eq!(fully_reparsed.errors(), incrementally_reparsed.errors());
    }

    #[test] // FIXME: some tests here actually test token reparsing
    fn reparse_block_tests() {
        do_check(
            r"
fn foo() {
    let x = foo + <|>bar<|>
}
",
            "baz",
            3,
        );
        do_check(
            r"
fn foo() {
    let x = foo<|> + bar<|>
}
",
            "baz",
            25,
        );
        do_check(
            r"
struct Foo {
    f: foo<|><|>
}
",
            ",\n g: (),",
            14,
        );
        do_check(
            r"
fn foo {
    let;
    1 + 1;
    <|>92<|>;
}
",
            "62",
            31, // FIXME: reparse only int literal here
        );
        do_check(
            r"
mod foo {
    fn <|><|>
}
",
            "bar",
            11,
        );

        do_check(
            r"
trait Foo {
    type <|>Foo<|>;
}
",
            "Output",
            3,
        );
        do_check(
            r"
impl IntoIterator<Item=i32> for Foo {
    f<|><|>
}
",
            "n next(",
            9,
        );
        do_check(r"use a::b::{foo,<|>,bar<|>};", "baz", 10);
        do_check(
            r"
pub enum A {
    Foo<|><|>
}
",
            "\nBar;\n",
            11,
        );
        do_check(
            r"
foo!{a, b<|><|> d}
",
            ", c[3]",
            8,
        );
        do_check(
            r"
fn foo() {
    vec![<|><|>]
}
",
            "123",
            14,
        );
        do_check(
            r"
extern {
    fn<|>;<|>
}
",
            " exit(code: c_int)",
            11,
        );
    }

    #[test]
    fn reparse_token_tests() {
        do_check(
            r"<|><|>
fn foo() -> i32 { 1 }
",
            "\n\n\n \n",
            1,
        );
        do_check(
            r"
fn foo() -> <|><|> {}
",
            " \n",
            2,
        );
        do_check(
            r"
fn <|>foo<|>() -> i32 { 1 }
",
            "bar",
            3,
        );
        do_check(
            r"
fn foo<|><|>foo() { }
",
            "bar",
            6,
        );
        do_check(
            r"
fn foo /* <|><|> */ () {}
",
            "some comment",
            6,
        );
        do_check(
            r"
fn baz <|><|> () {}
",
            " \t\t\n\n",
            2,
        );
        do_check(
            r"
fn baz <|><|> () {}
",
            " \t\t\n\n",
            2,
        );
        do_check(
            r"
/// foo <|><|>omment
mod { }
",
            "c",
            14,
        );
        do_check(
            r#"
fn -> &str { "Hello<|><|>" }
"#,
            ", world",
            7,
        );
        do_check(
            r#"
fn -> &str { // "Hello<|><|>"
"#,
            ", world",
            10,
        );
        do_check(
            r##"
fn -> &str { r#"Hello<|><|>"#
"##,
            ", world",
            10,
        );
        do_check(
            r"
#[derive(<|>Copy<|>)]
enum Foo {

}
",
            "Clone",
            4,
        );
    }

    #[test]
    fn reparse_str_token_with_error_unchanged() {
        do_check(r#""<|>Unclosed<|> string literal"#, "Still unclosed", 24);
    }

    #[test]
    fn reparse_str_token_with_error_fixed() {
        do_check(r#""unterinated<|><|>"#, "\"", 12);
    }

    #[test]
    fn reparse_block_with_error_in_middle_unchanged() {
        do_check(
            r#"fn main() {
                if {}
                32 + 4<|><|>
                return
                if {}
            }"#,
            "23",
            105,
        )
    }

    #[test]
    fn reparse_block_with_error_in_middle_fixed() {
        do_check(
            r#"fn main() {
                if {}
                32 + 4<|><|>
                return
                if {}
            }"#,
            ";",
            105,
        )
    }
}
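
A hedged sketch of how a caller drives `incremental_reparse` (mirroring `do_check` above; `SourceFile` and `Parse` are the crate's parsing entry points):

    let before = SourceFile::parse("fn foo() { 1 + 1 }");
    // Rename `foo` to `bar`: the edit stays inside a single IDENT token, so
    // `reparse_token` handles it and `reparse_block` is never consulted.
    let edit = Indel::replace(TextRange::new(3.into(), 6.into()), "bar".to_string());
    let reparsed = incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec());
    assert!(reparsed.is_some());

An edit that unbalances braces or spans several tokens instead falls through to `reparse_block`, and if no reparsable block covers it, the caller must fall back to a full reparse.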
diff --git a/crates/syntax/src/parsing/text_token_source.rs b/crates/syntax/src/parsing/text_token_source.rs
new file mode 100644
index 000000000..df866dc2b
--- /dev/null
+++ b/crates/syntax/src/parsing/text_token_source.rs
@@ -0,0 +1,84 @@
//! See `TextTokenSource` docs.

use parser::TokenSource;

use crate::{parsing::lexer::Token, SyntaxKind::EOF, TextRange, TextSize};

/// Implementation of `parser::TokenSource` that takes tokens from source code text.
pub(crate) struct TextTokenSource<'t> {
    text: &'t str,
    /// token and its start position (non-whitespace/comment tokens)
    /// ```non-rust
    /// struct Foo;
    /// ^------^--^-
    /// |      |  \________
    /// |      \____       \
    /// |           \       |
    /// (struct, 0) (Foo, 7) (;, 10)
    /// ```
    /// `[(struct, 0), (Foo, 7), (;, 10)]`
    token_offset_pairs: Vec<(Token, TextSize)>,

    /// Current token and position
    curr: (parser::Token, usize),
}

impl<'t> TokenSource for TextTokenSource<'t> {
    fn current(&self) -> parser::Token {
        self.curr.0
    }

    fn lookahead_nth(&self, n: usize) -> parser::Token {
        mk_token(self.curr.1 + n, &self.token_offset_pairs)
    }

    fn bump(&mut self) {
        if self.curr.0.kind == EOF {
            return;
        }

        let pos = self.curr.1 + 1;
        self.curr = (mk_token(pos, &self.token_offset_pairs), pos);
    }

    fn is_keyword(&self, kw: &str) -> bool {
        self.token_offset_pairs
            .get(self.curr.1)
            .map(|(token, offset)| &self.text[TextRange::at(*offset, token.len)] == kw)
            .unwrap_or(false)
    }
}

fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token {
    let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) {
        Some((token, offset)) => (
            token.kind,
            token_offset_pairs
                .get(pos + 1)
                .map(|(_, next_offset)| offset + token.len == *next_offset)
                .unwrap_or(false),
        ),
        None => (EOF, false),
    };
    parser::Token { kind, is_jointed_to_next }
}

impl<'t> TextTokenSource<'t> {
    /// Generate input from tokens (excluding comments and whitespace).
    pub fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> {
        let token_offset_pairs: Vec<_> = raw_tokens
            .iter()
            .filter_map({
                let mut len = 0.into();
                move |token| {
                    let pair = if token.kind.is_trivia() { None } else { Some((*token, len)) };
                    len += token.len;
                    pair
                }
            })
            .collect();

        let first = mk_token(0, &token_offset_pairs);
        TextTokenSource { text, token_offset_pairs, curr: (first, 0) }
    }
}
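
The jointness bit computed by `mk_token` is what later lets the parser glue adjacent punctuation into composite tokens. A small illustrative sketch (again assuming crate-internal scope):

    let text = "a >> b";
    let (tokens, _) = tokenize(text);
    let src = TextTokenSource::new(text, &tokens);
    // Trivia is filtered out, so positions are: 0 = `a`, 1 and 2 = `>`, 3 = `b`.
    assert!(src.lookahead_nth(1).is_jointed_to_next); // `>>` touch: may form a shift
    assert!(!src.lookahead_nth(2).is_jointed_to_next); // a space follows the second `>`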
diff --git a/crates/syntax/src/parsing/text_tree_sink.rs b/crates/syntax/src/parsing/text_tree_sink.rs
new file mode 100644
index 000000000..c1b5f246d
--- /dev/null
+++ b/crates/syntax/src/parsing/text_tree_sink.rs
@@ -0,0 +1,183 @@
//! FIXME: write short doc here

use std::mem;

use parser::{ParseError, TreeSink};

use crate::{
    parsing::Token,
    syntax_node::GreenNode,
    SmolStr, SyntaxError,
    SyntaxKind::{self, *},
    SyntaxTreeBuilder, TextRange, TextSize,
};

/// Bridges the parser with our specific syntax tree representation.
///
/// `TextTreeSink` also handles attachment of trivia (whitespace) to nodes.
pub(crate) struct TextTreeSink<'a> {
    text: &'a str,
    tokens: &'a [Token],
    text_pos: TextSize,
    token_pos: usize,
    state: State,
    inner: SyntaxTreeBuilder,
}

enum State {
    PendingStart,
    Normal,
    PendingFinish,
}

impl<'a> TreeSink for TextTreeSink<'a> {
    fn token(&mut self, kind: SyntaxKind, n_tokens: u8) {
        match mem::replace(&mut self.state, State::Normal) {
            State::PendingStart => unreachable!(),
            State::PendingFinish => self.inner.finish_node(),
            State::Normal => (),
        }
        self.eat_trivias();
        let n_tokens = n_tokens as usize;
        let len = self.tokens[self.token_pos..self.token_pos + n_tokens]
            .iter()
            .map(|it| it.len)
            .sum::<TextSize>();
        self.do_token(kind, len, n_tokens);
    }

    fn start_node(&mut self, kind: SyntaxKind) {
        match mem::replace(&mut self.state, State::Normal) {
            State::PendingStart => {
                self.inner.start_node(kind);
                // No need to attach trivias to previous node: there is no
                // previous node.
                return;
            }
            State::PendingFinish => self.inner.finish_node(),
            State::Normal => (),
        }

        let n_trivias =
            self.tokens[self.token_pos..].iter().take_while(|it| it.kind.is_trivia()).count();
        let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias];
        let mut trivia_end =
            self.text_pos + leading_trivias.iter().map(|it| it.len).sum::<TextSize>();

        let n_attached_trivias = {
            let leading_trivias = leading_trivias.iter().rev().map(|it| {
                let next_end = trivia_end - it.len;
                let range = TextRange::new(next_end, trivia_end);
                trivia_end = next_end;
                (it.kind, &self.text[range])
            });
            n_attached_trivias(kind, leading_trivias)
        };
        self.eat_n_trivias(n_trivias - n_attached_trivias);
        self.inner.start_node(kind);
        self.eat_n_trivias(n_attached_trivias);
    }

    fn finish_node(&mut self) {
        match mem::replace(&mut self.state, State::PendingFinish) {
            State::PendingStart => unreachable!(),
            State::PendingFinish => self.inner.finish_node(),
            State::Normal => (),
        }
    }

    fn error(&mut self, error: ParseError) {
        self.inner.error(error, self.text_pos)
    }
}

impl<'a> TextTreeSink<'a> {
    pub(super) fn new(text: &'a str, tokens: &'a [Token]) -> Self {
        Self {
            text,
            tokens,
            text_pos: 0.into(),
            token_pos: 0,
            state: State::PendingStart,
            inner: SyntaxTreeBuilder::default(),
        }
    }

    pub(super) fn finish(mut self) -> (GreenNode, Vec<SyntaxError>) {
        match mem::replace(&mut self.state, State::Normal) {
            State::PendingFinish => {
                self.eat_trivias();
                self.inner.finish_node()
            }
            State::PendingStart | State::Normal => unreachable!(),
        }

        self.inner.finish_raw()
    }

    fn eat_trivias(&mut self) {
        while let Some(&token) = self.tokens.get(self.token_pos) {
            if !token.kind.is_trivia() {
                break;
            }
            self.do_token(token.kind, token.len, 1);
        }
    }

    fn eat_n_trivias(&mut self, n: usize) {
        for _ in 0..n {
            let token = self.tokens[self.token_pos];
            assert!(token.kind.is_trivia());
            self.do_token(token.kind, token.len, 1);
        }
    }

    fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) {
        let range = TextRange::at(self.text_pos, len);
        let text: SmolStr = self.text[range].into();
        self.text_pos += len;
        self.token_pos += n_tokens;
        self.inner.token(kind, text);
    }
}

fn n_attached_trivias<'a>(
    kind: SyntaxKind,
    trivias: impl Iterator<Item = (SyntaxKind, &'a str)>,
) -> usize {
    match kind {
        MACRO_CALL | CONST | TYPE_ALIAS | STRUCT | ENUM | VARIANT | FN | TRAIT | MODULE
        | RECORD_FIELD | STATIC => {
            let mut res = 0;
            let mut trivias = trivias.enumerate().peekable();

            while let Some((i, (kind, text))) = trivias.next() {
                match kind {
                    WHITESPACE => {
                        if text.contains("\n\n") {
                            // we check whether the next token is a doc-comment
                            // and skip the whitespace in this case
                            if let Some((peek_kind, peek_text)) =
                                trivias.peek().map(|(_, pair)| pair)
                            {
                                if *peek_kind == COMMENT
                                    && peek_text.starts_with("///")
                                    && !peek_text.starts_with("////")
                                {
                                    continue;
                                }
                            }
                            break;
                        }
                    }
                    COMMENT => {
                        res = i + 1;
                    }
                    _ => (),
                }
            }
            res
        }
        _ => 0,
    }
}
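
The trivia-attachment heuristic in `n_attached_trivias` is easiest to see on an example. For item kinds like `FN` or `STRUCT`, comments directly above the item end up inside its node, while a blank line breaks the attachment (unless the comment is a `///` doc comment, which stays attached across the intervening whitespace):

    // attached: becomes part of the FN node for `foo`
    fn foo() {}

    // NOT attached to `bar`: the blank line below separates them

    fn bar() {}

This is why refactorings that move an item drag its leading comments along with it.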