aboutsummaryrefslogtreecommitdiff
path: root/crates/syntax/src/parsing
diff options
context:
space:
mode:
authorZac Pullar-Strecker <[email protected]>2020-08-24 10:19:53 +0100
committerZac Pullar-Strecker <[email protected]>2020-08-24 10:20:13 +0100
commit7bbca7a1b3f9293d2f5cc5745199bc5f8396f2f0 (patch)
treebdb47765991cb973b2cd5481a088fac636bd326c /crates/syntax/src/parsing
parentca464650eeaca6195891199a93f4f76cf3e7e697 (diff)
parente65d48d1fb3d4d91d9dc1148a7a836ff5c9a3c87 (diff)
Merge remote-tracking branch 'upstream/master' into 503-hover-doc-links
Diffstat (limited to 'crates/syntax/src/parsing')
-rw-r--r--crates/syntax/src/parsing/lexer.rs244
-rw-r--r--crates/syntax/src/parsing/reparsing.rs455
-rw-r--r--crates/syntax/src/parsing/text_token_source.rs84
-rw-r--r--crates/syntax/src/parsing/text_tree_sink.rs183
4 files changed, 966 insertions, 0 deletions
diff --git a/crates/syntax/src/parsing/lexer.rs b/crates/syntax/src/parsing/lexer.rs
new file mode 100644
index 000000000..fa3be1016
--- /dev/null
+++ b/crates/syntax/src/parsing/lexer.rs
@@ -0,0 +1,244 @@
1//! Lexer analyzes raw input string and produces lexemes (tokens).
2//! It is just a bridge to `rustc_lexer`.
3
4use rustc_lexer::{LiteralKind as LK, RawStrError};
5
6use std::convert::TryInto;
7
8use crate::{
9 SyntaxError,
10 SyntaxKind::{self, *},
11 TextRange, TextSize, T,
12};
13
14/// A token of Rust source.
15#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
16pub struct Token {
17 /// The kind of token.
18 pub kind: SyntaxKind,
19 /// The length of the token.
20 pub len: TextSize,
21}
22
23/// Break a string up into its component tokens.
24/// Beware that it checks for shebang first and its length contributes to resulting
25/// tokens offsets.
26pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
27 // non-empty string is a precondtion of `rustc_lexer::strip_shebang()`.
28 if text.is_empty() {
29 return Default::default();
30 }
31
32 let mut tokens = Vec::new();
33 let mut errors = Vec::new();
34
35 let mut offset = match rustc_lexer::strip_shebang(text) {
36 Some(shebang_len) => {
37 tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() });
38 shebang_len
39 }
40 None => 0,
41 };
42
43 let text_without_shebang = &text[offset..];
44
45 for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
46 let token_len: TextSize = rustc_token.len.try_into().unwrap();
47 let token_range = TextRange::at(offset.try_into().unwrap(), token_len);
48
49 let (syntax_kind, err_message) =
50 rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
51
52 tokens.push(Token { kind: syntax_kind, len: token_len });
53
54 if let Some(err_message) = err_message {
55 errors.push(SyntaxError::new(err_message, token_range));
56 }
57
58 offset += rustc_token.len;
59 }
60
61 (tokens, errors)
62}
63
64/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
65/// encountered at the beginning of the string.
66///
67/// Returns `None` if the string contains zero *or two or more* tokens.
68/// The token is malformed if the returned error is not `None`.
69///
70/// Beware that unescape errors are not checked at tokenization time.
71pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
72 lex_first_token(text)
73 .filter(|(token, _)| token.len == TextSize::of(text))
74 .map(|(token, error)| (token.kind, error))
75}
76
77/// The same as `lex_single_syntax_kind()` but returns only `SyntaxKind` and
78/// returns `None` if any tokenization error occured.
79///
80/// Beware that unescape errors are not checked at tokenization time.
81pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> {
82 lex_first_token(text)
83 .filter(|(token, error)| !error.is_some() && token.len == TextSize::of(text))
84 .map(|(token, _error)| token.kind)
85}
86
87/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
88/// encountered at the beginning of the string.
89///
90/// Returns `None` if the string contains zero tokens or if the token was parsed
91/// with an error.
92/// The token is malformed if the returned error is not `None`.
93///
94/// Beware that unescape errors are not checked at tokenization time.
95fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
96 // non-empty string is a precondtion of `rustc_lexer::first_token()`.
97 if text.is_empty() {
98 return None;
99 }
100
101 let rustc_token = rustc_lexer::first_token(text);
102 let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
103
104 let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() };
105 let optional_error = err_message
106 .map(|err_message| SyntaxError::new(err_message, TextRange::up_to(TextSize::of(text))));
107
108 Some((token, optional_error))
109}
110
111/// Returns `SyntaxKind` and an optional tokenize error message.
112fn rustc_token_kind_to_syntax_kind(
113 rustc_token_kind: &rustc_lexer::TokenKind,
114 token_text: &str,
115) -> (SyntaxKind, Option<&'static str>) {
116 // A note on an intended tradeoff:
117 // We drop some useful infromation here (see patterns with double dots `..`)
118 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
119 // being `u16` that come from `rowan::SyntaxKind`.
120
121 let syntax_kind = {
122 match rustc_token_kind {
123 rustc_lexer::TokenKind::LineComment => COMMENT,
124
125 rustc_lexer::TokenKind::BlockComment { terminated: true } => COMMENT,
126 rustc_lexer::TokenKind::BlockComment { terminated: false } => {
127 return (
128 COMMENT,
129 Some("Missing trailing `*/` symbols to terminate the block comment"),
130 );
131 }
132
133 rustc_lexer::TokenKind::Whitespace => WHITESPACE,
134
135 rustc_lexer::TokenKind::Ident => {
136 if token_text == "_" {
137 UNDERSCORE
138 } else {
139 SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
140 }
141 }
142
143 rustc_lexer::TokenKind::RawIdent => IDENT,
144 rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(&kind),
145
146 rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME,
147 rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => {
148 return (LIFETIME, Some("Lifetime name cannot start with a number"))
149 }
150
151 rustc_lexer::TokenKind::Semi => T![;],
152 rustc_lexer::TokenKind::Comma => T![,],
153 rustc_lexer::TokenKind::Dot => T![.],
154 rustc_lexer::TokenKind::OpenParen => T!['('],
155 rustc_lexer::TokenKind::CloseParen => T![')'],
156 rustc_lexer::TokenKind::OpenBrace => T!['{'],
157 rustc_lexer::TokenKind::CloseBrace => T!['}'],
158 rustc_lexer::TokenKind::OpenBracket => T!['['],
159 rustc_lexer::TokenKind::CloseBracket => T![']'],
160 rustc_lexer::TokenKind::At => T![@],
161 rustc_lexer::TokenKind::Pound => T![#],
162 rustc_lexer::TokenKind::Tilde => T![~],
163 rustc_lexer::TokenKind::Question => T![?],
164 rustc_lexer::TokenKind::Colon => T![:],
165 rustc_lexer::TokenKind::Dollar => T![$],
166 rustc_lexer::TokenKind::Eq => T![=],
167 rustc_lexer::TokenKind::Not => T![!],
168 rustc_lexer::TokenKind::Lt => T![<],
169 rustc_lexer::TokenKind::Gt => T![>],
170 rustc_lexer::TokenKind::Minus => T![-],
171 rustc_lexer::TokenKind::And => T![&],
172 rustc_lexer::TokenKind::Or => T![|],
173 rustc_lexer::TokenKind::Plus => T![+],
174 rustc_lexer::TokenKind::Star => T![*],
175 rustc_lexer::TokenKind::Slash => T![/],
176 rustc_lexer::TokenKind::Caret => T![^],
177 rustc_lexer::TokenKind::Percent => T![%],
178 rustc_lexer::TokenKind::Unknown => ERROR,
179 }
180 };
181
182 return (syntax_kind, None);
183
184 fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
185 #[rustfmt::skip]
186 let syntax_kind = match *kind {
187 LK::Int { empty_int: false, .. } => INT_NUMBER,
188 LK::Int { empty_int: true, .. } => {
189 return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
190 }
191
192 LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
193 LK::Float { empty_exponent: true, .. } => {
194 return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
195 }
196
197 LK::Char { terminated: true } => CHAR,
198 LK::Char { terminated: false } => {
199 return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
200 }
201
202 LK::Byte { terminated: true } => BYTE,
203 LK::Byte { terminated: false } => {
204 return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
205 }
206
207 LK::Str { terminated: true } => STRING,
208 LK::Str { terminated: false } => {
209 return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
210 }
211
212
213 LK::ByteStr { terminated: true } => BYTE_STRING,
214 LK::ByteStr { terminated: false } => {
215 return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
216 }
217
218 LK::RawStr { err, .. } => match err {
219 None => RAW_STRING,
220 Some(RawStrError::InvalidStarter { .. }) => return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal")),
221 Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
222 return (RAW_STRING, Some("Missing trailing `\"` to terminate the raw string literal"))
223 } else {
224 return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))
225
226 },
227 Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_STRING, Some("Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols")),
228 },
229 LK::RawByteStr { err, .. } => match err {
230 None => RAW_BYTE_STRING,
231 Some(RawStrError::InvalidStarter { .. }) => return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal")),
232 Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
233 return (RAW_BYTE_STRING, Some("Missing trailing `\"` to terminate the raw byte string literal"))
234 } else {
235 return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))
236
237 },
238 Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_BYTE_STRING, Some("Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols")),
239 },
240 };
241
242 (syntax_kind, None)
243 }
244}
diff --git a/crates/syntax/src/parsing/reparsing.rs b/crates/syntax/src/parsing/reparsing.rs
new file mode 100644
index 000000000..4149f856a
--- /dev/null
+++ b/crates/syntax/src/parsing/reparsing.rs
@@ -0,0 +1,455 @@
1//! Implementation of incremental re-parsing.
2//!
3//! We use two simple strategies for this:
4//! - if the edit modifies only a single token (like changing an identifier's
5//! letter), we replace only this token.
6//! - otherwise, we search for the nearest `{}` block which contains the edit
7//! and try to parse only this block.
8
9use parser::Reparser;
10use text_edit::Indel;
11
12use crate::{
13 algo,
14 parsing::{
15 lexer::{lex_single_syntax_kind, tokenize, Token},
16 text_token_source::TextTokenSource,
17 text_tree_sink::TextTreeSink,
18 },
19 syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode},
20 SyntaxError,
21 SyntaxKind::*,
22 TextRange, TextSize, T,
23};
24
25pub(crate) fn incremental_reparse(
26 node: &SyntaxNode,
27 edit: &Indel,
28 errors: Vec<SyntaxError>,
29) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
30 if let Some((green, new_errors, old_range)) = reparse_token(node, &edit) {
31 return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
32 }
33
34 if let Some((green, new_errors, old_range)) = reparse_block(node, &edit) {
35 return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
36 }
37 None
38}
39
40fn reparse_token<'node>(
41 root: &'node SyntaxNode,
42 edit: &Indel,
43) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
44 let prev_token = algo::find_covering_element(root, edit.delete).as_token()?.clone();
45 let prev_token_kind = prev_token.kind();
46 match prev_token_kind {
47 WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => {
48 if prev_token_kind == WHITESPACE || prev_token_kind == COMMENT {
49 // removing a new line may extends previous token
50 let deleted_range = edit.delete - prev_token.text_range().start();
51 if prev_token.text()[deleted_range].contains('\n') {
52 return None;
53 }
54 }
55
56 let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit);
57 let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;
58
59 if new_token_kind != prev_token_kind
60 || (new_token_kind == IDENT && is_contextual_kw(&new_text))
61 {
62 return None;
63 }
64
65 // Check that edited token is not a part of the bigger token.
66 // E.g. if for source code `bruh"str"` the user removed `ruh`, then
67 // `b` no longer remains an identifier, but becomes a part of byte string literal
68 if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) {
69 new_text.push(next_char);
70 let token_with_next_char = lex_single_syntax_kind(&new_text);
71 if let Some((_kind, _error)) = token_with_next_char {
72 return None;
73 }
74 new_text.pop();
75 }
76
77 let new_token =
78 GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), new_text.into());
79 Some((
80 prev_token.replace_with(new_token),
81 new_err.into_iter().collect(),
82 prev_token.text_range(),
83 ))
84 }
85 _ => None,
86 }
87}
88
89fn reparse_block<'node>(
90 root: &'node SyntaxNode,
91 edit: &Indel,
92) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
93 let (node, reparser) = find_reparsable_node(root, edit.delete)?;
94 let text = get_text_after_edit(node.clone().into(), edit);
95
96 let (tokens, new_lexer_errors) = tokenize(&text);
97 if !is_balanced(&tokens) {
98 return None;
99 }
100
101 let mut token_source = TextTokenSource::new(&text, &tokens);
102 let mut tree_sink = TextTreeSink::new(&text, &tokens);
103 reparser.parse(&mut token_source, &mut tree_sink);
104
105 let (green, mut new_parser_errors) = tree_sink.finish();
106 new_parser_errors.extend(new_lexer_errors);
107
108 Some((node.replace_with(green), new_parser_errors, node.text_range()))
109}
110
111fn get_text_after_edit(element: SyntaxElement, edit: &Indel) -> String {
112 let edit = Indel::replace(edit.delete - element.text_range().start(), edit.insert.clone());
113
114 let mut text = match element {
115 NodeOrToken::Token(token) => token.text().to_string(),
116 NodeOrToken::Node(node) => node.text().to_string(),
117 };
118 edit.apply(&mut text);
119 text
120}
121
122fn is_contextual_kw(text: &str) -> bool {
123 matches!(text, "auto" | "default" | "union")
124}
125
126fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNode, Reparser)> {
127 let node = algo::find_covering_element(node, range);
128
129 let mut ancestors = match node {
130 NodeOrToken::Token(it) => it.parent().ancestors(),
131 NodeOrToken::Node(it) => it.ancestors(),
132 };
133 ancestors.find_map(|node| {
134 let first_child = node.first_child_or_token().map(|it| it.kind());
135 let parent = node.parent().map(|it| it.kind());
136 Reparser::for_node(node.kind(), first_child, parent).map(|r| (node, r))
137 })
138}
139
140fn is_balanced(tokens: &[Token]) -> bool {
141 if tokens.is_empty()
142 || tokens.first().unwrap().kind != T!['{']
143 || tokens.last().unwrap().kind != T!['}']
144 {
145 return false;
146 }
147 let mut balance = 0usize;
148 for t in &tokens[1..tokens.len() - 1] {
149 match t.kind {
150 T!['{'] => balance += 1,
151 T!['}'] => {
152 balance = match balance.checked_sub(1) {
153 Some(b) => b,
154 None => return false,
155 }
156 }
157 _ => (),
158 }
159 }
160 balance == 0
161}
162
163fn merge_errors(
164 old_errors: Vec<SyntaxError>,
165 new_errors: Vec<SyntaxError>,
166 range_before_reparse: TextRange,
167 edit: &Indel,
168) -> Vec<SyntaxError> {
169 let mut res = Vec::new();
170
171 for old_err in old_errors {
172 let old_err_range = old_err.range();
173 if old_err_range.end() <= range_before_reparse.start() {
174 res.push(old_err);
175 } else if old_err_range.start() >= range_before_reparse.end() {
176 let inserted_len = TextSize::of(&edit.insert);
177 res.push(old_err.with_range((old_err_range + inserted_len) - edit.delete.len()));
178 // Note: extra parens are intentional to prevent uint underflow, HWAB (here was a bug)
179 }
180 }
181 res.extend(new_errors.into_iter().map(|new_err| {
182 // fighting borrow checker with a variable ;)
183 let offseted_range = new_err.range() + range_before_reparse.start();
184 new_err.with_range(offseted_range)
185 }));
186 res
187}
188
189#[cfg(test)]
190mod tests {
191 use test_utils::{assert_eq_text, extract_range};
192
193 use super::*;
194 use crate::{AstNode, Parse, SourceFile};
195
196 fn do_check(before: &str, replace_with: &str, reparsed_len: u32) {
197 let (range, before) = extract_range(before);
198 let edit = Indel::replace(range, replace_with.to_owned());
199 let after = {
200 let mut after = before.clone();
201 edit.apply(&mut after);
202 after
203 };
204
205 let fully_reparsed = SourceFile::parse(&after);
206 let incrementally_reparsed: Parse<SourceFile> = {
207 let before = SourceFile::parse(&before);
208 let (green, new_errors, range) =
209 incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec()).unwrap();
210 assert_eq!(range.len(), reparsed_len.into(), "reparsed fragment has wrong length");
211 Parse::new(green, new_errors)
212 };
213
214 assert_eq_text!(
215 &format!("{:#?}", fully_reparsed.tree().syntax()),
216 &format!("{:#?}", incrementally_reparsed.tree().syntax()),
217 );
218 assert_eq!(fully_reparsed.errors(), incrementally_reparsed.errors());
219 }
220
221 #[test] // FIXME: some test here actually test token reparsing
222 fn reparse_block_tests() {
223 do_check(
224 r"
225fn foo() {
226 let x = foo + <|>bar<|>
227}
228",
229 "baz",
230 3,
231 );
232 do_check(
233 r"
234fn foo() {
235 let x = foo<|> + bar<|>
236}
237",
238 "baz",
239 25,
240 );
241 do_check(
242 r"
243struct Foo {
244 f: foo<|><|>
245}
246",
247 ",\n g: (),",
248 14,
249 );
250 do_check(
251 r"
252fn foo {
253 let;
254 1 + 1;
255 <|>92<|>;
256}
257",
258 "62",
259 31, // FIXME: reparse only int literal here
260 );
261 do_check(
262 r"
263mod foo {
264 fn <|><|>
265}
266",
267 "bar",
268 11,
269 );
270
271 do_check(
272 r"
273trait Foo {
274 type <|>Foo<|>;
275}
276",
277 "Output",
278 3,
279 );
280 do_check(
281 r"
282impl IntoIterator<Item=i32> for Foo {
283 f<|><|>
284}
285",
286 "n next(",
287 9,
288 );
289 do_check(r"use a::b::{foo,<|>,bar<|>};", "baz", 10);
290 do_check(
291 r"
292pub enum A {
293 Foo<|><|>
294}
295",
296 "\nBar;\n",
297 11,
298 );
299 do_check(
300 r"
301foo!{a, b<|><|> d}
302",
303 ", c[3]",
304 8,
305 );
306 do_check(
307 r"
308fn foo() {
309 vec![<|><|>]
310}
311",
312 "123",
313 14,
314 );
315 do_check(
316 r"
317extern {
318 fn<|>;<|>
319}
320",
321 " exit(code: c_int)",
322 11,
323 );
324 }
325
326 #[test]
327 fn reparse_token_tests() {
328 do_check(
329 r"<|><|>
330fn foo() -> i32 { 1 }
331",
332 "\n\n\n \n",
333 1,
334 );
335 do_check(
336 r"
337fn foo() -> <|><|> {}
338",
339 " \n",
340 2,
341 );
342 do_check(
343 r"
344fn <|>foo<|>() -> i32 { 1 }
345",
346 "bar",
347 3,
348 );
349 do_check(
350 r"
351fn foo<|><|>foo() { }
352",
353 "bar",
354 6,
355 );
356 do_check(
357 r"
358fn foo /* <|><|> */ () {}
359",
360 "some comment",
361 6,
362 );
363 do_check(
364 r"
365fn baz <|><|> () {}
366",
367 " \t\t\n\n",
368 2,
369 );
370 do_check(
371 r"
372fn baz <|><|> () {}
373",
374 " \t\t\n\n",
375 2,
376 );
377 do_check(
378 r"
379/// foo <|><|>omment
380mod { }
381",
382 "c",
383 14,
384 );
385 do_check(
386 r#"
387fn -> &str { "Hello<|><|>" }
388"#,
389 ", world",
390 7,
391 );
392 do_check(
393 r#"
394fn -> &str { // "Hello<|><|>"
395"#,
396 ", world",
397 10,
398 );
399 do_check(
400 r##"
401fn -> &str { r#"Hello<|><|>"#
402"##,
403 ", world",
404 10,
405 );
406 do_check(
407 r"
408#[derive(<|>Copy<|>)]
409enum Foo {
410
411}
412",
413 "Clone",
414 4,
415 );
416 }
417
418 #[test]
419 fn reparse_str_token_with_error_unchanged() {
420 do_check(r#""<|>Unclosed<|> string literal"#, "Still unclosed", 24);
421 }
422
423 #[test]
424 fn reparse_str_token_with_error_fixed() {
425 do_check(r#""unterinated<|><|>"#, "\"", 12);
426 }
427
428 #[test]
429 fn reparse_block_with_error_in_middle_unchanged() {
430 do_check(
431 r#"fn main() {
432 if {}
433 32 + 4<|><|>
434 return
435 if {}
436 }"#,
437 "23",
438 105,
439 )
440 }
441
442 #[test]
443 fn reparse_block_with_error_in_middle_fixed() {
444 do_check(
445 r#"fn main() {
446 if {}
447 32 + 4<|><|>
448 return
449 if {}
450 }"#,
451 ";",
452 105,
453 )
454 }
455}
diff --git a/crates/syntax/src/parsing/text_token_source.rs b/crates/syntax/src/parsing/text_token_source.rs
new file mode 100644
index 000000000..df866dc2b
--- /dev/null
+++ b/crates/syntax/src/parsing/text_token_source.rs
@@ -0,0 +1,84 @@
1//! See `TextTokenSource` docs.
2
3use parser::TokenSource;
4
5use crate::{parsing::lexer::Token, SyntaxKind::EOF, TextRange, TextSize};
6
7/// Implementation of `parser::TokenSource` that takes tokens from source code text.
8pub(crate) struct TextTokenSource<'t> {
9 text: &'t str,
10 /// token and its start position (non-whitespace/comment tokens)
11 /// ```non-rust
12 /// struct Foo;
13 /// ^------^--^-
14 /// | | \________
15 /// | \____ \
16 /// | \ |
17 /// (struct, 0) (Foo, 7) (;, 10)
18 /// ```
19 /// `[(struct, 0), (Foo, 7), (;, 10)]`
20 token_offset_pairs: Vec<(Token, TextSize)>,
21
22 /// Current token and position
23 curr: (parser::Token, usize),
24}
25
26impl<'t> TokenSource for TextTokenSource<'t> {
27 fn current(&self) -> parser::Token {
28 self.curr.0
29 }
30
31 fn lookahead_nth(&self, n: usize) -> parser::Token {
32 mk_token(self.curr.1 + n, &self.token_offset_pairs)
33 }
34
35 fn bump(&mut self) {
36 if self.curr.0.kind == EOF {
37 return;
38 }
39
40 let pos = self.curr.1 + 1;
41 self.curr = (mk_token(pos, &self.token_offset_pairs), pos);
42 }
43
44 fn is_keyword(&self, kw: &str) -> bool {
45 self.token_offset_pairs
46 .get(self.curr.1)
47 .map(|(token, offset)| &self.text[TextRange::at(*offset, token.len)] == kw)
48 .unwrap_or(false)
49 }
50}
51
52fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token {
53 let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) {
54 Some((token, offset)) => (
55 token.kind,
56 token_offset_pairs
57 .get(pos + 1)
58 .map(|(_, next_offset)| offset + token.len == *next_offset)
59 .unwrap_or(false),
60 ),
61 None => (EOF, false),
62 };
63 parser::Token { kind, is_jointed_to_next }
64}
65
66impl<'t> TextTokenSource<'t> {
67 /// Generate input from tokens(expect comment and whitespace).
68 pub fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> {
69 let token_offset_pairs: Vec<_> = raw_tokens
70 .iter()
71 .filter_map({
72 let mut len = 0.into();
73 move |token| {
74 let pair = if token.kind.is_trivia() { None } else { Some((*token, len)) };
75 len += token.len;
76 pair
77 }
78 })
79 .collect();
80
81 let first = mk_token(0, &token_offset_pairs);
82 TextTokenSource { text, token_offset_pairs, curr: (first, 0) }
83 }
84}
diff --git a/crates/syntax/src/parsing/text_tree_sink.rs b/crates/syntax/src/parsing/text_tree_sink.rs
new file mode 100644
index 000000000..c1b5f246d
--- /dev/null
+++ b/crates/syntax/src/parsing/text_tree_sink.rs
@@ -0,0 +1,183 @@
1//! FIXME: write short doc here
2
3use std::mem;
4
5use parser::{ParseError, TreeSink};
6
7use crate::{
8 parsing::Token,
9 syntax_node::GreenNode,
10 SmolStr, SyntaxError,
11 SyntaxKind::{self, *},
12 SyntaxTreeBuilder, TextRange, TextSize,
13};
14
15/// Bridges the parser with our specific syntax tree representation.
16///
17/// `TextTreeSink` also handles attachment of trivia (whitespace) to nodes.
18pub(crate) struct TextTreeSink<'a> {
19 text: &'a str,
20 tokens: &'a [Token],
21 text_pos: TextSize,
22 token_pos: usize,
23 state: State,
24 inner: SyntaxTreeBuilder,
25}
26
27enum State {
28 PendingStart,
29 Normal,
30 PendingFinish,
31}
32
33impl<'a> TreeSink for TextTreeSink<'a> {
34 fn token(&mut self, kind: SyntaxKind, n_tokens: u8) {
35 match mem::replace(&mut self.state, State::Normal) {
36 State::PendingStart => unreachable!(),
37 State::PendingFinish => self.inner.finish_node(),
38 State::Normal => (),
39 }
40 self.eat_trivias();
41 let n_tokens = n_tokens as usize;
42 let len = self.tokens[self.token_pos..self.token_pos + n_tokens]
43 .iter()
44 .map(|it| it.len)
45 .sum::<TextSize>();
46 self.do_token(kind, len, n_tokens);
47 }
48
49 fn start_node(&mut self, kind: SyntaxKind) {
50 match mem::replace(&mut self.state, State::Normal) {
51 State::PendingStart => {
52 self.inner.start_node(kind);
53 // No need to attach trivias to previous node: there is no
54 // previous node.
55 return;
56 }
57 State::PendingFinish => self.inner.finish_node(),
58 State::Normal => (),
59 }
60
61 let n_trivias =
62 self.tokens[self.token_pos..].iter().take_while(|it| it.kind.is_trivia()).count();
63 let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias];
64 let mut trivia_end =
65 self.text_pos + leading_trivias.iter().map(|it| it.len).sum::<TextSize>();
66
67 let n_attached_trivias = {
68 let leading_trivias = leading_trivias.iter().rev().map(|it| {
69 let next_end = trivia_end - it.len;
70 let range = TextRange::new(next_end, trivia_end);
71 trivia_end = next_end;
72 (it.kind, &self.text[range])
73 });
74 n_attached_trivias(kind, leading_trivias)
75 };
76 self.eat_n_trivias(n_trivias - n_attached_trivias);
77 self.inner.start_node(kind);
78 self.eat_n_trivias(n_attached_trivias);
79 }
80
81 fn finish_node(&mut self) {
82 match mem::replace(&mut self.state, State::PendingFinish) {
83 State::PendingStart => unreachable!(),
84 State::PendingFinish => self.inner.finish_node(),
85 State::Normal => (),
86 }
87 }
88
89 fn error(&mut self, error: ParseError) {
90 self.inner.error(error, self.text_pos)
91 }
92}
93
94impl<'a> TextTreeSink<'a> {
95 pub(super) fn new(text: &'a str, tokens: &'a [Token]) -> Self {
96 Self {
97 text,
98 tokens,
99 text_pos: 0.into(),
100 token_pos: 0,
101 state: State::PendingStart,
102 inner: SyntaxTreeBuilder::default(),
103 }
104 }
105
106 pub(super) fn finish(mut self) -> (GreenNode, Vec<SyntaxError>) {
107 match mem::replace(&mut self.state, State::Normal) {
108 State::PendingFinish => {
109 self.eat_trivias();
110 self.inner.finish_node()
111 }
112 State::PendingStart | State::Normal => unreachable!(),
113 }
114
115 self.inner.finish_raw()
116 }
117
118 fn eat_trivias(&mut self) {
119 while let Some(&token) = self.tokens.get(self.token_pos) {
120 if !token.kind.is_trivia() {
121 break;
122 }
123 self.do_token(token.kind, token.len, 1);
124 }
125 }
126
127 fn eat_n_trivias(&mut self, n: usize) {
128 for _ in 0..n {
129 let token = self.tokens[self.token_pos];
130 assert!(token.kind.is_trivia());
131 self.do_token(token.kind, token.len, 1);
132 }
133 }
134
135 fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) {
136 let range = TextRange::at(self.text_pos, len);
137 let text: SmolStr = self.text[range].into();
138 self.text_pos += len;
139 self.token_pos += n_tokens;
140 self.inner.token(kind, text);
141 }
142}
143
144fn n_attached_trivias<'a>(
145 kind: SyntaxKind,
146 trivias: impl Iterator<Item = (SyntaxKind, &'a str)>,
147) -> usize {
148 match kind {
149 MACRO_CALL | CONST | TYPE_ALIAS | STRUCT | ENUM | VARIANT | FN | TRAIT | MODULE
150 | RECORD_FIELD | STATIC => {
151 let mut res = 0;
152 let mut trivias = trivias.enumerate().peekable();
153
154 while let Some((i, (kind, text))) = trivias.next() {
155 match kind {
156 WHITESPACE => {
157 if text.contains("\n\n") {
158 // we check whether the next token is a doc-comment
159 // and skip the whitespace in this case
160 if let Some((peek_kind, peek_text)) =
161 trivias.peek().map(|(_, pair)| pair)
162 {
163 if *peek_kind == COMMENT
164 && peek_text.starts_with("///")
165 && !peek_text.starts_with("////")
166 {
167 continue;
168 }
169 }
170 break;
171 }
172 }
173 COMMENT => {
174 res = i + 1;
175 }
176 _ => (),
177 }
178 }
179 res
180 }
181 _ => 0,
182 }
183}