author     bors[bot] <26634292+bors[bot]@users.noreply.github.com>  2020-02-18 12:57:26 +0000
committer  GitHub <[email protected]>  2020-02-18 12:57:26 +0000
commit     c447fe9bc06006a7080da782cf67d739c91b534c (patch)
tree       45cbc9578b24437da3eedc6a234784be22b1f38c /crates/ra_syntax/src/parsing
parent     742459c8fe08e359ae380e3e1dc0d059c0b4f871 (diff)
parent     053ccf4121797e4e559e3225d46d3f23cb1ad70b (diff)
Merge #3026
3026: ra_syntax: reshape SyntaxError for the sake of removing redundancy r=matklad a=Veetaha

Followup of #2911; also ticks some boxes on the todo list of #223.

**ATTENTION!** A big part of the diff of this PR consists of test data file changes.

Simplified `SyntaxError`, which was `SyntaxError { kind: { /* big enum */ }, location: Location }`, to `SyntaxError(String, TextRange)`. I am not sure whether the tuple struct is the best fit here; I am inclined to add names to the fields, since I already provide the getters `SyntaxError::message()` and `SyntaxError::range()`. I also removed `Location` altogether ...

This is currently WIP, because the following is not done:

- [ ] ~~Add tests to the `test_data` dir for unescape errors~~ *// I don't know where to put these errors in particular, because they are out of the scope of the lexer and parser. However, I have an idea in mind that we move all the validators we have right now to the parsing stage, but this is up to discussion...* **[UPD]** I came to the conclusion that the tree validation logic, which unescape errors are a part of, should be rethought: we currently have no tests and no place to put tests for tree validations. So I'd like to extract the potential redesign (maybe a move of tree validation to `ra_parser`) and the addition of tests into a separate task.

Co-authored-by: Veetaha <[email protected]>
Co-authored-by: Veetaha <[email protected]>
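For orientation, the reshaped type described above boils down to roughly the following. This is a sketch reconstructed from the PR description and the getters used in the diff below, not the crate's actual definition; the `TextRange` stand-in is a placeholder for `text_unit::TextRange`, which `ra_syntax` really uses.

```rust
/// Stand-in for `text_unit::TextRange` so the sketch is self-contained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TextRange {
    start: u32,
    end: u32,
}

/// The reshaped error: a plain message plus a range, replacing the old
/// `SyntaxError { kind: SyntaxErrorKind, location: Location }`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SyntaxError(String, TextRange);

impl SyntaxError {
    pub fn new(message: impl Into<String>, range: TextRange) -> SyntaxError {
        SyntaxError(message.into(), range)
    }

    pub fn message(&self) -> &str {
        &self.0
    }

    pub fn range(&self) -> TextRange {
        self.1
    }

    /// Reparsing uses this to relocate an error after an edit (see
    /// `merge_errors` in the reparsing.rs diff below).
    pub fn with_range(mut self, range: TextRange) -> SyntaxError {
        self.1 = range;
        self
    }
}
```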
Diffstat (limited to 'crates/ra_syntax/src/parsing')
-rw-r--r--  crates/ra_syntax/src/parsing/lexer.rs      | 92
-rw-r--r--  crates/ra_syntax/src/parsing/reparsing.rs  | 84
2 files changed, 93 insertions, 83 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index f889e6a1d..f2684c852 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -2,7 +2,7 @@
 //! It is just a bridge to `rustc_lexer`.
 
 use crate::{
-    SyntaxError, SyntaxErrorKind,
+    SyntaxError,
     SyntaxKind::{self, *},
     TextRange, TextUnit,
 };
@@ -41,13 +41,13 @@ pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
         let token_len = TextUnit::from_usize(rustc_token.len);
         let token_range = TextRange::offset_len(TextUnit::from_usize(offset), token_len);
 
-        let (syntax_kind, error) =
+        let (syntax_kind, err_message) =
             rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
 
         tokens.push(Token { kind: syntax_kind, len: token_len });
 
-        if let Some(error) = error {
-            errors.push(SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range));
+        if let Some(err_message) = err_message {
+            errors.push(SyntaxError::new(err_message, token_range));
         }
 
         offset += rustc_token.len;
@@ -94,61 +94,21 @@ fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
     }
 
     let rustc_token = rustc_lexer::first_token(text);
-    let (syntax_kind, error) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
+    let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
 
     let token = Token { kind: syntax_kind, len: TextUnit::from_usize(rustc_token.len) };
-    let error = error.map(|error| {
-        SyntaxError::new(
-            SyntaxErrorKind::TokenizeError(error),
-            TextRange::from_to(TextUnit::from(0), TextUnit::of_str(text)),
-        )
-    });
+    let optional_error = err_message.map(|err_message| {
+        SyntaxError::new(err_message, TextRange::from_to(0.into(), TextUnit::of_str(text)))
+    });
 
-    Some((token, error))
+    Some((token, optional_error))
 }
 
-// FIXME: simplify TokenizeError to `SyntaxError(String, TextRange)` as per @matklad advice:
-// https://github.com/rust-analyzer/rust-analyzer/pull/2911/files#r371175067
-
-/// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant.
-/// It describes all the types of errors that may happen during the tokenization
-/// of Rust source.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum TokenizeError {
-    /// Base prefix was provided, but there were no digits
-    /// after it, e.g. `0x`, `0b`.
-    EmptyInt,
-    /// Float exponent lacks digits e.g. `12.34e+`, `12.3E+`, `12e-`, `1_E-`,
-    EmptyExponent,
-
-    /// Block comment lacks trailing delimiter `*/`
-    UnterminatedBlockComment,
-    /// Character literal lacks trailing delimiter `'`
-    UnterminatedChar,
-    /// Characterish byte literal lacks trailing delimiter `'`
-    UnterminatedByte,
-    /// String literal lacks trailing delimiter `"`
-    UnterminatedString,
-    /// Byte string literal lacks trailing delimiter `"`
-    UnterminatedByteString,
-    /// Raw literal lacks trailing delimiter e.g. `"##`
-    UnterminatedRawString,
-    /// Raw byte string literal lacks trailing delimiter e.g. `"##`
-    UnterminatedRawByteString,
-
-    /// Raw string lacks a quote after the pound characters e.g. `r###`
-    UnstartedRawString,
-    /// Raw byte string lacks a quote after the pound characters e.g. `br###`
-    UnstartedRawByteString,
-
-    /// Lifetime starts with a number e.g. `'4ever`
-    LifetimeStartsWithNumber,
-}
-
+/// Returns `SyntaxKind` and an optional tokenize error message.
 fn rustc_token_kind_to_syntax_kind(
     rustc_token_kind: &rustc_lexer::TokenKind,
     token_text: &str,
-) -> (SyntaxKind, Option<TokenizeError>) {
+) -> (SyntaxKind, Option<&'static str>) {
     // A note on an intended tradeoff:
     // We drop some useful infromation here (see patterns with double dots `..`)
     // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
@@ -156,14 +116,15 @@ fn rustc_token_kind_to_syntax_kind(
 
     let syntax_kind = {
         use rustc_lexer::TokenKind as TK;
-        use TokenizeError as TE;
-
         match rustc_token_kind {
             TK::LineComment => COMMENT,
 
             TK::BlockComment { terminated: true } => COMMENT,
             TK::BlockComment { terminated: false } => {
-                return (COMMENT, Some(TE::UnterminatedBlockComment));
+                return (
+                    COMMENT,
+                    Some("Missing trailing `*/` symbols to terminate the block comment"),
+                );
             }
 
             TK::Whitespace => WHITESPACE,
@@ -181,7 +142,7 @@ fn rustc_token_kind_to_syntax_kind(
 
             TK::Lifetime { starts_with_number: false } => LIFETIME,
             TK::Lifetime { starts_with_number: true } => {
-                return (LIFETIME, Some(TE::LifetimeStartsWithNumber))
+                return (LIFETIME, Some("Lifetime name cannot start with a number"))
             }
 
             TK::Semi => SEMI,
@@ -217,57 +178,56 @@ fn rustc_token_kind_to_syntax_kind(
 
     return (syntax_kind, None);
 
-    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<TokenizeError>) {
+    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
         use rustc_lexer::LiteralKind as LK;
-        use TokenizeError as TE;
 
         #[rustfmt::skip]
         let syntax_kind = match *kind {
             LK::Int { empty_int: false, .. } => INT_NUMBER,
             LK::Int { empty_int: true, .. } => {
-                return (INT_NUMBER, Some(TE::EmptyInt))
+                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
             }
 
             LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
             LK::Float { empty_exponent: true, .. } => {
-                return (FLOAT_NUMBER, Some(TE::EmptyExponent))
+                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
            }
 
             LK::Char { terminated: true } => CHAR,
             LK::Char { terminated: false } => {
-                return (CHAR, Some(TE::UnterminatedChar))
+                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
             }
 
             LK::Byte { terminated: true } => BYTE,
             LK::Byte { terminated: false } => {
-                return (BYTE, Some(TE::UnterminatedByte))
+                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
             }
 
             LK::Str { terminated: true } => STRING,
             LK::Str { terminated: false } => {
-                return (STRING, Some(TE::UnterminatedString))
+                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
             }
 
 
             LK::ByteStr { terminated: true } => BYTE_STRING,
             LK::ByteStr { terminated: false } => {
-                return (BYTE_STRING, Some(TE::UnterminatedByteString))
+                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
             }
 
             LK::RawStr { started: true, terminated: true, .. } => RAW_STRING,
             LK::RawStr { started: true, terminated: false, .. } => {
-                return (RAW_STRING, Some(TE::UnterminatedRawString))
+                return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))
             }
             LK::RawStr { started: false, .. } => {
-                return (RAW_STRING, Some(TE::UnstartedRawString))
+                return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal"))
             }
 
             LK::RawByteStr { started: true, terminated: true, .. } => RAW_BYTE_STRING,
             LK::RawByteStr { started: true, terminated: false, .. } => {
-                return (RAW_BYTE_STRING, Some(TE::UnterminatedRawByteString))
+                return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))
             }
             LK::RawByteStr { started: false, .. } => {
-                return (RAW_BYTE_STRING, Some(TE::UnstartedRawByteString))
+                return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal"))
             }
         };
 
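With `TokenizeError` gone, a lexer error now surfaces directly as a `&'static str` message, wrapped into a `SyntaxError` at the call site. A hedged usage sketch of the `tokenize` entry point from the first hunk above; `report_lex_errors` is a hypothetical caller, and since the `parsing` module is internal to `ra_syntax`, external code would not call this directly:

```rust
// Hypothetical caller of the `tokenize` function shown in the diff above.
fn report_lex_errors(text: &str) {
    let (tokens, errors) = tokenize(text);
    println!("lexed {} tokens", tokens.len());
    for err in errors {
        // For input like `"abc` this prints the message literal from the diff:
        //   Missing trailing `"` symbol to terminate the string literal
        println!("error at {:?}: {}", err.range(), err.message());
    }
}
```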
diff --git a/crates/ra_syntax/src/parsing/reparsing.rs b/crates/ra_syntax/src/parsing/reparsing.rs
index a86da0675..aad70d015 100644
--- a/crates/ra_syntax/src/parsing/reparsing.rs
+++ b/crates/ra_syntax/src/parsing/reparsing.rs
@@ -27,8 +27,8 @@ pub(crate) fn incremental_reparse(
     edit: &AtomTextEdit,
     errors: Vec<SyntaxError>,
 ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
-    if let Some((green, old_range)) = reparse_token(node, &edit) {
-        return Some((green, merge_errors(errors, Vec::new(), old_range, edit), old_range));
+    if let Some((green, new_errors, old_range)) = reparse_token(node, &edit) {
+        return Some((green, merge_errors(errors, new_errors, old_range, edit), old_range));
     }
 
     if let Some((green, new_errors, old_range)) = reparse_block(node, &edit) {
@@ -40,7 +40,7 @@ pub(crate) fn incremental_reparse(
 fn reparse_token<'node>(
     root: &'node SyntaxNode,
     edit: &AtomTextEdit,
-) -> Option<(GreenNode, TextRange)> {
+) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
     let prev_token = algo::find_covering_element(root, edit.delete).as_token()?.clone();
     let prev_token_kind = prev_token.kind();
     match prev_token_kind {
@@ -54,7 +54,7 @@ fn reparse_token<'node>(
             }
 
             let mut new_text = get_text_after_edit(prev_token.clone().into(), &edit);
-            let (new_token_kind, _error) = lex_single_syntax_kind(&new_text)?;
+            let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?;
 
             if new_token_kind != prev_token_kind
                 || (new_token_kind == IDENT && is_contextual_kw(&new_text))
@@ -76,7 +76,11 @@ fn reparse_token<'node>(
 
             let new_token =
                 GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), new_text.into());
-            Some((prev_token.replace_with(new_token), prev_token.text_range()))
+            Some((
+                prev_token.replace_with(new_token),
+                new_err.into_iter().collect(),
+                prev_token.text_range(),
+            ))
         }
         _ => None,
     }
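A note on the `new_err.into_iter().collect()` above: `lex_single_syntax_kind` yields at most one error as an `Option`, and `Option<T>`'s `IntoIterator` produces zero or one items, which collects straight into the `Vec<SyntaxError>` slot of the new return type. A standalone illustration of the idiom, with a plain `&str` standing in for `SyntaxError`:

```rust
fn main() {
    // `Option::into_iter` yields zero or one items, so `collect` builds
    // an empty or single-element Vec: exactly what `reparse_token` needs
    // to fill the `Vec<SyntaxError>` slot in its return type.
    let err: Option<&str> = Some("Missing trailing `'` symbol to terminate the character literal");
    let errs: Vec<&str> = err.into_iter().collect();
    assert_eq!(errs.len(), 1);

    let no_err: Option<&str> = None;
    let errs: Vec<&str> = no_err.into_iter().collect();
    assert!(errs.is_empty());
}
```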
@@ -87,7 +91,7 @@ fn reparse_block<'node>(
     edit: &AtomTextEdit,
 ) -> Option<(GreenNode, Vec<SyntaxError>, TextRange)> {
     let (node, reparser) = find_reparsable_node(root, edit.delete)?;
-    let text = get_text_after_edit(node.clone().into(), &edit);
+    let text = get_text_after_edit(node.clone().into(), edit);
 
     let (tokens, new_lexer_errors) = tokenize(&text);
     if !is_balanced(&tokens) {
@@ -162,20 +166,27 @@ fn is_balanced(tokens: &[Token]) -> bool {
 fn merge_errors(
     old_errors: Vec<SyntaxError>,
     new_errors: Vec<SyntaxError>,
-    old_range: TextRange,
+    range_before_reparse: TextRange,
     edit: &AtomTextEdit,
 ) -> Vec<SyntaxError> {
     let mut res = Vec::new();
-    for e in old_errors {
-        if e.offset() <= old_range.start() {
-            res.push(e)
-        } else if e.offset() >= old_range.end() {
-            res.push(e.add_offset(TextUnit::of_str(&edit.insert), edit.delete.len()));
+
+    for old_err in old_errors {
+        let old_err_range = old_err.range();
+        // FIXME: make sure that .start() was here previously by a mistake
+        if old_err_range.end() <= range_before_reparse.start() {
+            res.push(old_err);
+        } else if old_err_range.start() >= range_before_reparse.end() {
+            let inserted_len = TextUnit::of_str(&edit.insert);
+            res.push(old_err.with_range((old_err_range + inserted_len) - edit.delete.len()));
+            // Note: extra parens are intentional to prevent uint underflow, HWAB (here was a bug)
         }
     }
-    for e in new_errors {
-        res.push(e.add_offset(old_range.start(), 0.into()));
-    }
+    res.extend(new_errors.into_iter().map(|new_err| {
+        // fighting borrow checker with a variable ;)
+        let offseted_range = new_err.range() + range_before_reparse.start();
+        new_err.with_range(offseted_range)
+    }));
     res
 }
 
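The `// Note: extra parens` comment in this hunk is about unsigned `TextUnit`/`TextRange` arithmetic: the subtraction has to come last so that no intermediate value dips below zero. A standalone illustration with plain `u32` values; the numbers are hypothetical and just show why the grouping matters:

```rust
fn main() {
    // Hypothetical offsets: an old error start of 3, after an edit that
    // inserted 10 characters and deleted 7.
    let old_err_start: u32 = 3;
    let inserted_len: u32 = 10;
    let deleted_len: u32 = 7;

    // Grouping as in the diff: add first, then subtract. 3 + 10 - 7 = 6.
    let shifted = (old_err_start + inserted_len) - deleted_len;
    assert_eq!(shifted, 6);

    // The other grouping would compute 3 - 7 first and underflow
    // (a panic in debug builds), which is presumably the bug the
    // "HWAB (here was a bug)" comment alludes to:
    // let shifted = (old_err_start - deleted_len) + inserted_len;
}
```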
@@ -193,9 +204,9 @@ mod tests {
 
         let fully_reparsed = SourceFile::parse(&after);
         let incrementally_reparsed: Parse<SourceFile> = {
-            let f = SourceFile::parse(&before);
+            let before = SourceFile::parse(&before);
             let (green, new_errors, range) =
-                incremental_reparse(f.tree().syntax(), &edit, f.errors.to_vec()).unwrap();
+                incremental_reparse(before.tree().syntax(), &edit, before.errors.to_vec()).unwrap();
             assert_eq!(range.len(), reparsed_len.into(), "reparsed fragment has wrong length");
             Parse::new(green, new_errors)
         };
@@ -204,6 +215,7 @@ mod tests {
             &format!("{:#?}", fully_reparsed.tree().syntax()),
             &format!("{:#?}", incrementally_reparsed.tree().syntax()),
         );
+        assert_eq!(fully_reparsed.errors(), incrementally_reparsed.errors());
     }
 
     #[test] // FIXME: some test here actually test token reparsing
@@ -402,4 +414,42 @@ enum Foo {
             4,
         );
     }
+
+    #[test]
+    fn reparse_str_token_with_error_unchanged() {
+        do_check(r#""<|>Unclosed<|> string literal"#, "Still unclosed", 24);
+    }
+
+    #[test]
+    fn reparse_str_token_with_error_fixed() {
+        do_check(r#""unterinated<|><|>"#, "\"", 12);
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_unchanged() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            "23",
+            105,
+        )
+    }
+
+    #[test]
+    fn reparse_block_with_error_in_middle_fixed() {
+        do_check(
+            r#"fn main() {
+                if {}
+                32 + 4<|><|>
+                return
+                if {}
+            }"#,
+            ";",
+            105,
+        )
+    }
 }
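In the new tests, the `<|>` marker pair delimits the text range the edit deletes, the second `do_check` argument is the replacement text, and the third is the expected length of the reparsed fragment (per the `assert_eq!` on `range.len()` earlier in this file). A hedged sketch of how such markers can be extracted; `extract_edit_range` is illustrative only, and the project's real `do_check` helper (defined earlier in this test module, outside the diff) works differently in detail:

```rust
// Illustrative marker extraction: find the two `<|>` markers, strip them,
// and return the deleted range as offsets into the marker-free text.
fn extract_edit_range(before_with_markers: &str) -> (String, (usize, usize)) {
    const MARKER: &str = "<|>";
    let start = before_with_markers.find(MARKER).expect("first `<|>` marker");
    let rest = &before_with_markers[start + MARKER.len()..];
    let end = start + rest.find(MARKER).expect("second `<|>` marker");
    let text = before_with_markers.replace(MARKER, "");
    (text, (start, end))
}

fn main() {
    let (text, (start, end)) = extract_edit_range(r#""<|>Unclosed<|> string literal"#);
    assert_eq!(&text[start..end], "Unclosed");
    assert_eq!(text, r#""Unclosed string literal"#);
}
```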