diff options
author | Veetaha <[email protected]> | 2020-01-24 01:39:23 +0000 |
---|---|---|
committer | Veetaha <[email protected]> | 2020-02-03 22:00:55 +0000 |
commit | ad24976da38482948c586bdbc16004273662ff7e (patch) | |
tree | 9a54ae3eb36b123c82634b557df21e0d47848834 /crates/ra_syntax/src/parsing | |
parent | b090ee5a65f9630146c2842bc51fcfcc8da08da1 (diff) |
ra_syntax: changed added diagnostics information returned from tokenize() (implemented with iterators)
Diffstat (limited to 'crates/ra_syntax/src/parsing')
-rw-r--r-- | crates/ra_syntax/src/parsing/lexer.rs | 299 | ||||
-rw-r--r-- | crates/ra_syntax/src/parsing/reparsing.rs | 3 |
2 files changed, 230 insertions, 72 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs index 6d839208d..9dca7d747 100644 --- a/crates/ra_syntax/src/parsing/lexer.rs +++ b/crates/ra_syntax/src/parsing/lexer.rs | |||
@@ -1,4 +1,6 @@ | |||
1 | //! FIXME: write short doc here | 1 | //! Lexer analyzes raw input string and produces lexemes (tokens). |
2 | |||
3 | use std::iter::{FromIterator, IntoIterator}; | ||
2 | 4 | ||
3 | use crate::{ | 5 | use crate::{ |
4 | SyntaxKind::{self, *}, | 6 | SyntaxKind::{self, *}, |
@@ -13,85 +15,242 @@ pub struct Token { | |||
13 | /// The length of the token. | 15 | /// The length of the token. |
14 | pub len: TextUnit, | 16 | pub len: TextUnit, |
15 | } | 17 | } |
18 | impl Token { | ||
19 | pub const fn new(kind: SyntaxKind, len: TextUnit) -> Self { | ||
20 | Self { kind, len } | ||
21 | } | ||
22 | } | ||
16 | 23 | ||
17 | fn match_literal_kind(kind: rustc_lexer::LiteralKind) -> SyntaxKind { | 24 | #[derive(Debug)] |
18 | match kind { | 25 | /// Represents the result of parsing one token. |
19 | rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, | 26 | pub struct ParsedToken { |
20 | rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, | 27 | /// Parsed token. |
21 | rustc_lexer::LiteralKind::Char { .. } => CHAR, | 28 | pub token: Token, |
22 | rustc_lexer::LiteralKind::Byte { .. } => BYTE, | 29 | /// If error is present then parsed token is malformed. |
23 | rustc_lexer::LiteralKind::Str { .. } => STRING, | 30 | pub error: Option<TokenizeError>, |
24 | rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, | 31 | } |
25 | rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, | 32 | impl ParsedToken { |
26 | rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, | 33 | pub const fn new(token: Token, error: Option<TokenizeError>) -> Self { |
34 | Self { token, error } | ||
27 | } | 35 | } |
28 | } | 36 | } |
29 | 37 | ||
38 | #[derive(Debug, Default)] | ||
39 | /// Represents the accumulated result of parsing a sequence of tokens. | ||
40 | pub struct ParsedTokens { | ||
41 | /// Parsed tokens. | ||
42 | pub tokens: Vec<Token>, | ||
43 | /// Errors collected from the parsed tokens that are malformed. | ||
44 | pub errors: Vec<TokenizeError>, | ||
45 | } | ||
46 | |||
47 | impl FromIterator<ParsedToken> for ParsedTokens { | ||
48 | fn from_iter<I: IntoIterator<Item = ParsedToken>>(iter: I) -> Self { | ||
49 | let res = Self::default(); | ||
50 | for entry in iter { | ||
51 | res.tokens.push(entry.token); | ||
52 | if let Some(error) = entry.error { | ||
53 | res.errors.push(error); | ||
54 | } | ||
55 | } | ||
56 | res | ||
57 | } | ||
58 | } | ||
59 | |||
60 | /// Returns the first encountered token from the string. | ||
61 | /// If the string contains zero tokens, or two or more tokens, returns `None`. | ||
62 | pub fn single_token(text: &str) -> Option<ParsedToken> { | ||
63 | // TODO: test whether this condition indeed checks for a single token | ||
64 | first_token(text).filter(|parsed| parsed.token.len.to_usize() == text.len()) | ||
65 | } | ||
66 | |||
67 | /* | ||
68 | /// Returns `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<TokenizeError>)` | ||
69 | /// This is just a shorthand for `tokenize(text).collect()` | ||
70 | pub fn tokenize_to_vec_with_errors(text: &str) -> ParsedTokens { | ||
71 | tokenize(text).collect() | ||
72 | } | ||
73 | |||
74 | /// The simplest version of tokenize, it just returns a ready-made `Vec<Token>`. | ||
75 | /// It discards all tokenization errors while parsing. If you need that information | ||
76 | /// consider using `tokenize()` or `tokenize_to_vec_with_errors()`. | ||
77 | pub fn tokenize_to_vec(text: &str) -> Vec<Token> { | ||
78 | tokenize(text).map(|parsed_token| parsed_token.token).collect() | ||
79 | } | ||
80 | */ | ||
81 | |||
30 | /// Break a string up into its component tokens | 82 | /// Break a string up into its component tokens |
31 | pub fn tokenize(text: &str) -> Vec<Token> { | 83 | /// This is the core function, all other `tokenize*()` functions are simply |
32 | if text.is_empty() { | 84 | /// handy shortcuts for this one. |
33 | return vec![]; | 85 | pub fn tokenize(text: &str) -> impl Iterator<Item = ParsedToken> + '_ { |
86 | let shebang = rustc_lexer::strip_shebang(text).map(|shebang_len| { | ||
87 | text = &text[shebang_len..]; | ||
88 | ParsedToken::new(Token::new(SHEBANG, TextUnit::from_usize(shebang_len)), None) | ||
89 | }); | ||
90 | |||
91 | // Notice that we eagerly evaluate shebang since it may change text slice | ||
92 | // and we cannot simplify this into a single method call chain | ||
93 | shebang.into_iter().chain(tokenize_without_shebang(text)) | ||
94 | } | ||
95 | |||
96 | pub fn tokenize_without_shebang(text: &str) -> impl Iterator<Item = ParsedToken> + '_ { | ||
97 | rustc_lexer::tokenize(text).map(|rustc_token| { | ||
98 | let token_text = &text[..rustc_token.len]; | ||
99 | text = &text[rustc_token.len..]; | ||
100 | rustc_token_kind_to_parsed_token(&rustc_token.kind, token_text) | ||
101 | }) | ||
102 | } | ||
103 | |||
104 | #[derive(Debug)] | ||
105 | pub enum TokenizeError { | ||
106 | /// Base prefix was provided, but there were no digits | ||
107 | /// after it, e.g. `0x`. | ||
108 | EmptyInt, | ||
109 | /// Float exponent lacks digits e.g. `e+`, `E+`, `e-`, `E-`, | ||
110 | EmptyExponent, | ||
111 | |||
112 | /// Block comment lacks trailing delimiter `*/` | ||
113 | UnterminatedBlockComment, | ||
114 | /// Character literal lacks trailing delimiter `'` | ||
115 | UnterminatedChar, | ||
116 | /// Characterish byte literal lacks trailing delimiter `'` | ||
117 | UnterminatedByte, | ||
118 | /// String literal lacks trailing delimiter `"` | ||
119 | UnterminatedString, | ||
120 | /// Byte string literal lacks trailing delimiter `"` | ||
121 | UnterminatedByteString, | ||
122 | /// Raw literal lacks trailing delimiter e.g. `"##` | ||
123 | UnterminatedRawString, | ||
124 | /// Raw byte string literal lacks trailing delimiter e.g. `"##` | ||
125 | UnterminatedRawByteString, | ||
126 | |||
127 | /// Raw string lacks a quote after pound characters e.g. `r###` | ||
128 | UnstartedRawString, | ||
129 | /// Raw byte string lacks a quote after pound characters e.g. `br###` | ||
130 | UnstartedRawByteString, | ||
131 | |||
132 | /// Lifetime starts with a number e.g. `'4ever` | ||
133 | LifetimeStartsWithNumber, | ||
134 | } | ||
135 | |||
136 | fn rustc_token_kind_to_parsed_token( | ||
137 | rustc_token_kind: &rustc_lexer::TokenKind, | ||
138 | token_text: &str, | ||
139 | ) -> ParsedToken { | ||
140 | use rustc_lexer::TokenKind as TK; | ||
141 | use TokenizeError as TE; | ||
142 | |||
143 | // We drop some useful information here (see patterns with double dots `..`) | ||
144 | // Storing that info in `SyntaxKind` is not possible due to its layout requirements of | ||
145 | // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind` | ||
146 | // would mean hell of a rewrite. | ||
147 | |||
148 | let (syntax_kind, error) = match *rustc_token_kind { | ||
149 | TK::LineComment => ok(COMMENT), | ||
150 | TK::BlockComment { terminated } => ok_if(terminated, COMMENT, TE::UnterminatedBlockComment), | ||
151 | TK::Whitespace => ok(WHITESPACE), | ||
152 | TK::Ident => ok(if token_text == "_" { | ||
153 | UNDERSCORE | ||
154 | } else { | ||
155 | SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) | ||
156 | }), | ||
157 | TK::RawIdent => ok(IDENT), | ||
158 | TK::Literal { kind, .. } => match_literal_kind(&kind), | ||
159 | TK::Lifetime { starts_with_number } => { | ||
160 | ok_if(!starts_with_number, LIFETIME, TE::LifetimeStartsWithNumber) | ||
161 | } | ||
162 | TK::Semi => ok(SEMI), | ||
163 | TK::Comma => ok(COMMA), | ||
164 | TK::Dot => ok(DOT), | ||
165 | TK::OpenParen => ok(L_PAREN), | ||
166 | TK::CloseParen => ok(R_PAREN), | ||
167 | TK::OpenBrace => ok(L_CURLY), | ||
168 | TK::CloseBrace => ok(R_CURLY), | ||
169 | TK::OpenBracket => ok(L_BRACK), | ||
170 | TK::CloseBracket => ok(R_BRACK), | ||
171 | TK::At => ok(AT), | ||
172 | TK::Pound => ok(POUND), | ||
173 | TK::Tilde => ok(TILDE), | ||
174 | TK::Question => ok(QUESTION), | ||
175 | TK::Colon => ok(COLON), | ||
176 | TK::Dollar => ok(DOLLAR), | ||
177 | TK::Eq => ok(EQ), | ||
178 | TK::Not => ok(EXCL), | ||
179 | TK::Lt => ok(L_ANGLE), | ||
180 | TK::Gt => ok(R_ANGLE), | ||
181 | TK::Minus => ok(MINUS), | ||
182 | TK::And => ok(AMP), | ||
183 | TK::Or => ok(PIPE), | ||
184 | TK::Plus => ok(PLUS), | ||
185 | TK::Star => ok(STAR), | ||
186 | TK::Slash => ok(SLASH), | ||
187 | TK::Caret => ok(CARET), | ||
188 | TK::Percent => ok(PERCENT), | ||
189 | TK::Unknown => ok(ERROR), | ||
190 | }; | ||
191 | |||
192 | return ParsedToken::new( | ||
193 | Token::new(syntax_kind, TextUnit::from_usize(token_text.len())), | ||
194 | error, | ||
195 | ); | ||
196 | |||
197 | type ParsedSyntaxKind = (SyntaxKind, Option<TokenizeError>); | ||
198 | |||
199 | const fn ok(syntax_kind: SyntaxKind) -> ParsedSyntaxKind { | ||
200 | (syntax_kind, None) | ||
34 | } | 201 | } |
35 | let mut text = text; | 202 | const fn ok_if(cond: bool, syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind { |
36 | let mut acc = Vec::new(); | 203 | if cond { |
37 | if let Some(len) = rustc_lexer::strip_shebang(text) { | 204 | ok(syntax_kind) |
38 | acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) }); | 205 | } else { |
39 | text = &text[len..]; | 206 | err(syntax_kind, error) |
207 | } | ||
40 | } | 208 | } |
41 | while !text.is_empty() { | 209 | const fn err(syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind { |
42 | let rustc_token = rustc_lexer::first_token(text); | 210 | (syntax_kind, Some(error)) |
43 | let kind = match rustc_token.kind { | 211 | } |
44 | rustc_lexer::TokenKind::LineComment => COMMENT, | 212 | |
45 | rustc_lexer::TokenKind::BlockComment { .. } => COMMENT, | 213 | const fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> ParsedSyntaxKind { |
46 | rustc_lexer::TokenKind::Whitespace => WHITESPACE, | 214 | use rustc_lexer::LiteralKind as LK; |
47 | rustc_lexer::TokenKind::Ident => { | 215 | match *kind { |
48 | let token_text = &text[..rustc_token.len]; | 216 | LK::Int { empty_int, .. } => ok_if(!empty_int, INT_NUMBER, TE::EmptyInt), |
49 | if token_text == "_" { | 217 | LK::Float { empty_exponent, .. } => { |
50 | UNDERSCORE | 218 | ok_if(!empty_exponent, FLOAT_NUMBER, TE::EmptyExponent) |
51 | } else { | ||
52 | SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT) | ||
53 | } | ||
54 | } | 219 | } |
55 | rustc_lexer::TokenKind::RawIdent => IDENT, | 220 | LK::Char { terminated } => ok_if(terminated, CHAR, TE::UnterminatedChar), |
56 | rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), | 221 | LK::Byte { terminated } => ok_if(terminated, BYTE, TE::UnterminatedByte), |
57 | rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME, | 222 | LK::Str { terminated } => ok_if(terminated, STRING, TE::UnterminatedString), |
58 | rustc_lexer::TokenKind::Semi => SEMI, | 223 | LK::ByteStr { terminated } => { |
59 | rustc_lexer::TokenKind::Comma => COMMA, | 224 | ok_if(terminated, BYTE_STRING, TE::UnterminatedByteString) |
60 | rustc_lexer::TokenKind::Dot => DOT, | 225 | } |
61 | rustc_lexer::TokenKind::OpenParen => L_PAREN, | 226 | |
62 | rustc_lexer::TokenKind::CloseParen => R_PAREN, | 227 | LK::RawStr { started: true, terminated, .. } => { |
63 | rustc_lexer::TokenKind::OpenBrace => L_CURLY, | 228 | ok_if(terminated, RAW_STRING, TE::UnterminatedRawString) |
64 | rustc_lexer::TokenKind::CloseBrace => R_CURLY, | 229 | } |
65 | rustc_lexer::TokenKind::OpenBracket => L_BRACK, | 230 | LK::RawStr { started: false, .. } => err(RAW_STRING, TE::UnstartedRawString), |
66 | rustc_lexer::TokenKind::CloseBracket => R_BRACK, | 231 | |
67 | rustc_lexer::TokenKind::At => AT, | 232 | LK::RawByteStr { started: true, terminated, .. } => { |
68 | rustc_lexer::TokenKind::Pound => POUND, | 233 | ok_if(terminated, RAW_BYTE_STRING, TE::UnterminatedRawByteString) |
69 | rustc_lexer::TokenKind::Tilde => TILDE, | 234 | } |
70 | rustc_lexer::TokenKind::Question => QUESTION, | 235 | LK::RawByteStr { started: false, .. } => { |
71 | rustc_lexer::TokenKind::Colon => COLON, | 236 | err(RAW_BYTE_STRING, TE::UnstartedRawByteString) |
72 | rustc_lexer::TokenKind::Dollar => DOLLAR, | 237 | } |
73 | rustc_lexer::TokenKind::Eq => EQ, | 238 | } |
74 | rustc_lexer::TokenKind::Not => EXCL, | 239 | } |
75 | rustc_lexer::TokenKind::Lt => L_ANGLE, | 240 | } |
76 | rustc_lexer::TokenKind::Gt => R_ANGLE, | 241 | |
77 | rustc_lexer::TokenKind::Minus => MINUS, | 242 | pub fn first_token(text: &str) -> Option<ParsedToken> { |
78 | rustc_lexer::TokenKind::And => AMP, | 243 | // Checking for emptiness because of `rustc_lexer::first_token()` invariant (see its body) |
79 | rustc_lexer::TokenKind::Or => PIPE, | 244 | if text.is_empty() { |
80 | rustc_lexer::TokenKind::Plus => PLUS, | 245 | None |
81 | rustc_lexer::TokenKind::Star => STAR, | 246 | } else { |
82 | rustc_lexer::TokenKind::Slash => SLASH, | 247 | let rustc_token = rustc_lexer::first_token(text); |
83 | rustc_lexer::TokenKind::Caret => CARET, | 248 | Some(rustc_token_kind_to_parsed_token(&rustc_token.kind, &text[..rustc_token.len])) |
84 | rustc_lexer::TokenKind::Percent => PERCENT, | ||
85 | rustc_lexer::TokenKind::Unknown => ERROR, | ||
86 | }; | ||
87 | let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) }; | ||
88 | acc.push(token); | ||
89 | text = &text[rustc_token.len..]; | ||
90 | } | 249 | } |
91 | acc | ||
92 | } | 250 | } |
93 | 251 | ||
94 | pub fn classify_literal(text: &str) -> Option<Token> { | 252 | // TODO: think what to do with this ad hoc function |
253 | pub fn classify_literal(text: &str) -> Option<ParsedToken> { | ||
95 | let t = rustc_lexer::first_token(text); | 254 | let t = rustc_lexer::first_token(text); |
96 | if t.len != text.len() { | 255 | if t.len != text.len() { |
97 | return None; | 256 | return None; |
@@ -100,5 +259,5 @@ pub fn classify_literal(text: &str) -> Option<Token> { | |||
100 | rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), | 259 | rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), |
101 | _ => return None, | 260 | _ => return None, |
102 | }; | 261 | }; |
103 | Some(Token { kind, len: TextUnit::from_usize(t.len) }) | 262 | Some(ParsedToken::new(Token::new(kind, TextUnit::from_usize(t.len)))) |
104 | } | 263 | } |
diff --git a/crates/ra_syntax/src/parsing/reparsing.rs b/crates/ra_syntax/src/parsing/reparsing.rs index 06bdda11d..3abc09877 100644 --- a/crates/ra_syntax/src/parsing/reparsing.rs +++ b/crates/ra_syntax/src/parsing/reparsing.rs | |||
@@ -46,8 +46,7 @@ fn reparse_token<'node>( | |||
46 | WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => { | 46 | WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => { |
47 | if token.kind() == WHITESPACE || token.kind() == COMMENT { | 47 | if token.kind() == WHITESPACE || token.kind() == COMMENT { |
48 | // removing a new line may extend previous token | 48 | // removing a new line may extend previous token |
49 | if token.text().to_string()[edit.delete - token.text_range().start()].contains('\n') | 49 | if token.text()[edit.delete - token.text_range().start()].contains('\n') { |
50 | { | ||
51 | return None; | 50 | return None; |
52 | } | 51 | } |
53 | } | 52 | } |