path: root/crates/ra_syntax/src/parsing/lexer.rs
Diffstat (limited to 'crates/ra_syntax/src/parsing/lexer.rs')
-rw-r--r--  crates/ra_syntax/src/parsing/lexer.rs | 313
1 file changed, 165 insertions(+), 148 deletions(-)
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index bf6b4d637..55755be18 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -16,55 +16,21 @@ pub struct Token {
     pub len: TextUnit,
 }
 
-/// Represents the result of parsing one token. Beware that the token may be malformed.
-#[derive(Debug)]
-pub struct ParsedToken {
-    /// Parsed token.
-    pub token: Token,
-    /// If error is present then parsed token is malformed.
-    pub error: Option<SyntaxError>,
-}
-
-#[derive(Debug, Default)]
-/// Represents the result of parsing source code of Rust language.
-pub struct ParsedTokens {
-    /// Parsed tokens in order they appear in source code.
-    pub tokens: Vec<Token>,
-    /// Collection of all occured tokenization errors.
-    /// In general `self.errors.len() <= self.tokens.len()`
-    pub errors: Vec<SyntaxError>,
-}
-impl ParsedTokens {
-    /// Append `token` and `error` (if pressent) to the result.
-    pub fn push(&mut self, ParsedToken { token, error }: ParsedToken) {
-        self.tokens.push(token);
-        if let Some(error) = error {
-            self.errors.push(error)
-        }
-    }
-}
-
-/// Same as `tokenize_append()`, just a shortcut for creating `ParsedTokens`
-/// and returning the result the usual way.
-pub fn tokenize(text: &str) -> ParsedTokens {
-    let mut parsed = ParsedTokens::default();
-    tokenize_append(text, &mut parsed);
-    parsed
-}
-
 /// Break a string up into its component tokens.
-/// Writes to `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<SyntaxError>)`.
 /// Beware that it checks for shebang first and its length contributes to resulting
 /// tokens offsets.
-pub fn tokenize_append(text: &str, parsed: &mut ParsedTokens) {
+pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
     // non-empty string is a precondition of `rustc_lexer::strip_shebang()`.
     if text.is_empty() {
-        return;
+        return Default::default();
     }
 
+    let mut tokens = Vec::new();
+    let mut errors = Vec::new();
+
     let mut offset: usize = rustc_lexer::strip_shebang(text)
         .map(|shebang_len| {
-            parsed.tokens.push(Token { kind: SHEBANG, len: TextUnit::from_usize(shebang_len) });
+            tokens.push(Token { kind: SHEBANG, len: TextUnit::from_usize(shebang_len) });
             shebang_len
         })
         .unwrap_or(0);
@@ -72,35 +38,76 @@ pub fn tokenize_append(text: &str, parsed: &mut ParsedTokens) {
     let text_without_shebang = &text[offset..];
 
     for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
-        parsed.push(rustc_token_to_parsed_token(&rustc_token, text, TextUnit::from_usize(offset)));
+        let token_len = TextUnit::from_usize(rustc_token.len);
+        let token_range = TextRange::offset_len(TextUnit::from_usize(offset), token_len);
+
+        let (syntax_kind, error) =
+            rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
+
+        tokens.push(Token { kind: syntax_kind, len: token_len });
+
+        if let Some(error) = error {
+            errors.push(SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range));
+        }
+
         offset += rustc_token.len;
     }
+
+    (tokens, errors)
 }
 
-/// Returns the first encountered token at the beginning of the string.
-/// If the string contains zero or *two or more tokens* returns `None`.
+/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
+/// encountered at the beginning of the string.
+///
+/// Returns `None` if the string contains zero *or two or more* tokens.
+/// The token is malformed if the returned error is not `None`.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
+    first_token(text)
+        .filter(|(token, _)| token.len.to_usize() == text.len())
+        .map(|(token, error)| (token.kind, error))
+}
+
+/// The same as `lex_single_syntax_kind()`, but returns only `SyntaxKind` and
+/// returns `None` if any tokenization error occurred.
 ///
-/// The main difference between `first_token()` and `single_token()` is that
-/// the latter returns `None` if the string contains more than one token.
-pub fn single_token(text: &str) -> Option<ParsedToken> {
-    first_token(text).filter(|parsed| parsed.token.len.to_usize() == text.len())
+/// Beware that unescape errors are not checked at tokenization time.
+pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> {
+    first_token(text)
+        .filter(|(token, error)| error.is_none() && token.len.to_usize() == text.len())
+        .map(|(token, _error)| token.kind)
 }
 
 /// Returns the first encountered token at the beginning of the string.
-/// If the string contains zero tokens returns `None`.
 ///
-/// The main difference between `first_token() and single_token()` is that
-/// the latter returns `None` if the string contains more than one token.
-pub fn first_token(text: &str) -> Option<ParsedToken> {
+/// Returns `None` if the string contains zero tokens.
+/// The token is malformed if the returned error is not `None`.
+///
+/// Beware that unescape errors are not checked at tokenization time.
+fn first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
     // non-empty string is a precondition of `rustc_lexer::first_token()`.
     if text.is_empty() {
-        None
-    } else {
-        let rustc_token = rustc_lexer::first_token(text);
-        Some(rustc_token_to_parsed_token(&rustc_token, text, TextUnit::from(0)))
+        return None;
     }
+
+    let rustc_token = rustc_lexer::first_token(text);
+    let (syntax_kind, error) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
+
+    let token = Token { kind: syntax_kind, len: TextUnit::from_usize(rustc_token.len) };
+    let error = error.map(|error| {
+        SyntaxError::new(
+            SyntaxErrorKind::TokenizeError(error),
+            TextRange::from_to(TextUnit::from(0), TextUnit::of_str(text)),
+        )
+    });
+
+    Some((token, error))
 }
 
+// FIXME: simplify TokenizeError to `SyntaxError(String, TextRange)` as per @matklad advice:
+// https://github.com/rust-analyzer/rust-analyzer/pull/2911/files#r371175067
+
 /// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant.
 /// It describes all the types of errors that may happen during the tokenization
 /// of Rust source.
@@ -136,122 +143,132 @@ pub enum TokenizeError {
     LifetimeStartsWithNumber,
 }
 
-/// Mapper function that converts `rustc_lexer::Token` with some additional context
-/// to `ParsedToken`
-fn rustc_token_to_parsed_token(
-    rustc_token: &rustc_lexer::Token,
-    text: &str,
-    token_start_offset: TextUnit,
-) -> ParsedToken {
+fn rustc_token_kind_to_syntax_kind(
+    rustc_token_kind: &rustc_lexer::TokenKind,
+    token_text: &str,
+) -> (SyntaxKind, Option<TokenizeError>) {
+    // A note on an intended tradeoff:
     // We drop some useful information here (see patterns with double dots `..`)
     // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
-    // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind`
-    // would mean hell of a rewrite
+    // being `u16`, which comes from `rowan::SyntaxKind`.
 
-    let token_range =
-        TextRange::offset_len(token_start_offset, TextUnit::from_usize(rustc_token.len));
-
-    let token_text = &text[token_range];
-
-    let (syntax_kind, error) = {
+    let syntax_kind = {
         use rustc_lexer::TokenKind as TK;
         use TokenizeError as TE;
 
-        match rustc_token.kind {
-            TK::LineComment => ok(COMMENT),
-            TK::BlockComment { terminated } => {
-                ok_if(terminated, COMMENT, TE::UnterminatedBlockComment)
+        match rustc_token_kind {
+            TK::LineComment => COMMENT,
+
+            TK::BlockComment { terminated: true } => COMMENT,
+            TK::BlockComment { terminated: false } => {
+                return (COMMENT, Some(TE::UnterminatedBlockComment));
             }
-            TK::Whitespace => ok(WHITESPACE),
-            TK::Ident => ok(if token_text == "_" {
-                UNDERSCORE
-            } else {
-                SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
-            }),
-            TK::RawIdent => ok(IDENT),
-            TK::Literal { kind, .. } => match_literal_kind(&kind),
-            TK::Lifetime { starts_with_number } => {
-                ok_if(!starts_with_number, LIFETIME, TE::LifetimeStartsWithNumber)
+
+            TK::Whitespace => WHITESPACE,
+
+            TK::Ident => {
+                if token_text == "_" {
+                    UNDERSCORE
+                } else {
+                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
+                }
             }
-            TK::Semi => ok(SEMI),
-            TK::Comma => ok(COMMA),
-            TK::Dot => ok(DOT),
-            TK::OpenParen => ok(L_PAREN),
-            TK::CloseParen => ok(R_PAREN),
-            TK::OpenBrace => ok(L_CURLY),
-            TK::CloseBrace => ok(R_CURLY),
-            TK::OpenBracket => ok(L_BRACK),
-            TK::CloseBracket => ok(R_BRACK),
-            TK::At => ok(AT),
-            TK::Pound => ok(POUND),
-            TK::Tilde => ok(TILDE),
-            TK::Question => ok(QUESTION),
-            TK::Colon => ok(COLON),
-            TK::Dollar => ok(DOLLAR),
-            TK::Eq => ok(EQ),
-            TK::Not => ok(EXCL),
-            TK::Lt => ok(L_ANGLE),
-            TK::Gt => ok(R_ANGLE),
-            TK::Minus => ok(MINUS),
-            TK::And => ok(AMP),
-            TK::Or => ok(PIPE),
-            TK::Plus => ok(PLUS),
-            TK::Star => ok(STAR),
-            TK::Slash => ok(SLASH),
-            TK::Caret => ok(CARET),
-            TK::Percent => ok(PERCENT),
-            TK::Unknown => ok(ERROR),
-        }
-    };
 
-    return ParsedToken {
-        token: Token { kind: syntax_kind, len: token_range.len() },
-        error: error
-            .map(|error| SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range)),
+            TK::RawIdent => IDENT,
+            TK::Literal { kind, .. } => return match_literal_kind(&kind),
+
+            TK::Lifetime { starts_with_number: false } => LIFETIME,
+            TK::Lifetime { starts_with_number: true } => {
+                return (LIFETIME, Some(TE::LifetimeStartsWithNumber))
+            }
+
+            TK::Semi => SEMI,
+            TK::Comma => COMMA,
+            TK::Dot => DOT,
+            TK::OpenParen => L_PAREN,
+            TK::CloseParen => R_PAREN,
+            TK::OpenBrace => L_CURLY,
+            TK::CloseBrace => R_CURLY,
+            TK::OpenBracket => L_BRACK,
+            TK::CloseBracket => R_BRACK,
+            TK::At => AT,
+            TK::Pound => POUND,
+            TK::Tilde => TILDE,
+            TK::Question => QUESTION,
+            TK::Colon => COLON,
+            TK::Dollar => DOLLAR,
+            TK::Eq => EQ,
+            TK::Not => EXCL,
+            TK::Lt => L_ANGLE,
+            TK::Gt => R_ANGLE,
+            TK::Minus => MINUS,
+            TK::And => AMP,
+            TK::Or => PIPE,
+            TK::Plus => PLUS,
+            TK::Star => STAR,
+            TK::Slash => SLASH,
+            TK::Caret => CARET,
+            TK::Percent => PERCENT,
+            TK::Unknown => ERROR,
+        }
     };
 
-    type ParsedSyntaxKind = (SyntaxKind, Option<TokenizeError>);
+    return (syntax_kind, None);
 
-    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> ParsedSyntaxKind {
+    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<TokenizeError>) {
         use rustc_lexer::LiteralKind as LK;
         use TokenizeError as TE;
 
-        match *kind {
-            LK::Int { empty_int, .. } => ok_if(!empty_int, INT_NUMBER, TE::EmptyInt),
-            LK::Float { empty_exponent, .. } => {
-                ok_if(!empty_exponent, FLOAT_NUMBER, TE::EmptyExponent)
+        #[rustfmt::skip]
+        let syntax_kind = match *kind {
+            LK::Int { empty_int: false, .. } => INT_NUMBER,
+            LK::Int { empty_int: true, .. } => {
+                return (INT_NUMBER, Some(TE::EmptyInt))
+            }
+
+            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
+            LK::Float { empty_exponent: true, .. } => {
+                return (FLOAT_NUMBER, Some(TE::EmptyExponent))
+            }
+
+            LK::Char { terminated: true } => CHAR,
+            LK::Char { terminated: false } => {
+                return (CHAR, Some(TE::UnterminatedChar))
+            }
+
+            LK::Byte { terminated: true } => BYTE,
+            LK::Byte { terminated: false } => {
+                return (BYTE, Some(TE::UnterminatedByte))
             }
-            LK::Char { terminated } => ok_if(terminated, CHAR, TE::UnterminatedChar),
-            LK::Byte { terminated } => ok_if(terminated, BYTE, TE::UnterminatedByte),
-            LK::Str { terminated } => ok_if(terminated, STRING, TE::UnterminatedString),
-            LK::ByteStr { terminated } => {
-                ok_if(terminated, BYTE_STRING, TE::UnterminatedByteString)
+
+            LK::Str { terminated: true } => STRING,
+            LK::Str { terminated: false } => {
+                return (STRING, Some(TE::UnterminatedString))
+            }
+
+
+            LK::ByteStr { terminated: true } => BYTE_STRING,
+            LK::ByteStr { terminated: false } => {
+                return (BYTE_STRING, Some(TE::UnterminatedByteString))
             }
 
-            LK::RawStr { started: true, terminated, .. } => {
-                ok_if(terminated, RAW_STRING, TE::UnterminatedRawString)
+            LK::RawStr { started: true, terminated: true, .. } => RAW_STRING,
+            LK::RawStr { started: true, terminated: false, .. } => {
+                return (RAW_STRING, Some(TE::UnterminatedRawString))
+            }
+            LK::RawStr { started: false, .. } => {
+                return (RAW_STRING, Some(TE::UnstartedRawString))
             }
-            LK::RawStr { started: false, .. } => err(RAW_STRING, TE::UnstartedRawString),
 
-            LK::RawByteStr { started: true, terminated, .. } => {
-                ok_if(terminated, RAW_BYTE_STRING, TE::UnterminatedRawByteString)
+            LK::RawByteStr { started: true, terminated: true, .. } => RAW_BYTE_STRING,
+            LK::RawByteStr { started: true, terminated: false, .. } => {
+                return (RAW_BYTE_STRING, Some(TE::UnterminatedRawByteString))
             }
             LK::RawByteStr { started: false, .. } => {
-                err(RAW_BYTE_STRING, TE::UnstartedRawByteString)
+                return (RAW_BYTE_STRING, Some(TE::UnstartedRawByteString))
             }
-        }
-    }
-    const fn ok(syntax_kind: SyntaxKind) -> ParsedSyntaxKind {
+        };
+
         (syntax_kind, None)
     }
-    const fn err(syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
-        (syntax_kind, Some(error))
-    }
-    fn ok_if(cond: bool, syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
-        if cond {
-            ok(syntax_kind)
-        } else {
-            err(syntax_kind, error)
-        }
-    }
 }
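
The new `tokenize` entry point returns the token stream and the accumulated errors as a plain `(Vec<Token>, Vec<SyntaxError>)` pair instead of the removed `ParsedTokens` struct. A minimal usage sketch follows; it assumes `tokenize` and `Token` are re-exported from the `ra_syntax` crate root and that `SyntaxError` implements `Debug`, neither of which this diff shows:

    use ra_syntax::{tokenize, Token};

    fn dump_tokens(text: &str) {
        let (tokens, errors) = tokenize(text);

        // Tokens carry only lengths; absolute offsets are recovered by summing.
        let mut offset = 0;
        for Token { kind, len } in tokens {
            let end = offset + len.to_usize();
            println!("{:?} at {}..{}: {:?}", kind, offset, end, &text[offset..end]);
            offset = end;
        }

        for error in errors {
            eprintln!("lexer error: {:?}", error);
        }
    }

As the doc comment on `tokenize` warns, a leading shebang becomes a SHEBANG token whose length shifts the offsets of everything after it.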
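
Because `rustc_token_kind_to_syntax_kind` always produces some `SyntaxKind`, malformed input still yields a token; the error is reported alongside it rather than replacing it. A sketch of that invariant, under the same re-export assumption:

    use ra_syntax::{tokenize, SyntaxKind};

    // An unterminated block comment still lexes to a single COMMENT token,
    // paired with a SyntaxError backed by TokenizeError::UnterminatedBlockComment.
    #[test]
    fn unterminated_block_comment_is_still_a_token() {
        let (tokens, errors) = tokenize("/* never closed");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, SyntaxKind::COMMENT);
        assert_eq!(errors.len(), 1);
    }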
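
The two single-token helpers encode the "exactly one token" check that callers previously built from `single_token`. A sketch of the intended call sites, assuming the helpers are re-exported like `tokenize`; `is_valid_identifier` is a hypothetical caller, not part of this commit:

    use ra_syntax::{lex_single_syntax_kind, lex_single_valid_syntax_kind, SyntaxKind};

    // `None` means zero or several tokens; the inner Option carries the error.
    fn describe(text: &str) {
        match lex_single_syntax_kind(text) {
            Some((kind, None)) => println!("one well-formed token: {:?}", kind),
            Some((kind, Some(err))) => println!("one malformed {:?} token: {:?}", kind, err),
            None => println!("zero tokens, or more than one"),
        }
    }

    // The `_valid_` variant folds both failure modes into `None`.
    fn is_valid_identifier(text: &str) -> bool {
        lex_single_valid_syntax_kind(text) == Some(SyntaxKind::IDENT)
    }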