aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/parsing/lexer.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/ra_syntax/src/parsing/lexer.rs')
-rw-r--r--crates/ra_syntax/src/parsing/lexer.rs324
1 files changed, 248 insertions, 76 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index 6d839208d..f889e6a1d 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -1,8 +1,10 @@
1//! FIXME: write short doc here 1//! Lexer analyzes raw input string and produces lexemes (tokens).
2//! It is just a bridge to `rustc_lexer`.
2 3
3use crate::{ 4use crate::{
5 SyntaxError, SyntaxErrorKind,
4 SyntaxKind::{self, *}, 6 SyntaxKind::{self, *},
5 TextUnit, 7 TextRange, TextUnit,
6}; 8};
7 9
8/// A token of Rust source. 10/// A token of Rust source.
@@ -14,91 +16,261 @@ pub struct Token {
14 pub len: TextUnit, 16 pub len: TextUnit,
15} 17}
16 18
17fn match_literal_kind(kind: rustc_lexer::LiteralKind) -> SyntaxKind { 19/// Break a string up into its component tokens.
18 match kind { 20/// Beware that it checks for shebang first and its length contributes to resulting
19 rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, 21/// tokens offsets.
20 rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, 22pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
21 rustc_lexer::LiteralKind::Char { .. } => CHAR, 23 // non-empty string is a precondtion of `rustc_lexer::strip_shebang()`.
22 rustc_lexer::LiteralKind::Byte { .. } => BYTE, 24 if text.is_empty() {
23 rustc_lexer::LiteralKind::Str { .. } => STRING, 25 return Default::default();
24 rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, 26 }
25 rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, 27
26 rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, 28 let mut tokens = Vec::new();
29 let mut errors = Vec::new();
30
31 let mut offset: usize = rustc_lexer::strip_shebang(text)
32 .map(|shebang_len| {
33 tokens.push(Token { kind: SHEBANG, len: TextUnit::from_usize(shebang_len) });
34 shebang_len
35 })
36 .unwrap_or(0);
37
38 let text_without_shebang = &text[offset..];
39
40 for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
41 let token_len = TextUnit::from_usize(rustc_token.len);
42 let token_range = TextRange::offset_len(TextUnit::from_usize(offset), token_len);
43
44 let (syntax_kind, error) =
45 rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]);
46
47 tokens.push(Token { kind: syntax_kind, len: token_len });
48
49 if let Some(error) = error {
50 errors.push(SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range));
51 }
52
53 offset += rustc_token.len;
27 } 54 }
55
56 (tokens, errors)
57}
58
59/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
60/// encountered at the beginning of the string.
61///
62/// Returns `None` if the string contains zero *or two or more* tokens.
63/// The token is malformed if the returned error is not `None`.
64///
65/// Beware that unescape errors are not checked at tokenization time.
66pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> {
67 lex_first_token(text)
68 .filter(|(token, _)| token.len.to_usize() == text.len())
69 .map(|(token, error)| (token.kind, error))
70}
71
72/// The same as `lex_single_syntax_kind()` but returns only `SyntaxKind` and
73/// returns `None` if any tokenization error occured.
74///
75/// Beware that unescape errors are not checked at tokenization time.
76pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> {
77 lex_first_token(text)
78 .filter(|(token, error)| !error.is_some() && token.len.to_usize() == text.len())
79 .map(|(token, _error)| token.kind)
28} 80}
29 81
30/// Break a string up into its component tokens 82/// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token
31pub fn tokenize(text: &str) -> Vec<Token> { 83/// encountered at the beginning of the string.
84///
85/// Returns `None` if the string contains zero tokens or if the token was parsed
86/// with an error.
87/// The token is malformed if the returned error is not `None`.
88///
89/// Beware that unescape errors are not checked at tokenization time.
90fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> {
91 // non-empty string is a precondtion of `rustc_lexer::first_token()`.
32 if text.is_empty() { 92 if text.is_empty() {
33 return vec![]; 93 return None;
34 }
35 let mut text = text;
36 let mut acc = Vec::new();
37 if let Some(len) = rustc_lexer::strip_shebang(text) {
38 acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) });
39 text = &text[len..];
40 } 94 }
41 while !text.is_empty() { 95
42 let rustc_token = rustc_lexer::first_token(text); 96 let rustc_token = rustc_lexer::first_token(text);
43 let kind = match rustc_token.kind { 97 let (syntax_kind, error) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text);
44 rustc_lexer::TokenKind::LineComment => COMMENT, 98
45 rustc_lexer::TokenKind::BlockComment { .. } => COMMENT, 99 let token = Token { kind: syntax_kind, len: TextUnit::from_usize(rustc_token.len) };
46 rustc_lexer::TokenKind::Whitespace => WHITESPACE, 100 let error = error.map(|error| {
47 rustc_lexer::TokenKind::Ident => { 101 SyntaxError::new(
48 let token_text = &text[..rustc_token.len]; 102 SyntaxErrorKind::TokenizeError(error),
103 TextRange::from_to(TextUnit::from(0), TextUnit::of_str(text)),
104 )
105 });
106
107 Some((token, error))
108}
109
110// FIXME: simplify TokenizeError to `SyntaxError(String, TextRange)` as per @matklad advice:
111// https://github.com/rust-analyzer/rust-analyzer/pull/2911/files#r371175067
112
113/// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant.
114/// It describes all the types of errors that may happen during the tokenization
115/// of Rust source.
116#[derive(Debug, Clone, PartialEq, Eq, Hash)]
117pub enum TokenizeError {
118 /// Base prefix was provided, but there were no digits
119 /// after it, e.g. `0x`, `0b`.
120 EmptyInt,
121 /// Float exponent lacks digits e.g. `12.34e+`, `12.3E+`, `12e-`, `1_E-`,
122 EmptyExponent,
123
124 /// Block comment lacks trailing delimiter `*/`
125 UnterminatedBlockComment,
126 /// Character literal lacks trailing delimiter `'`
127 UnterminatedChar,
128 /// Characterish byte literal lacks trailing delimiter `'`
129 UnterminatedByte,
130 /// String literal lacks trailing delimiter `"`
131 UnterminatedString,
132 /// Byte string literal lacks trailing delimiter `"`
133 UnterminatedByteString,
134 /// Raw literal lacks trailing delimiter e.g. `"##`
135 UnterminatedRawString,
136 /// Raw byte string literal lacks trailing delimiter e.g. `"##`
137 UnterminatedRawByteString,
138
139 /// Raw string lacks a quote after the pound characters e.g. `r###`
140 UnstartedRawString,
141 /// Raw byte string lacks a quote after the pound characters e.g. `br###`
142 UnstartedRawByteString,
143
144 /// Lifetime starts with a number e.g. `'4ever`
145 LifetimeStartsWithNumber,
146}
147
148fn rustc_token_kind_to_syntax_kind(
149 rustc_token_kind: &rustc_lexer::TokenKind,
150 token_text: &str,
151) -> (SyntaxKind, Option<TokenizeError>) {
152 // A note on an intended tradeoff:
153 // We drop some useful infromation here (see patterns with double dots `..`)
154 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
155 // being `u16` that come from `rowan::SyntaxKind`.
156
157 let syntax_kind = {
158 use rustc_lexer::TokenKind as TK;
159 use TokenizeError as TE;
160
161 match rustc_token_kind {
162 TK::LineComment => COMMENT,
163
164 TK::BlockComment { terminated: true } => COMMENT,
165 TK::BlockComment { terminated: false } => {
166 return (COMMENT, Some(TE::UnterminatedBlockComment));
167 }
168
169 TK::Whitespace => WHITESPACE,
170
171 TK::Ident => {
49 if token_text == "_" { 172 if token_text == "_" {
50 UNDERSCORE 173 UNDERSCORE
51 } else { 174 } else {
52 SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT) 175 SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
53 } 176 }
54 } 177 }
55 rustc_lexer::TokenKind::RawIdent => IDENT, 178
56 rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), 179 TK::RawIdent => IDENT,
57 rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME, 180 TK::Literal { kind, .. } => return match_literal_kind(&kind),
58 rustc_lexer::TokenKind::Semi => SEMI, 181
59 rustc_lexer::TokenKind::Comma => COMMA, 182 TK::Lifetime { starts_with_number: false } => LIFETIME,
60 rustc_lexer::TokenKind::Dot => DOT, 183 TK::Lifetime { starts_with_number: true } => {
61 rustc_lexer::TokenKind::OpenParen => L_PAREN, 184 return (LIFETIME, Some(TE::LifetimeStartsWithNumber))
62 rustc_lexer::TokenKind::CloseParen => R_PAREN, 185 }
63 rustc_lexer::TokenKind::OpenBrace => L_CURLY, 186
64 rustc_lexer::TokenKind::CloseBrace => R_CURLY, 187 TK::Semi => SEMI,
65 rustc_lexer::TokenKind::OpenBracket => L_BRACK, 188 TK::Comma => COMMA,
66 rustc_lexer::TokenKind::CloseBracket => R_BRACK, 189 TK::Dot => DOT,
67 rustc_lexer::TokenKind::At => AT, 190 TK::OpenParen => L_PAREN,
68 rustc_lexer::TokenKind::Pound => POUND, 191 TK::CloseParen => R_PAREN,
69 rustc_lexer::TokenKind::Tilde => TILDE, 192 TK::OpenBrace => L_CURLY,
70 rustc_lexer::TokenKind::Question => QUESTION, 193 TK::CloseBrace => R_CURLY,
71 rustc_lexer::TokenKind::Colon => COLON, 194 TK::OpenBracket => L_BRACK,
72 rustc_lexer::TokenKind::Dollar => DOLLAR, 195 TK::CloseBracket => R_BRACK,
73 rustc_lexer::TokenKind::Eq => EQ, 196 TK::At => AT,
74 rustc_lexer::TokenKind::Not => EXCL, 197 TK::Pound => POUND,
75 rustc_lexer::TokenKind::Lt => L_ANGLE, 198 TK::Tilde => TILDE,
76 rustc_lexer::TokenKind::Gt => R_ANGLE, 199 TK::Question => QUESTION,
77 rustc_lexer::TokenKind::Minus => MINUS, 200 TK::Colon => COLON,
78 rustc_lexer::TokenKind::And => AMP, 201 TK::Dollar => DOLLAR,
79 rustc_lexer::TokenKind::Or => PIPE, 202 TK::Eq => EQ,
80 rustc_lexer::TokenKind::Plus => PLUS, 203 TK::Not => EXCL,
81 rustc_lexer::TokenKind::Star => STAR, 204 TK::Lt => L_ANGLE,
82 rustc_lexer::TokenKind::Slash => SLASH, 205 TK::Gt => R_ANGLE,
83 rustc_lexer::TokenKind::Caret => CARET, 206 TK::Minus => MINUS,
84 rustc_lexer::TokenKind::Percent => PERCENT, 207 TK::And => AMP,
85 rustc_lexer::TokenKind::Unknown => ERROR, 208 TK::Or => PIPE,
209 TK::Plus => PLUS,
210 TK::Star => STAR,
211 TK::Slash => SLASH,
212 TK::Caret => CARET,
213 TK::Percent => PERCENT,
214 TK::Unknown => ERROR,
215 }
216 };
217
218 return (syntax_kind, None);
219
220 fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<TokenizeError>) {
221 use rustc_lexer::LiteralKind as LK;
222 use TokenizeError as TE;
223
224 #[rustfmt::skip]
225 let syntax_kind = match *kind {
226 LK::Int { empty_int: false, .. } => INT_NUMBER,
227 LK::Int { empty_int: true, .. } => {
228 return (INT_NUMBER, Some(TE::EmptyInt))
229 }
230
231 LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
232 LK::Float { empty_exponent: true, .. } => {
233 return (FLOAT_NUMBER, Some(TE::EmptyExponent))
234 }
235
236 LK::Char { terminated: true } => CHAR,
237 LK::Char { terminated: false } => {
238 return (CHAR, Some(TE::UnterminatedChar))
239 }
240
241 LK::Byte { terminated: true } => BYTE,
242 LK::Byte { terminated: false } => {
243 return (BYTE, Some(TE::UnterminatedByte))
244 }
245
246 LK::Str { terminated: true } => STRING,
247 LK::Str { terminated: false } => {
248 return (STRING, Some(TE::UnterminatedString))
249 }
250
251
252 LK::ByteStr { terminated: true } => BYTE_STRING,
253 LK::ByteStr { terminated: false } => {
254 return (BYTE_STRING, Some(TE::UnterminatedByteString))
255 }
256
257 LK::RawStr { started: true, terminated: true, .. } => RAW_STRING,
258 LK::RawStr { started: true, terminated: false, .. } => {
259 return (RAW_STRING, Some(TE::UnterminatedRawString))
260 }
261 LK::RawStr { started: false, .. } => {
262 return (RAW_STRING, Some(TE::UnstartedRawString))
263 }
264
265 LK::RawByteStr { started: true, terminated: true, .. } => RAW_BYTE_STRING,
266 LK::RawByteStr { started: true, terminated: false, .. } => {
267 return (RAW_BYTE_STRING, Some(TE::UnterminatedRawByteString))
268 }
269 LK::RawByteStr { started: false, .. } => {
270 return (RAW_BYTE_STRING, Some(TE::UnstartedRawByteString))
271 }
86 }; 272 };
87 let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) };
88 acc.push(token);
89 text = &text[rustc_token.len..];
90 }
91 acc
92}
93 273
94pub fn classify_literal(text: &str) -> Option<Token> { 274 (syntax_kind, None)
95 let t = rustc_lexer::first_token(text);
96 if t.len != text.len() {
97 return None;
98 } 275 }
99 let kind = match t.kind {
100 rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind),
101 _ => return None,
102 };
103 Some(Token { kind, len: TextUnit::from_usize(t.len) })
104} 276}