diff options
author | bors[bot] <26634292+bors[bot]@users.noreply.github.com> | 2020-08-12 17:31:42 +0100 |
---|---|---|
committer | GitHub <[email protected]> | 2020-08-12 17:31:42 +0100 |
commit | d583f2c46d22cf8d643ebf98be9cb7059a304431 (patch) | |
tree | 9d898eb9600b0c36a74e4f95238f679c683fa566 /crates/syntax/src/parsing/lexer.rs | |
parent | 3d6889cba72a9d02199f7adaa2ecc69bc30af834 (diff) | |
parent | a1c187eef3ba08076aedb5154929f7eda8d1b424 (diff) |
Merge #5729
5729: Rename ra_syntax -> syntax
r=matklad a=matklad
bors r+
🤖
Co-authored-by: Aleksey Kladov <[email protected]>
Diffstat (limited to 'crates/syntax/src/parsing/lexer.rs')
-rw-r--r-- | crates/syntax/src/parsing/lexer.rs | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/crates/syntax/src/parsing/lexer.rs b/crates/syntax/src/parsing/lexer.rs new file mode 100644 index 000000000..fa3be1016 --- /dev/null +++ b/crates/syntax/src/parsing/lexer.rs | |||
@@ -0,0 +1,244 @@ | |||
1 | //! Lexer analyzes raw input string and produces lexemes (tokens). | ||
2 | //! It is just a bridge to `rustc_lexer`. | ||
3 | |||
4 | use rustc_lexer::{LiteralKind as LK, RawStrError}; | ||
5 | |||
6 | use std::convert::TryInto; | ||
7 | |||
8 | use crate::{ | ||
9 | SyntaxError, | ||
10 | SyntaxKind::{self, *}, | ||
11 | TextRange, TextSize, T, | ||
12 | }; | ||
13 | |||
/// A token of Rust source.
///
/// Tokens carry only their kind and length; their absolute position in the
/// input is recovered by summing the lengths of all preceding tokens
/// (see `tokenize`, which maintains a running offset this way).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token {
    /// The kind of token.
    pub kind: SyntaxKind,
    /// The length of the token in bytes.
    pub len: TextSize,
}
22 | |||
23 | /// Break a string up into its component tokens. | ||
24 | /// Beware that it checks for shebang first and its length contributes to resulting | ||
25 | /// tokens offsets. | ||
26 | pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) { | ||
27 | // non-empty string is a precondtion of `rustc_lexer::strip_shebang()`. | ||
28 | if text.is_empty() { | ||
29 | return Default::default(); | ||
30 | } | ||
31 | |||
32 | let mut tokens = Vec::new(); | ||
33 | let mut errors = Vec::new(); | ||
34 | |||
35 | let mut offset = match rustc_lexer::strip_shebang(text) { | ||
36 | Some(shebang_len) => { | ||
37 | tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() }); | ||
38 | shebang_len | ||
39 | } | ||
40 | None => 0, | ||
41 | }; | ||
42 | |||
43 | let text_without_shebang = &text[offset..]; | ||
44 | |||
45 | for rustc_token in rustc_lexer::tokenize(text_without_shebang) { | ||
46 | let token_len: TextSize = rustc_token.len.try_into().unwrap(); | ||
47 | let token_range = TextRange::at(offset.try_into().unwrap(), token_len); | ||
48 | |||
49 | let (syntax_kind, err_message) = | ||
50 | rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]); | ||
51 | |||
52 | tokens.push(Token { kind: syntax_kind, len: token_len }); | ||
53 | |||
54 | if let Some(err_message) = err_message { | ||
55 | errors.push(SyntaxError::new(err_message, token_range)); | ||
56 | } | ||
57 | |||
58 | offset += rustc_token.len; | ||
59 | } | ||
60 | |||
61 | (tokens, errors) | ||
62 | } | ||
63 | |||
64 | /// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token | ||
65 | /// encountered at the beginning of the string. | ||
66 | /// | ||
67 | /// Returns `None` if the string contains zero *or two or more* tokens. | ||
68 | /// The token is malformed if the returned error is not `None`. | ||
69 | /// | ||
70 | /// Beware that unescape errors are not checked at tokenization time. | ||
71 | pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option<SyntaxError>)> { | ||
72 | lex_first_token(text) | ||
73 | .filter(|(token, _)| token.len == TextSize::of(text)) | ||
74 | .map(|(token, error)| (token.kind, error)) | ||
75 | } | ||
76 | |||
77 | /// The same as `lex_single_syntax_kind()` but returns only `SyntaxKind` and | ||
78 | /// returns `None` if any tokenization error occured. | ||
79 | /// | ||
80 | /// Beware that unescape errors are not checked at tokenization time. | ||
81 | pub fn lex_single_valid_syntax_kind(text: &str) -> Option<SyntaxKind> { | ||
82 | lex_first_token(text) | ||
83 | .filter(|(token, error)| !error.is_some() && token.len == TextSize::of(text)) | ||
84 | .map(|(token, _error)| token.kind) | ||
85 | } | ||
86 | |||
87 | /// Returns `SyntaxKind` and `Option<SyntaxError>` of the first token | ||
88 | /// encountered at the beginning of the string. | ||
89 | /// | ||
90 | /// Returns `None` if the string contains zero tokens or if the token was parsed | ||
91 | /// with an error. | ||
92 | /// The token is malformed if the returned error is not `None`. | ||
93 | /// | ||
94 | /// Beware that unescape errors are not checked at tokenization time. | ||
95 | fn lex_first_token(text: &str) -> Option<(Token, Option<SyntaxError>)> { | ||
96 | // non-empty string is a precondtion of `rustc_lexer::first_token()`. | ||
97 | if text.is_empty() { | ||
98 | return None; | ||
99 | } | ||
100 | |||
101 | let rustc_token = rustc_lexer::first_token(text); | ||
102 | let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text); | ||
103 | |||
104 | let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() }; | ||
105 | let optional_error = err_message | ||
106 | .map(|err_message| SyntaxError::new(err_message, TextRange::up_to(TextSize::of(text)))); | ||
107 | |||
108 | Some((token, optional_error)) | ||
109 | } | ||
110 | |||
/// Translates a `rustc_lexer` token kind into this crate's `SyntaxKind`,
/// plus an optional static tokenization-error message.
///
/// `token_text` is only consulted for `Ident` tokens, to distinguish `_`,
/// keywords, and plain identifiers.
fn rustc_token_kind_to_syntax_kind(
    rustc_token_kind: &rustc_lexer::TokenKind,
    token_text: &str,
) -> (SyntaxKind, Option<&'static str>) {
    // A note on an intended tradeoff:
    // We drop some useful information here (see patterns with double dots `..`)
    // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
    // being `u16` that come from `rowan::SyntaxKind`.

    let syntax_kind = {
        match rustc_token_kind {
            rustc_lexer::TokenKind::LineComment => COMMENT,

            rustc_lexer::TokenKind::BlockComment { terminated: true } => COMMENT,
            rustc_lexer::TokenKind::BlockComment { terminated: false } => {
                return (
                    COMMENT,
                    Some("Missing trailing `*/` symbols to terminate the block comment"),
                );
            }

            rustc_lexer::TokenKind::Whitespace => WHITESPACE,

            rustc_lexer::TokenKind::Ident => {
                // `_` lexes as an identifier in rustc_lexer but is a distinct
                // token kind here; keywords are likewise resolved from the text.
                if token_text == "_" {
                    UNDERSCORE
                } else {
                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
                }
            }

            // Raw identifiers (`r#foo`) are folded into plain IDENT.
            rustc_lexer::TokenKind::RawIdent => IDENT,
            rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(&kind),

            rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME,
            rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => {
                return (LIFETIME, Some("Lifetime name cannot start with a number"))
            }

            // One-to-one punctuation mappings.
            rustc_lexer::TokenKind::Semi => T![;],
            rustc_lexer::TokenKind::Comma => T![,],
            rustc_lexer::TokenKind::Dot => T![.],
            rustc_lexer::TokenKind::OpenParen => T!['('],
            rustc_lexer::TokenKind::CloseParen => T![')'],
            rustc_lexer::TokenKind::OpenBrace => T!['{'],
            rustc_lexer::TokenKind::CloseBrace => T!['}'],
            rustc_lexer::TokenKind::OpenBracket => T!['['],
            rustc_lexer::TokenKind::CloseBracket => T![']'],
            rustc_lexer::TokenKind::At => T![@],
            rustc_lexer::TokenKind::Pound => T![#],
            rustc_lexer::TokenKind::Tilde => T![~],
            rustc_lexer::TokenKind::Question => T![?],
            rustc_lexer::TokenKind::Colon => T![:],
            rustc_lexer::TokenKind::Dollar => T![$],
            rustc_lexer::TokenKind::Eq => T![=],
            rustc_lexer::TokenKind::Not => T![!],
            rustc_lexer::TokenKind::Lt => T![<],
            rustc_lexer::TokenKind::Gt => T![>],
            rustc_lexer::TokenKind::Minus => T![-],
            rustc_lexer::TokenKind::And => T![&],
            rustc_lexer::TokenKind::Or => T![|],
            rustc_lexer::TokenKind::Plus => T![+],
            rustc_lexer::TokenKind::Star => T![*],
            rustc_lexer::TokenKind::Slash => T![/],
            rustc_lexer::TokenKind::Caret => T![^],
            rustc_lexer::TokenKind::Percent => T![%],
            rustc_lexer::TokenKind::Unknown => ERROR,
        }
    };

    return (syntax_kind, None);

    // Maps a literal kind to a `SyntaxKind`, producing an error message for
    // every unterminated/malformed literal form.
    fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
        #[rustfmt::skip]
        let syntax_kind = match *kind {
            LK::Int { empty_int: false, .. } => INT_NUMBER,
            LK::Int { empty_int: true, .. } => {
                return (INT_NUMBER, Some("Missing digits after the integer base prefix"))
            }

            LK::Float { empty_exponent: false, .. } => FLOAT_NUMBER,
            LK::Float { empty_exponent: true, .. } => {
                return (FLOAT_NUMBER, Some("Missing digits after the exponent symbol"))
            }

            LK::Char { terminated: true } => CHAR,
            LK::Char { terminated: false } => {
                return (CHAR, Some("Missing trailing `'` symbol to terminate the character literal"))
            }

            LK::Byte { terminated: true } => BYTE,
            LK::Byte { terminated: false } => {
                return (BYTE, Some("Missing trailing `'` symbol to terminate the byte literal"))
            }

            LK::Str { terminated: true } => STRING,
            LK::Str { terminated: false } => {
                return (STRING, Some("Missing trailing `\"` symbol to terminate the string literal"))
            }


            LK::ByteStr { terminated: true } => BYTE_STRING,
            LK::ByteStr { terminated: false } => {
                return (BYTE_STRING, Some("Missing trailing `\"` symbol to terminate the byte string literal"))
            }

            // Raw strings carry a structured error from rustc_lexer describing
            // exactly which delimiter part is wrong.
            LK::RawStr { err, .. } => match err {
                None => RAW_STRING,
                Some(RawStrError::InvalidStarter { .. }) => return (RAW_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw string literal")),
                // `expected == found` means the `"` itself is missing;
                // otherwise the closing `#` run is too short.
                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
                    return (RAW_STRING, Some("Missing trailing `\"` to terminate the raw string literal"))
                } else {
                    return (RAW_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw string literal"))

                },
                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_STRING, Some("Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols")),
            },
            LK::RawByteStr { err, .. } => match err {
                None => RAW_BYTE_STRING,
                Some(RawStrError::InvalidStarter { .. }) => return (RAW_BYTE_STRING, Some("Missing `\"` symbol after `#` symbols to begin the raw byte string literal")),
                Some(RawStrError::NoTerminator { expected, found, .. }) => if expected == found {
                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` to terminate the raw byte string literal"))
                } else {
                    return (RAW_BYTE_STRING, Some("Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"))

                },
                Some(RawStrError::TooManyDelimiters { .. }) => return (RAW_BYTE_STRING, Some("Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols")),
            },
        };

        (syntax_kind, None)
    }
}