aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/parsing/lexer.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/ra_syntax/src/parsing/lexer.rs')
-rw-r--r--crates/ra_syntax/src/parsing/lexer.rs304
1 files changed, 149 insertions, 155 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index 9dca7d747..6d96f8400 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -1,10 +1,10 @@
1//! Lexer analyzes raw input string and produces lexemes (tokens). 1//! Lexer analyzes raw input string and produces lexemes (tokens).
2 2//! It is just a bridge to `rustc_lexer`.
3use std::iter::{FromIterator, IntoIterator};
4 3
5use crate::{ 4use crate::{
5 SyntaxError, SyntaxErrorKind,
6 SyntaxKind::{self, *}, 6 SyntaxKind::{self, *},
7 TextUnit, 7 TextRange, TextUnit,
8}; 8};
9 9
10/// A token of Rust source. 10/// A token of Rust source.
@@ -15,93 +15,96 @@ pub struct Token {
15 /// The length of the token. 15 /// The length of the token.
16 pub len: TextUnit, 16 pub len: TextUnit,
17} 17}
18impl Token {
19 pub const fn new(kind: SyntaxKind, len: TextUnit) -> Self {
20 Self { kind, len }
21 }
22}
23 18
24#[derive(Debug)] 19#[derive(Debug)]
25/// Represents the result of parsing one token. 20/// Represents the result of parsing one token. Beware that the token may be malformed.
26pub struct ParsedToken { 21pub struct ParsedToken {
27 /// Parsed token. 22 /// Parsed token.
28 pub token: Token, 23 pub token: Token,
29 /// If error is present then parsed token is malformed. 24 /// If error is present then parsed token is malformed.
30 pub error: Option<TokenizeError>, 25 pub error: Option<SyntaxError>,
31}
32impl ParsedToken {
33 pub const fn new(token: Token, error: Option<TokenizeError>) -> Self {
34 Self { token, error }
35 }
36} 26}
37 27
38#[derive(Debug, Default)] 28#[derive(Debug, Default)]
39/// Represents the result of parsing one token. 29/// Represents the result of parsing source code of Rust language.
40pub struct ParsedTokens { 30pub struct ParsedTokens {
41 /// Parsed token. 31 /// Parsed tokens in order they appear in source code.
42 pub tokens: Vec<Token>, 32 pub tokens: Vec<Token>,
43 /// If error is present then parsed token is malformed. 33 /// Collection of all occurred tokenization errors.
44 pub errors: Vec<TokenizeError>, 34 /// In general `self.errors.len() <= self.tokens.len()`
35 pub errors: Vec<SyntaxError>,
45} 36}
46 37impl ParsedTokens {
47impl FromIterator<ParsedToken> for ParsedTokens { 38 /// Append `token` and `error` (if present) to the result.
48 fn from_iter<I: IntoIterator<Item = ParsedToken>>(iter: I) -> Self { 39 pub fn push(&mut self, ParsedToken { token, error }: ParsedToken) {
49 let res = Self::default(); 40 self.tokens.push(token);
50 for entry in iter { 41 if let Some(error) = error {
51 res.tokens.push(entry.token); 42 self.errors.push(error)
52 if let Some(error) = entry.error {
53 res.errors.push(error);
54 }
55 } 43 }
56 res
57 } 44 }
58} 45}
59 46
60/// Returns the first encountered token from the string. 47/// Same as `tokenize_append()`, just a shortcut for creating `ParsedTokens`
61/// If the string contains zero or two or more tokens returns `None`. 48/// and returning the result the usual way.
62pub fn single_token(text: &str) -> Option<ParsedToken> { 49pub fn tokenize(text: &str) -> ParsedTokens {
63 // TODO: test whether this condition indeed checks for a single token 50 let mut parsed = ParsedTokens::default();
64 first_token(text).filter(|parsed| parsed.token.len.to_usize() == text.len()) 51 tokenize_append(text, &mut parsed);
52 parsed
65} 53}
66 54
67/* 55/// Break a string up into its component tokens.
68/// Returns `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<TokenizeError>)` 56/// Returns `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<SyntaxError>)`.
69/// This is just a shorthand for `tokenize(text).collect()` 57/// Beware that it checks for shebang first and its length contributes to resulting
70pub fn tokenize_to_vec_with_errors(text: &str) -> ParsedTokens { 58/// tokens offsets.
71 tokenize(text).collect() 59pub fn tokenize_append(text: &str, parsed: &mut ParsedTokens) {
72} 60 // non-empty string is a precondition of `rustc_lexer::strip_shebang()`.
61 if text.is_empty() {
62 return;
63 }
73 64
74/// The simplest version of tokenize, it just returns a ready-made `Vec<Token>`. 65 let mut offset: usize = rustc_lexer::strip_shebang(text)
75/// It discards all tokenization errors while parsing. If you need that information 66 .map(|shebang_len| {
76/// consider using `tokenize()` or `tokenize_to_vec_with_errors()`. 67 parsed.tokens.push(Token { kind: SHEBANG, len: TextUnit::from_usize(shebang_len) });
77pub fn tokenize_to_vec(text: &str) -> Vec<Token> { 68 shebang_len
78 tokenize(text).map(|parsed_token| parsed_token.token).collect() 69 })
79} 70 .unwrap_or(0);
80*/
81 71
82/// Break a string up into its component tokens 72 let text_without_shebang = &text[offset..];
83/// This is the core function, all other `tokenize*()` functions are simply
84/// handy shortcuts for this one.
85pub fn tokenize(text: &str) -> impl Iterator<Item = ParsedToken> + '_ {
86 let shebang = rustc_lexer::strip_shebang(text).map(|shebang_len| {
87 text = &text[shebang_len..];
88 ParsedToken::new(Token::new(SHEBANG, TextUnit::from_usize(shebang_len)), None)
89 });
90 73
91 // Notice that we eagerly evaluate shebang since it may change text slice 74 for rustc_token in rustc_lexer::tokenize(text_without_shebang) {
92 // and we cannot simplify this into a single method call chain 75 parsed.push(rustc_token_to_parsed_token(&rustc_token, text, TextUnit::from_usize(offset)));
93 shebang.into_iter().chain(tokenize_without_shebang(text)) 76 offset += rustc_token.len;
77 }
94} 78}
95 79
96pub fn tokenize_without_shebang(text: &str) -> impl Iterator<Item = ParsedToken> + '_ { 80/// Returns the first encountered token at the beginning of the string.
97 rustc_lexer::tokenize(text).map(|rustc_token| { 81/// If the string contains zero or *two or more tokens* returns `None`.
98 let token_text = &text[..rustc_token.len]; 82///
99 text = &text[rustc_token.len..]; 83/// The main difference between `first_token()` and `single_token()` is that
100 rustc_token_kind_to_parsed_token(&rustc_token.kind, token_text) 84/// the latter returns `None` if the string contains more than one token.
101 }) 85pub fn single_token(text: &str) -> Option<ParsedToken> {
86 first_token(text).filter(|parsed| parsed.token.len.to_usize() == text.len())
102} 87}
103 88
104#[derive(Debug)] 89/// Returns the first encountered token at the beginning of the string.
90/// If the string contains zero tokens returns `None`.
91///
92/// The main difference between `first_token()` and `single_token()` is that
93/// the latter returns `None` if the string contains more than one token.
94pub fn first_token(text: &str) -> Option<ParsedToken> {
95 // non-empty string is a precondition of `rustc_lexer::first_token()`.
96 if text.is_empty() {
97 None
98 } else {
99 let rustc_token = rustc_lexer::first_token(text);
100 Some(rustc_token_to_parsed_token(&rustc_token, text, TextUnit::from(0)))
101 }
102}
103
104/// Describes the values of `SyntaxErrorKind::TokenizeError` enum variant.
105/// It describes all the types of errors that may happen during the tokenization
106/// of Rust source.
107#[derive(Debug, Clone, PartialEq, Eq, Hash)]
105pub enum TokenizeError { 108pub enum TokenizeError {
106 /// Base prefix was provided, but there were no digits 109 /// Base prefix was provided, but there were no digits
107 /// after it, e.g. `0x`. 110 /// after it, e.g. `0x`.
@@ -124,94 +127,95 @@ pub enum TokenizeError {
124 /// Raw byte string literal lacks trailing delimiter e.g. `"##` 127 /// Raw byte string literal lacks trailing delimiter e.g. `"##`
125 UnterminatedRawByteString, 128 UnterminatedRawByteString,
126 129
127 /// Raw string lacks a quote after pound characters e.g. `r###` 130 /// Raw string lacks a quote after the pound characters e.g. `r###`
128 UnstartedRawString, 131 UnstartedRawString,
129 /// Raw byte string lacks a quote after pound characters e.g. `br###` 132 /// Raw byte string lacks a quote after the pound characters e.g. `br###`
130 UnstartedRawByteString, 133 UnstartedRawByteString,
131 134
132 /// Lifetime starts with a number e.g. `'4ever` 135 /// Lifetime starts with a number e.g. `'4ever`
133 LifetimeStartsWithNumber, 136 LifetimeStartsWithNumber,
134} 137}
135 138
136fn rustc_token_kind_to_parsed_token( 139/// Mapper function that converts `rustc_lexer::Token` with some additional context
137 rustc_token_kind: &rustc_lexer::TokenKind, 140/// to `ParsedToken`
138 token_text: &str, 141fn rustc_token_to_parsed_token(
142 rustc_token: &rustc_lexer::Token,
143 text: &str,
144 token_start_offset: TextUnit,
139) -> ParsedToken { 145) -> ParsedToken {
140 use rustc_lexer::TokenKind as TK;
141 use TokenizeError as TE;
142
143 // We drop some useful information here (see patterns with double dots `..`) 146 // We drop some useful information here (see patterns with double dots `..`)
144 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of 147 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
145 // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind` 148 // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind`
146 // would mean hell of a rewrite. 149 // would mean hell of a rewrite
147 150
148 let (syntax_kind, error) = match *rustc_token_kind { 151 let token_range =
149 TK::LineComment => ok(COMMENT), 152 TextRange::offset_len(token_start_offset, TextUnit::from_usize(rustc_token.len));
150 TK::BlockComment { terminated } => ok_if(terminated, COMMENT, TE::UnterminatedBlockComment), 153
151 TK::Whitespace => ok(WHITESPACE), 154 let token_text = &text[token_range];
152 TK::Ident => ok(if token_text == "_" { 155
153 UNDERSCORE 156 let (syntax_kind, error) = {
154 } else { 157 use rustc_lexer::TokenKind as TK;
155 SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) 158 use TokenizeError as TE;
156 }), 159
157 TK::RawIdent => ok(IDENT), 160 match rustc_token.kind {
158 TK::Literal { kind, .. } => match_literal_kind(&kind), 161 TK::LineComment => ok(COMMENT),
159 TK::Lifetime { starts_with_number } => { 162 TK::BlockComment { terminated } => {
160 ok_if(!starts_with_number, LIFETIME, TE::LifetimeStartsWithNumber) 163 ok_if(terminated, COMMENT, TE::UnterminatedBlockComment)
164 }
165 TK::Whitespace => ok(WHITESPACE),
166 TK::Ident => ok(if token_text == "_" {
167 UNDERSCORE
168 } else {
169 SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
170 }),
171 TK::RawIdent => ok(IDENT),
172 TK::Literal { kind, .. } => match_literal_kind(&kind),
173 TK::Lifetime { starts_with_number } => {
174 ok_if(!starts_with_number, LIFETIME, TE::LifetimeStartsWithNumber)
175 }
176 TK::Semi => ok(SEMI),
177 TK::Comma => ok(COMMA),
178 TK::Dot => ok(DOT),
179 TK::OpenParen => ok(L_PAREN),
180 TK::CloseParen => ok(R_PAREN),
181 TK::OpenBrace => ok(L_CURLY),
182 TK::CloseBrace => ok(R_CURLY),
183 TK::OpenBracket => ok(L_BRACK),
184 TK::CloseBracket => ok(R_BRACK),
185 TK::At => ok(AT),
186 TK::Pound => ok(POUND),
187 TK::Tilde => ok(TILDE),
188 TK::Question => ok(QUESTION),
189 TK::Colon => ok(COLON),
190 TK::Dollar => ok(DOLLAR),
191 TK::Eq => ok(EQ),
192 TK::Not => ok(EXCL),
193 TK::Lt => ok(L_ANGLE),
194 TK::Gt => ok(R_ANGLE),
195 TK::Minus => ok(MINUS),
196 TK::And => ok(AMP),
197 TK::Or => ok(PIPE),
198 TK::Plus => ok(PLUS),
199 TK::Star => ok(STAR),
200 TK::Slash => ok(SLASH),
201 TK::Caret => ok(CARET),
202 TK::Percent => ok(PERCENT),
203 TK::Unknown => ok(ERROR),
161 } 204 }
162 TK::Semi => ok(SEMI),
163 TK::Comma => ok(COMMA),
164 TK::Dot => ok(DOT),
165 TK::OpenParen => ok(L_PAREN),
166 TK::CloseParen => ok(R_PAREN),
167 TK::OpenBrace => ok(L_CURLY),
168 TK::CloseBrace => ok(R_CURLY),
169 TK::OpenBracket => ok(L_BRACK),
170 TK::CloseBracket => ok(R_BRACK),
171 TK::At => ok(AT),
172 TK::Pound => ok(POUND),
173 TK::Tilde => ok(TILDE),
174 TK::Question => ok(QUESTION),
175 TK::Colon => ok(COLON),
176 TK::Dollar => ok(DOLLAR),
177 TK::Eq => ok(EQ),
178 TK::Not => ok(EXCL),
179 TK::Lt => ok(L_ANGLE),
180 TK::Gt => ok(R_ANGLE),
181 TK::Minus => ok(MINUS),
182 TK::And => ok(AMP),
183 TK::Or => ok(PIPE),
184 TK::Plus => ok(PLUS),
185 TK::Star => ok(STAR),
186 TK::Slash => ok(SLASH),
187 TK::Caret => ok(CARET),
188 TK::Percent => ok(PERCENT),
189 TK::Unknown => ok(ERROR),
190 }; 205 };
191 206
192 return ParsedToken::new( 207 return ParsedToken {
193 Token::new(syntax_kind, TextUnit::from_usize(token_text.len())), 208 token: Token { kind: syntax_kind, len: token_range.len() },
194 error, 209 error: error
195 ); 210 .map(|error| SyntaxError::new(SyntaxErrorKind::TokenizeError(error), token_range)),
211 };
196 212
197 type ParsedSyntaxKind = (SyntaxKind, Option<TokenizeError>); 213 type ParsedSyntaxKind = (SyntaxKind, Option<TokenizeError>);
198 214
199 const fn ok(syntax_kind: SyntaxKind) -> ParsedSyntaxKind { 215 fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> ParsedSyntaxKind {
200 (syntax_kind, None)
201 }
202 const fn ok_if(cond: bool, syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
203 if cond {
204 ok(syntax_kind)
205 } else {
206 err(syntax_kind, error)
207 }
208 }
209 const fn err(syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
210 (syntax_kind, Some(error))
211 }
212
213 const fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> ParsedSyntaxKind {
214 use rustc_lexer::LiteralKind as LK; 216 use rustc_lexer::LiteralKind as LK;
217 use TokenizeError as TE;
218
215 match *kind { 219 match *kind {
216 LK::Int { empty_int, .. } => ok_if(!empty_int, INT_NUMBER, TE::EmptyInt), 220 LK::Int { empty_int, .. } => ok_if(!empty_int, INT_NUMBER, TE::EmptyInt),
217 LK::Float { empty_exponent, .. } => { 221 LK::Float { empty_exponent, .. } => {
@@ -237,27 +241,17 @@ fn rustc_token_kind_to_parsed_token(
237 } 241 }
238 } 242 }
239 } 243 }
240} 244 const fn ok(syntax_kind: SyntaxKind) -> ParsedSyntaxKind {
241 245 (syntax_kind, None)
242pub fn first_token(text: &str) -> Option<ParsedToken> {
243 // Checking for emptiness because of `rustc_lexer::first_token()` invariant (see its body)
244 if text.is_empty() {
245 None
246 } else {
247 let rustc_token = rustc_lexer::first_token(text);
248 Some(rustc_token_kind_to_parsed_token(&rustc_token.kind, &text[..rustc_token.len]))
249 } 246 }
250} 247 const fn err(syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
251 248 (syntax_kind, Some(error))
252// TODO: think what to do with this ad hoc function 249 }
253pub fn classify_literal(text: &str) -> Option<ParsedToken> { 250 fn ok_if(cond: bool, syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
254 let t = rustc_lexer::first_token(text); 251 if cond {
255 if t.len != text.len() { 252 ok(syntax_kind)
256 return None; 253 } else {
254 err(syntax_kind, error)
255 }
257 } 256 }
258 let kind = match t.kind {
259 rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind),
260 _ => return None,
261 };
262 Some(ParsedToken::new(Token::new(kind, TextUnit::from_usize(t.len))))
263} 257}