aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/parsing
diff options
context:
space:
mode:
Diffstat (limited to 'crates/ra_syntax/src/parsing')
-rw-r--r--crates/ra_syntax/src/parsing/lexer.rs299
-rw-r--r--crates/ra_syntax/src/parsing/reparsing.rs3
2 files changed, 230 insertions, 72 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index 6d839208d..9dca7d747 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -1,4 +1,6 @@
1//! FIXME: write short doc here 1//! Lexer analyzes raw input string and produces lexemes (tokens).
2
3use std::iter::{FromIterator, IntoIterator};
2 4
3use crate::{ 5use crate::{
4 SyntaxKind::{self, *}, 6 SyntaxKind::{self, *},
@@ -13,85 +15,242 @@ pub struct Token {
13 /// The length of the token. 15 /// The length of the token.
14 pub len: TextUnit, 16 pub len: TextUnit,
15} 17}
18impl Token {
19 pub const fn new(kind: SyntaxKind, len: TextUnit) -> Self {
20 Self { kind, len }
21 }
22}
16 23
17fn match_literal_kind(kind: rustc_lexer::LiteralKind) -> SyntaxKind { 24#[derive(Debug)]
18 match kind { 25/// Represents the result of parsing one token.
19 rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, 26pub struct ParsedToken {
20 rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, 27 /// Parsed token.
21 rustc_lexer::LiteralKind::Char { .. } => CHAR, 28 pub token: Token,
22 rustc_lexer::LiteralKind::Byte { .. } => BYTE, 29 /// If error is present then parsed token is malformed.
23 rustc_lexer::LiteralKind::Str { .. } => STRING, 30 pub error: Option<TokenizeError>,
24 rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, 31}
25 rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, 32impl ParsedToken {
26 rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, 33 pub const fn new(token: Token, error: Option<TokenizeError>) -> Self {
34 Self { token, error }
27 } 35 }
28} 36}
29 37
38#[derive(Debug, Default)]
39/// Represents the result of parsing a sequence of tokens: all tokens plus all errors.
40pub struct ParsedTokens {
41 /// Parsed tokens.
42 pub tokens: Vec<Token>,
43 /// Errors of the malformed tokens; empty if every token was parsed without errors.
44 pub errors: Vec<TokenizeError>,
45}
46
47impl FromIterator<ParsedToken> for ParsedTokens {
48 fn from_iter<I: IntoIterator<Item = ParsedToken>>(iter: I) -> Self {
49 let res = Self::default();
50 for entry in iter {
51 res.tokens.push(entry.token);
52 if let Some(error) = entry.error {
53 res.errors.push(error);
54 }
55 }
56 res
57 }
58}
59
60/// Returns the first encountered token from the string.
61/// If the string contains zero tokens, or two or more tokens, returns `None`.
62pub fn single_token(text: &str) -> Option<ParsedToken> {
63 // TODO: test whether this condition indeed checks for a single token
64 first_token(text).filter(|parsed| parsed.token.len.to_usize() == text.len())
65}
66
67/*
68/// Returns `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<TokenizeError>)`
69/// This is just a shorthand for `tokenize(text).collect()`
70pub fn tokenize_to_vec_with_errors(text: &str) -> ParsedTokens {
71 tokenize(text).collect()
72}
73
74/// The simplest version of tokenize, it just returns a ready-made `Vec<Token>`.
75/// It discards all tokenization errors while parsing. If you need that information
76/// consider using `tokenize()` or `tokenize_to_vec_with_errors()`.
77pub fn tokenize_to_vec(text: &str) -> Vec<Token> {
78 tokenize(text).map(|parsed_token| parsed_token.token).collect()
79}
80*/
81
30/// Break a string up into its component tokens 82/// Break a string up into its component tokens
31pub fn tokenize(text: &str) -> Vec<Token> { 83/// This is the core function, all other `tokenize*()` functions are simply
32 if text.is_empty() { 84/// handy shortcuts for this one.
33 return vec![]; 85pub fn tokenize(text: &str) -> impl Iterator<Item = ParsedToken> + '_ {
86 let shebang = rustc_lexer::strip_shebang(text).map(|shebang_len| {
87 text = &text[shebang_len..];
88 ParsedToken::new(Token::new(SHEBANG, TextUnit::from_usize(shebang_len)), None)
89 });
90
91 // Notice that we eagerly evaluate shebang since it may change text slice
92 // and we cannot simplify this into a single method call chain
93 shebang.into_iter().chain(tokenize_without_shebang(text))
94}
95
96pub fn tokenize_without_shebang(text: &str) -> impl Iterator<Item = ParsedToken> + '_ {
97 rustc_lexer::tokenize(text).map(|rustc_token| {
98 let token_text = &text[..rustc_token.len];
99 text = &text[rustc_token.len..];
100 rustc_token_kind_to_parsed_token(&rustc_token.kind, token_text)
101 })
102}
103
104#[derive(Debug)]
105pub enum TokenizeError {
106 /// Base prefix was provided, but there were no digits
107 /// after it, e.g. `0x`.
108 EmptyInt,
109 /// Float exponent lacks digits e.g. `e+`, `E+`, `e-`, `E-`,
110 EmptyExponent,
111
112 /// Block comment lacks trailing delimiter `*/`
113 UnterminatedBlockComment,
114 /// Character literal lacks trailing delimiter `'`
115 UnterminatedChar,
116 /// Characterish byte literal lacks trailing delimiter `'`
117 UnterminatedByte,
118 /// String literal lacks trailing delimiter `"`
119 UnterminatedString,
120 /// Byte string literal lacks trailing delimiter `"`
121 UnterminatedByteString,
122 /// Raw string literal lacks trailing delimiter e.g. `"##`
123 UnterminatedRawString,
124 /// Raw byte string literal lacks trailing delimiter e.g. `"##`
125 UnterminatedRawByteString,
126
127 /// Raw string lacks a quote after pound characters e.g. `r###`
128 UnstartedRawString,
129 /// Raw byte string lacks a quote after pound characters e.g. `br###`
130 UnstartedRawByteString,
131
132 /// Lifetime starts with a number e.g. `'4ever`
133 LifetimeStartsWithNumber,
134}
135
136fn rustc_token_kind_to_parsed_token(
137 rustc_token_kind: &rustc_lexer::TokenKind,
138 token_text: &str,
139) -> ParsedToken {
140 use rustc_lexer::TokenKind as TK;
141 use TokenizeError as TE;
142
143 // We drop some useful information here (see patterns with double dots `..`)
144 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
145 // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind`
146 // would mean hell of a rewrite.
147
148 let (syntax_kind, error) = match *rustc_token_kind {
149 TK::LineComment => ok(COMMENT),
150 TK::BlockComment { terminated } => ok_if(terminated, COMMENT, TE::UnterminatedBlockComment),
151 TK::Whitespace => ok(WHITESPACE),
152 TK::Ident => ok(if token_text == "_" {
153 UNDERSCORE
154 } else {
155 SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
156 }),
157 TK::RawIdent => ok(IDENT),
158 TK::Literal { kind, .. } => match_literal_kind(&kind),
159 TK::Lifetime { starts_with_number } => {
160 ok_if(!starts_with_number, LIFETIME, TE::LifetimeStartsWithNumber)
161 }
162 TK::Semi => ok(SEMI),
163 TK::Comma => ok(COMMA),
164 TK::Dot => ok(DOT),
165 TK::OpenParen => ok(L_PAREN),
166 TK::CloseParen => ok(R_PAREN),
167 TK::OpenBrace => ok(L_CURLY),
168 TK::CloseBrace => ok(R_CURLY),
169 TK::OpenBracket => ok(L_BRACK),
170 TK::CloseBracket => ok(R_BRACK),
171 TK::At => ok(AT),
172 TK::Pound => ok(POUND),
173 TK::Tilde => ok(TILDE),
174 TK::Question => ok(QUESTION),
175 TK::Colon => ok(COLON),
176 TK::Dollar => ok(DOLLAR),
177 TK::Eq => ok(EQ),
178 TK::Not => ok(EXCL),
179 TK::Lt => ok(L_ANGLE),
180 TK::Gt => ok(R_ANGLE),
181 TK::Minus => ok(MINUS),
182 TK::And => ok(AMP),
183 TK::Or => ok(PIPE),
184 TK::Plus => ok(PLUS),
185 TK::Star => ok(STAR),
186 TK::Slash => ok(SLASH),
187 TK::Caret => ok(CARET),
188 TK::Percent => ok(PERCENT),
189 TK::Unknown => ok(ERROR),
190 };
191
192 return ParsedToken::new(
193 Token::new(syntax_kind, TextUnit::from_usize(token_text.len())),
194 error,
195 );
196
197 type ParsedSyntaxKind = (SyntaxKind, Option<TokenizeError>);
198
199 const fn ok(syntax_kind: SyntaxKind) -> ParsedSyntaxKind {
200 (syntax_kind, None)
34 } 201 }
35 let mut text = text; 202 const fn ok_if(cond: bool, syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
36 let mut acc = Vec::new(); 203 if cond {
37 if let Some(len) = rustc_lexer::strip_shebang(text) { 204 ok(syntax_kind)
38 acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) }); 205 } else {
39 text = &text[len..]; 206 err(syntax_kind, error)
207 }
40 } 208 }
41 while !text.is_empty() { 209 const fn err(syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind {
42 let rustc_token = rustc_lexer::first_token(text); 210 (syntax_kind, Some(error))
43 let kind = match rustc_token.kind { 211 }
44 rustc_lexer::TokenKind::LineComment => COMMENT, 212
45 rustc_lexer::TokenKind::BlockComment { .. } => COMMENT, 213 const fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> ParsedSyntaxKind {
46 rustc_lexer::TokenKind::Whitespace => WHITESPACE, 214 use rustc_lexer::LiteralKind as LK;
47 rustc_lexer::TokenKind::Ident => { 215 match *kind {
48 let token_text = &text[..rustc_token.len]; 216 LK::Int { empty_int, .. } => ok_if(!empty_int, INT_NUMBER, TE::EmptyInt),
49 if token_text == "_" { 217 LK::Float { empty_exponent, .. } => {
50 UNDERSCORE 218 ok_if(!empty_exponent, FLOAT_NUMBER, TE::EmptyExponent)
51 } else {
52 SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT)
53 }
54 } 219 }
55 rustc_lexer::TokenKind::RawIdent => IDENT, 220 LK::Char { terminated } => ok_if(terminated, CHAR, TE::UnterminatedChar),
56 rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), 221 LK::Byte { terminated } => ok_if(terminated, BYTE, TE::UnterminatedByte),
57 rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME, 222 LK::Str { terminated } => ok_if(terminated, STRING, TE::UnterminatedString),
58 rustc_lexer::TokenKind::Semi => SEMI, 223 LK::ByteStr { terminated } => {
59 rustc_lexer::TokenKind::Comma => COMMA, 224 ok_if(terminated, BYTE_STRING, TE::UnterminatedByteString)
60 rustc_lexer::TokenKind::Dot => DOT, 225 }
61 rustc_lexer::TokenKind::OpenParen => L_PAREN, 226
62 rustc_lexer::TokenKind::CloseParen => R_PAREN, 227 LK::RawStr { started: true, terminated, .. } => {
63 rustc_lexer::TokenKind::OpenBrace => L_CURLY, 228 ok_if(terminated, RAW_STRING, TE::UnterminatedRawString)
64 rustc_lexer::TokenKind::CloseBrace => R_CURLY, 229 }
65 rustc_lexer::TokenKind::OpenBracket => L_BRACK, 230 LK::RawStr { started: false, .. } => err(RAW_STRING, TE::UnstartedRawString),
66 rustc_lexer::TokenKind::CloseBracket => R_BRACK, 231
67 rustc_lexer::TokenKind::At => AT, 232 LK::RawByteStr { started: true, terminated, .. } => {
68 rustc_lexer::TokenKind::Pound => POUND, 233 ok_if(terminated, RAW_BYTE_STRING, TE::UnterminatedRawByteString)
69 rustc_lexer::TokenKind::Tilde => TILDE, 234 }
70 rustc_lexer::TokenKind::Question => QUESTION, 235 LK::RawByteStr { started: false, .. } => {
71 rustc_lexer::TokenKind::Colon => COLON, 236 err(RAW_BYTE_STRING, TE::UnstartedRawByteString)
72 rustc_lexer::TokenKind::Dollar => DOLLAR, 237 }
73 rustc_lexer::TokenKind::Eq => EQ, 238 }
74 rustc_lexer::TokenKind::Not => EXCL, 239 }
75 rustc_lexer::TokenKind::Lt => L_ANGLE, 240}
76 rustc_lexer::TokenKind::Gt => R_ANGLE, 241
77 rustc_lexer::TokenKind::Minus => MINUS, 242pub fn first_token(text: &str) -> Option<ParsedToken> {
78 rustc_lexer::TokenKind::And => AMP, 243 // Checking for emptiness because of `rustc_lexer::first_token()` invariant (see its body)
79 rustc_lexer::TokenKind::Or => PIPE, 244 if text.is_empty() {
80 rustc_lexer::TokenKind::Plus => PLUS, 245 None
81 rustc_lexer::TokenKind::Star => STAR, 246 } else {
82 rustc_lexer::TokenKind::Slash => SLASH, 247 let rustc_token = rustc_lexer::first_token(text);
83 rustc_lexer::TokenKind::Caret => CARET, 248 Some(rustc_token_kind_to_parsed_token(&rustc_token.kind, &text[..rustc_token.len]))
84 rustc_lexer::TokenKind::Percent => PERCENT,
85 rustc_lexer::TokenKind::Unknown => ERROR,
86 };
87 let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) };
88 acc.push(token);
89 text = &text[rustc_token.len..];
90 } 249 }
91 acc
92} 250}
93 251
94pub fn classify_literal(text: &str) -> Option<Token> { 252// TODO: think what to do with this ad hoc function
253pub fn classify_literal(text: &str) -> Option<ParsedToken> {
95 let t = rustc_lexer::first_token(text); 254 let t = rustc_lexer::first_token(text);
96 if t.len != text.len() { 255 if t.len != text.len() {
97 return None; 256 return None;
@@ -100,5 +259,5 @@ pub fn classify_literal(text: &str) -> Option<Token> {
100 rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), 259 rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind),
101 _ => return None, 260 _ => return None,
102 }; 261 };
103 Some(Token { kind, len: TextUnit::from_usize(t.len) }) 262 Some(ParsedToken::new(Token::new(kind, TextUnit::from_usize(t.len))))
104} 263}
diff --git a/crates/ra_syntax/src/parsing/reparsing.rs b/crates/ra_syntax/src/parsing/reparsing.rs
index 06bdda11d..3abc09877 100644
--- a/crates/ra_syntax/src/parsing/reparsing.rs
+++ b/crates/ra_syntax/src/parsing/reparsing.rs
@@ -46,8 +46,7 @@ fn reparse_token<'node>(
46 WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => { 46 WHITESPACE | COMMENT | IDENT | STRING | RAW_STRING => {
47 if token.kind() == WHITESPACE || token.kind() == COMMENT { 47 if token.kind() == WHITESPACE || token.kind() == COMMENT {
48 // removing a new line may extend previous token 48 // removing a new line may extend previous token
49 if token.text().to_string()[edit.delete - token.text_range().start()].contains('\n') 49 if token.text()[edit.delete - token.text_range().start()].contains('\n') {
50 {
51 return None; 50 return None;
52 } 51 }
53 } 52 }