diff options
author | Adolfo OchagavĂa <[email protected]> | 2018-11-04 14:06:38 +0000 |
---|---|---|
committer | Adolfo OchagavĂa <[email protected]> | 2018-11-04 14:17:24 +0000 |
commit | 9b5bbab104d8ba445143f6f3a9e4149b40c29ae5 (patch) | |
tree | a1ef0fa5dbfd431e8a58afc6542c32c9ecefed04 | |
parent | 19c6cbd9540ef87850161cad7e108b380eceea24 (diff) |
Add character literal parsing and validation
-rw-r--r-- | crates/ra_syntax/src/ast/generated.rs | 34 | ||||
-rw-r--r-- | crates/ra_syntax/src/ast/mod.rs | 6 | ||||
-rw-r--r-- | crates/ra_syntax/src/grammar.ron | 1 | ||||
-rw-r--r-- | crates/ra_syntax/src/lib.rs | 7 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/mod.rs | 311 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation.rs | 40 |
6 files changed, 397 insertions, 2 deletions
diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs index f77795d05..75769a4e9 100644 --- a/crates/ra_syntax/src/ast/generated.rs +++ b/crates/ra_syntax/src/ast/generated.rs | |||
@@ -409,6 +409,40 @@ impl<'a> AstNode<'a> for CastExpr<'a> { | |||
409 | 409 | ||
410 | impl<'a> CastExpr<'a> {} | 410 | impl<'a> CastExpr<'a> {} |
411 | 411 | ||
412 | // Char | ||
413 | |||
414 | #[derive(Debug, Clone)] | ||
415 | pub struct CharNode(SyntaxNode); | ||
416 | |||
417 | impl CharNode { | ||
418 | pub fn ast(&self) -> Char { | ||
419 | Char::cast(self.0.borrowed()).unwrap() | ||
420 | } | ||
421 | } | ||
422 | |||
423 | impl<'a> From<Char<'a>> for CharNode { | ||
424 | fn from(ast: Char<'a>) -> CharNode { | ||
425 | let syntax = ast.syntax().owned(); | ||
426 | CharNode(syntax) | ||
427 | } | ||
428 | } | ||
429 | #[derive(Debug, Clone, Copy)] | ||
430 | pub struct Char<'a> { | ||
431 | syntax: SyntaxNodeRef<'a>, | ||
432 | } | ||
433 | |||
434 | impl<'a> AstNode<'a> for Char<'a> { | ||
435 | fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> { | ||
436 | match syntax.kind() { | ||
437 | CHAR => Some(Char { syntax }), | ||
438 | _ => None, | ||
439 | } | ||
440 | } | ||
441 | fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } | ||
442 | } | ||
443 | |||
444 | impl<'a> Char<'a> {} | ||
445 | |||
412 | // Comment | 446 | // Comment |
413 | 447 | ||
414 | #[derive(Debug, Clone)] | 448 | #[derive(Debug, Clone)] |
diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs index 688ffff47..4355531d0 100644 --- a/crates/ra_syntax/src/ast/mod.rs +++ b/crates/ra_syntax/src/ast/mod.rs | |||
@@ -123,6 +123,12 @@ impl<'a> Lifetime<'a> { | |||
123 | } | 123 | } |
124 | } | 124 | } |
125 | 125 | ||
126 | impl<'a> Char<'a> { | ||
127 | pub fn text(&self) -> &SmolStr { | ||
128 | &self.syntax().leaf_text().unwrap() | ||
129 | } | ||
130 | } | ||
131 | |||
126 | impl<'a> Comment<'a> { | 132 | impl<'a> Comment<'a> { |
127 | pub fn text(&self) -> &SmolStr { | 133 | pub fn text(&self) -> &SmolStr { |
128 | self.syntax().leaf_text().unwrap() | 134 | self.syntax().leaf_text().unwrap() |
diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron index 0454fd8e4..2ed165c52 100644 --- a/crates/ra_syntax/src/grammar.ron +++ b/crates/ra_syntax/src/grammar.ron | |||
@@ -406,6 +406,7 @@ Grammar( | |||
406 | "PrefixExpr": (), | 406 | "PrefixExpr": (), |
407 | "RangeExpr": (), | 407 | "RangeExpr": (), |
408 | "BinExpr": (), | 408 | "BinExpr": (), |
409 | "Char": (), | ||
409 | "Literal": (), | 410 | "Literal": (), |
410 | 411 | ||
411 | "Expr": ( | 412 | "Expr": ( |
diff --git a/crates/ra_syntax/src/lib.rs b/crates/ra_syntax/src/lib.rs index 79394fd53..8996eb921 100644 --- a/crates/ra_syntax/src/lib.rs +++ b/crates/ra_syntax/src/lib.rs | |||
@@ -39,11 +39,12 @@ mod grammar; | |||
39 | mod parser_api; | 39 | mod parser_api; |
40 | mod parser_impl; | 40 | mod parser_impl; |
41 | mod reparsing; | 41 | mod reparsing; |
42 | 42 | mod string_lexing; | |
43 | mod syntax_kinds; | 43 | mod syntax_kinds; |
44 | pub mod text_utils; | 44 | pub mod text_utils; |
45 | /// Utilities for simple uses of the parser. | 45 | /// Utilities for simple uses of the parser. |
46 | pub mod utils; | 46 | pub mod utils; |
47 | mod validation; | ||
47 | mod yellow; | 48 | mod yellow; |
48 | 49 | ||
49 | pub use crate::{ | 50 | pub use crate::{ |
@@ -98,6 +99,8 @@ impl File { | |||
98 | self.root.borrowed() | 99 | self.root.borrowed() |
99 | } | 100 | } |
100 | pub fn errors(&self) -> Vec<SyntaxError> { | 101 | pub fn errors(&self) -> Vec<SyntaxError> { |
101 | self.root.root_data().clone() | 102 | let mut errors = self.root.root_data().clone(); |
103 | errors.extend(validation::validate(self)); | ||
104 | errors | ||
102 | } | 105 | } |
103 | } | 106 | } |
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs new file mode 100644 index 000000000..6b52c62c3 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/mod.rs | |||
@@ -0,0 +1,311 @@ | |||
1 | use self::CharComponentKind::*; | ||
2 | use rowan::{TextRange, TextUnit}; | ||
3 | |||
4 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { | ||
5 | CharComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
12 | pub struct CharComponent { | ||
13 | pub range: TextRange, | ||
14 | pub kind: CharComponentKind, | ||
15 | } | ||
16 | |||
17 | impl CharComponent { | ||
18 | fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { | ||
19 | CharComponent { range, kind } | ||
20 | } | ||
21 | } | ||
22 | |||
23 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
24 | pub enum CharComponentKind { | ||
25 | CodePoint, | ||
26 | AsciiEscape, | ||
27 | AsciiCodeEscape, | ||
28 | UnicodeEscape, | ||
29 | } | ||
30 | |||
31 | pub struct CharComponentIterator<'a> { | ||
32 | parser: Parser<'a>, | ||
33 | pub has_closing_quote: bool, | ||
34 | } | ||
35 | |||
36 | impl<'a> Iterator for CharComponentIterator<'a> { | ||
37 | type Item = CharComponent; | ||
38 | fn next(&mut self) -> Option<CharComponent> { | ||
39 | if self.parser.pos == 0 { | ||
40 | assert!( | ||
41 | self.parser.advance() == '\'', | ||
42 | "char literal should start with a quote" | ||
43 | ); | ||
44 | } | ||
45 | |||
46 | if let Some(component) = self.parser.parse_char_component() { | ||
47 | return Some(component); | ||
48 | } | ||
49 | |||
50 | // We get here when there are no char components left to parse | ||
51 | if self.parser.peek() == Some('\'') { | ||
52 | self.parser.advance(); | ||
53 | self.has_closing_quote = true; | ||
54 | } | ||
55 | |||
56 | assert!( | ||
57 | self.parser.peek() == None, | ||
58 | "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
59 | self.parser.src, | ||
60 | self.parser.pos, | ||
61 | self.parser.src.len() | ||
62 | ); | ||
63 | |||
64 | None | ||
65 | } | ||
66 | } | ||
67 | |||
68 | pub struct Parser<'a> { | ||
69 | src: &'a str, | ||
70 | pos: usize, | ||
71 | } | ||
72 | |||
73 | impl<'a> Parser<'a> { | ||
74 | pub fn new(src: &'a str) -> Parser<'a> { | ||
75 | Parser { src, pos: 0 } | ||
76 | } | ||
77 | |||
78 | // Utility methods | ||
79 | |||
80 | pub fn peek(&self) -> Option<char> { | ||
81 | if self.pos == self.src.len() { | ||
82 | return None; | ||
83 | } | ||
84 | |||
85 | self.src[self.pos..].chars().next() | ||
86 | } | ||
87 | |||
88 | pub fn advance(&mut self) -> char { | ||
89 | let next = self | ||
90 | .peek() | ||
91 | .expect("cannot advance if end of input is reached"); | ||
92 | self.pos += next.len_utf8(); | ||
93 | next | ||
94 | } | ||
95 | |||
96 | pub fn get_pos(&self) -> TextUnit { | ||
97 | (self.pos as u32).into() | ||
98 | } | ||
99 | |||
100 | // Char parsing methods | ||
101 | |||
102 | fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { | ||
103 | // Note: validation of UnicodeEscape will be done elsewhere: | ||
104 | // * Only hex digits or underscores allowed | ||
105 | // * Max 6 chars | ||
106 | // * Within allowed range (must be at most 10FFFF) | ||
107 | match self.peek() { | ||
108 | Some('{') => { | ||
109 | self.advance(); | ||
110 | |||
111 | // Parse anything until we reach `}` | ||
112 | while let Some(next) = self.peek() { | ||
113 | self.advance(); | ||
114 | if next == '}' { | ||
115 | break; | ||
116 | } | ||
117 | } | ||
118 | |||
119 | let end = self.get_pos(); | ||
120 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
121 | } | ||
122 | Some(_) | None => { | ||
123 | let end = self.get_pos(); | ||
124 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
125 | } | ||
126 | } | ||
127 | } | ||
128 | |||
129 | fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { | ||
130 | // Note: validation of AsciiCodeEscape will be done elsewhere: | ||
131 | // * First digit is octal | ||
132 | // * Second digit is hex | ||
133 | let code_start = self.get_pos(); | ||
134 | while let Some(next) = self.peek() { | ||
135 | if next == '\'' || (self.get_pos() - code_start == 2.into()) { | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | self.advance(); | ||
140 | } | ||
141 | |||
142 | let end = self.get_pos(); | ||
143 | CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) | ||
144 | } | ||
145 | |||
146 | fn parse_escape(&mut self, start: TextUnit) -> CharComponent { | ||
147 | // Note: validation of AsciiEscape will be done elsewhere: | ||
148 | // * The escape sequence is non-empty | ||
149 | // * The escape sequence is valid | ||
150 | if self.peek().is_none() { | ||
151 | return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); | ||
152 | } | ||
153 | |||
154 | let next = self.advance(); | ||
155 | let end = self.get_pos(); | ||
156 | let range = TextRange::from_to(start, end); | ||
157 | match next { | ||
158 | 'x' => self.parse_ascii_code_escape(start), | ||
159 | 'u' => self.parse_unicode_escape(start), | ||
160 | _ => CharComponent::new(range, AsciiEscape), | ||
161 | } | ||
162 | } | ||
163 | |||
164 | pub fn parse_char_component(&mut self) -> Option<CharComponent> { | ||
165 | let next = self.peek()?; | ||
166 | |||
167 | // Ignore character close | ||
168 | if next == '\'' { | ||
169 | return None; | ||
170 | } | ||
171 | |||
172 | let start = self.get_pos(); | ||
173 | self.advance(); | ||
174 | |||
175 | if next == '\\' { | ||
176 | Some(self.parse_escape(start)) | ||
177 | } else { | ||
178 | let end = self.get_pos(); | ||
179 | Some(CharComponent::new( | ||
180 | TextRange::from_to(start, end), | ||
181 | CodePoint, | ||
182 | )) | ||
183 | } | ||
184 | } | ||
185 | } | ||
186 | |||
187 | #[cfg(test)] | ||
188 | mod tests { | ||
189 | use super::*; | ||
190 | |||
191 | fn parse(src: &str) -> (bool, Vec<CharComponent>) { | ||
192 | let component_iterator = &mut super::parse_char_literal(src); | ||
193 | let components: Vec<_> = component_iterator.collect(); | ||
194 | (component_iterator.has_closing_quote, components) | ||
195 | } | ||
196 | |||
197 | fn unclosed_char_component(src: &str) -> CharComponent { | ||
198 | let (has_closing_quote, components) = parse(src); | ||
199 | assert!(!has_closing_quote, "char should not have closing quote"); | ||
200 | assert!(components.len() == 1); | ||
201 | components[0].clone() | ||
202 | } | ||
203 | |||
204 | fn closed_char_component(src: &str) -> CharComponent { | ||
205 | let (has_closing_quote, components) = parse(src); | ||
206 | assert!(has_closing_quote, "char should have closing quote"); | ||
207 | assert!( | ||
208 | components.len() == 1, | ||
209 | "Literal: {}\nComponents: {:#?}", | ||
210 | src, | ||
211 | components | ||
212 | ); | ||
213 | components[0].clone() | ||
214 | } | ||
215 | |||
216 | fn closed_char_components(src: &str) -> Vec<CharComponent> { | ||
217 | let (has_closing_quote, components) = parse(src); | ||
218 | assert!(has_closing_quote, "char should have closing quote"); | ||
219 | components | ||
220 | } | ||
221 | |||
222 | fn range_closed(src: &str) -> TextRange { | ||
223 | TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) | ||
224 | } | ||
225 | |||
226 | fn range_unclosed(src: &str) -> TextRange { | ||
227 | TextRange::from_to(1.into(), (src.len() as u32).into()) | ||
228 | } | ||
229 | |||
230 | #[test] | ||
231 | fn test_unicode_escapes() { | ||
232 | let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""]; | ||
233 | for escape in unicode_escapes { | ||
234 | let escape_sequence = format!(r"'\u{}'", escape); | ||
235 | let component = closed_char_component(&escape_sequence); | ||
236 | let expected_range = range_closed(&escape_sequence); | ||
237 | assert_eq!(component.kind, CharComponentKind::UnicodeEscape); | ||
238 | assert_eq!(component.range, expected_range); | ||
239 | } | ||
240 | } | ||
241 | |||
242 | #[test] | ||
243 | fn test_unicode_escapes_unclosed() { | ||
244 | let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; | ||
245 | for escape in unicode_escapes { | ||
246 | let escape_sequence = format!(r"'\u{}'", escape); | ||
247 | let component = unclosed_char_component(&escape_sequence); | ||
248 | let expected_range = range_unclosed(&escape_sequence); | ||
249 | assert_eq!(component.kind, CharComponentKind::UnicodeEscape); | ||
250 | assert_eq!(component.range, expected_range); | ||
251 | } | ||
252 | } | ||
253 | |||
254 | #[test] | ||
255 | fn test_empty_char() { | ||
256 | let (has_closing_quote, components) = parse("''"); | ||
257 | assert!(has_closing_quote, "char should have closing quote"); | ||
258 | assert!(components.len() == 0); | ||
259 | } | ||
260 | |||
261 | #[test] | ||
262 | fn test_unclosed_char() { | ||
263 | let component = unclosed_char_component("'a"); | ||
264 | assert!(component.kind == CodePoint); | ||
265 | assert!(component.range == TextRange::from_to(1.into(), 2.into())); | ||
266 | } | ||
267 | |||
268 | #[test] | ||
269 | fn test_digit_escapes() { | ||
270 | let literals = &[r"", r"5", r"55"]; | ||
271 | |||
272 | for literal in literals { | ||
273 | let lit_text = format!(r"'\x{}'", literal); | ||
274 | let component = closed_char_component(&lit_text); | ||
275 | assert!(component.kind == CharComponentKind::AsciiCodeEscape); | ||
276 | assert!(component.range == range_closed(&lit_text)); | ||
277 | } | ||
278 | |||
279 | // More than 2 digits starts a new codepoint | ||
280 | let components = closed_char_components(r"'\x555'"); | ||
281 | assert!(components.len() == 2); | ||
282 | assert!(components[1].kind == CharComponentKind::CodePoint); | ||
283 | } | ||
284 | |||
285 | #[test] | ||
286 | fn test_ascii_escapes() { | ||
287 | let literals = &[ | ||
288 | r"\'", "\\\"", // equivalent to \" | ||
289 | r"\n", r"\r", r"\t", r"\\", r"\0", | ||
290 | ]; | ||
291 | |||
292 | for literal in literals { | ||
293 | let lit_text = format!("'{}'", literal); | ||
294 | let component = closed_char_component(&lit_text); | ||
295 | assert!(component.kind == CharComponentKind::AsciiEscape); | ||
296 | assert!(component.range == range_closed(&lit_text)); | ||
297 | } | ||
298 | } | ||
299 | |||
300 | #[test] | ||
301 | fn test_no_escapes() { | ||
302 | let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; | ||
303 | |||
304 | for &literal in literals { | ||
305 | let lit_text = format!("'{}'", literal); | ||
306 | let component = closed_char_component(&lit_text); | ||
307 | assert!(component.kind == CharComponentKind::CodePoint); | ||
308 | assert!(component.range == range_closed(&lit_text)); | ||
309 | } | ||
310 | } | ||
311 | } | ||
diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs new file mode 100644 index 000000000..03d98eff4 --- /dev/null +++ b/crates/ra_syntax/src/validation.rs | |||
@@ -0,0 +1,40 @@ | |||
1 | use crate::{ | ||
2 | ast::{self, AstNode}, | ||
3 | File, | ||
4 | string_lexing, | ||
5 | yellow::{ | ||
6 | SyntaxError, | ||
7 | }, | ||
8 | }; | ||
9 | |||
10 | pub(crate) fn validate(file: &File) -> Vec<SyntaxError> { | ||
11 | let mut errors = Vec::new(); | ||
12 | for d in file.root.borrowed().descendants() { | ||
13 | if let Some(c) = ast::Char::cast(d) { | ||
14 | let components = &mut string_lexing::parse_char_literal(c.text()); | ||
15 | let len = components.count(); | ||
16 | |||
17 | if !components.has_closing_quote { | ||
18 | errors.push(SyntaxError { | ||
19 | msg: "Unclosed char literal".to_string(), | ||
20 | offset: d.range().start(), | ||
21 | }); | ||
22 | } | ||
23 | |||
24 | if len == 0 { | ||
25 | errors.push(SyntaxError { | ||
26 | msg: "Empty char literal".to_string(), | ||
27 | offset: d.range().start(), | ||
28 | }); | ||
29 | } | ||
30 | |||
31 | if len > 1 { | ||
32 | errors.push(SyntaxError { | ||
33 | msg: "Character literal should be only one character long".to_string(), | ||
34 | offset: d.range().start(), | ||
35 | }); | ||
36 | } | ||
37 | } | ||
38 | } | ||
39 | errors | ||
40 | } | ||