aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdolfo OchagavĂ­a <[email protected]>2018-11-04 14:06:38 +0000
committerAdolfo OchagavĂ­a <[email protected]>2018-11-04 14:17:24 +0000
commit9b5bbab104d8ba445143f6f3a9e4149b40c29ae5 (patch)
treea1ef0fa5dbfd431e8a58afc6542c32c9ecefed04
parent19c6cbd9540ef87850161cad7e108b380eceea24 (diff)
Add character literal parsing and validation
-rw-r--r--crates/ra_syntax/src/ast/generated.rs34
-rw-r--r--crates/ra_syntax/src/ast/mod.rs6
-rw-r--r--crates/ra_syntax/src/grammar.ron1
-rw-r--r--crates/ra_syntax/src/lib.rs7
-rw-r--r--crates/ra_syntax/src/string_lexing/mod.rs311
-rw-r--r--crates/ra_syntax/src/validation.rs40
6 files changed, 397 insertions, 2 deletions
diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs
index f77795d05..75769a4e9 100644
--- a/crates/ra_syntax/src/ast/generated.rs
+++ b/crates/ra_syntax/src/ast/generated.rs
@@ -409,6 +409,40 @@ impl<'a> AstNode<'a> for CastExpr<'a> {
409 409
410impl<'a> CastExpr<'a> {} 410impl<'a> CastExpr<'a> {}
411 411
412// Char
413
414#[derive(Debug, Clone)]
415pub struct CharNode(SyntaxNode);
416
417impl CharNode {
418 pub fn ast(&self) -> Char {
419 Char::cast(self.0.borrowed()).unwrap()
420 }
421}
422
423impl<'a> From<Char<'a>> for CharNode {
424 fn from(ast: Char<'a>) -> CharNode {
425 let syntax = ast.syntax().owned();
426 CharNode(syntax)
427 }
428}
429#[derive(Debug, Clone, Copy)]
430pub struct Char<'a> {
431 syntax: SyntaxNodeRef<'a>,
432}
433
434impl<'a> AstNode<'a> for Char<'a> {
435 fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> {
436 match syntax.kind() {
437 CHAR => Some(Char { syntax }),
438 _ => None,
439 }
440 }
441 fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax }
442}
443
444impl<'a> Char<'a> {}
445
412// Comment 446// Comment
413 447
414#[derive(Debug, Clone)] 448#[derive(Debug, Clone)]
diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs
index 688ffff47..4355531d0 100644
--- a/crates/ra_syntax/src/ast/mod.rs
+++ b/crates/ra_syntax/src/ast/mod.rs
@@ -123,6 +123,12 @@ impl<'a> Lifetime<'a> {
123 } 123 }
124} 124}
125 125
126impl<'a> Char<'a> {
127 pub fn text(&self) -> &SmolStr {
128 &self.syntax().leaf_text().unwrap()
129 }
130}
131
126impl<'a> Comment<'a> { 132impl<'a> Comment<'a> {
127 pub fn text(&self) -> &SmolStr { 133 pub fn text(&self) -> &SmolStr {
128 self.syntax().leaf_text().unwrap() 134 self.syntax().leaf_text().unwrap()
diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron
index 0454fd8e4..2ed165c52 100644
--- a/crates/ra_syntax/src/grammar.ron
+++ b/crates/ra_syntax/src/grammar.ron
@@ -406,6 +406,7 @@ Grammar(
406 "PrefixExpr": (), 406 "PrefixExpr": (),
407 "RangeExpr": (), 407 "RangeExpr": (),
408 "BinExpr": (), 408 "BinExpr": (),
409 "Char": (),
409 "Literal": (), 410 "Literal": (),
410 411
411 "Expr": ( 412 "Expr": (
diff --git a/crates/ra_syntax/src/lib.rs b/crates/ra_syntax/src/lib.rs
index 79394fd53..8996eb921 100644
--- a/crates/ra_syntax/src/lib.rs
+++ b/crates/ra_syntax/src/lib.rs
@@ -39,11 +39,12 @@ mod grammar;
39mod parser_api; 39mod parser_api;
40mod parser_impl; 40mod parser_impl;
41mod reparsing; 41mod reparsing;
42 42mod string_lexing;
43mod syntax_kinds; 43mod syntax_kinds;
44pub mod text_utils; 44pub mod text_utils;
45/// Utilities for simple uses of the parser. 45/// Utilities for simple uses of the parser.
46pub mod utils; 46pub mod utils;
47mod validation;
47mod yellow; 48mod yellow;
48 49
49pub use crate::{ 50pub use crate::{
@@ -98,6 +99,8 @@ impl File {
98 self.root.borrowed() 99 self.root.borrowed()
99 } 100 }
100 pub fn errors(&self) -> Vec<SyntaxError> { 101 pub fn errors(&self) -> Vec<SyntaxError> {
101 self.root.root_data().clone() 102 let mut errors = self.root.root_data().clone();
103 errors.extend(validation::validate(self));
104 errors
102 } 105 }
103} 106}
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs
new file mode 100644
index 000000000..6b52c62c3
--- /dev/null
+++ b/crates/ra_syntax/src/string_lexing/mod.rs
@@ -0,0 +1,311 @@
1use self::CharComponentKind::*;
2use rowan::{TextRange, TextUnit};
3
4pub fn parse_char_literal(src: &str) -> CharComponentIterator {
5 CharComponentIterator {
6 parser: Parser::new(src),
7 has_closing_quote: false,
8 }
9}
10
11#[derive(Debug, Eq, PartialEq, Clone)]
12pub struct CharComponent {
13 pub range: TextRange,
14 pub kind: CharComponentKind,
15}
16
17impl CharComponent {
18 fn new(range: TextRange, kind: CharComponentKind) -> CharComponent {
19 CharComponent { range, kind }
20 }
21}
22
23#[derive(Debug, Eq, PartialEq, Clone)]
24pub enum CharComponentKind {
25 CodePoint,
26 AsciiEscape,
27 AsciiCodeEscape,
28 UnicodeEscape,
29}
30
31pub struct CharComponentIterator<'a> {
32 parser: Parser<'a>,
33 pub has_closing_quote: bool,
34}
35
36impl<'a> Iterator for CharComponentIterator<'a> {
37 type Item = CharComponent;
38 fn next(&mut self) -> Option<CharComponent> {
39 if self.parser.pos == 0 {
40 assert!(
41 self.parser.advance() == '\'',
42 "char literal should start with a quote"
43 );
44 }
45
46 if let Some(component) = self.parser.parse_char_component() {
47 return Some(component);
48 }
49
50 // We get here when there are no char components left to parse
51 if self.parser.peek() == Some('\'') {
52 self.parser.advance();
53 self.has_closing_quote = true;
54 }
55
56 assert!(
57 self.parser.peek() == None,
58 "char literal should leave no unparsed input: src = {}, pos = {}, length = {}",
59 self.parser.src,
60 self.parser.pos,
61 self.parser.src.len()
62 );
63
64 None
65 }
66}
67
68pub struct Parser<'a> {
69 src: &'a str,
70 pos: usize,
71}
72
73impl<'a> Parser<'a> {
74 pub fn new(src: &'a str) -> Parser<'a> {
75 Parser { src, pos: 0 }
76 }
77
78 // Utility methods
79
80 pub fn peek(&self) -> Option<char> {
81 if self.pos == self.src.len() {
82 return None;
83 }
84
85 self.src[self.pos..].chars().next()
86 }
87
88 pub fn advance(&mut self) -> char {
89 let next = self
90 .peek()
91 .expect("cannot advance if end of input is reached");
92 self.pos += next.len_utf8();
93 next
94 }
95
96 pub fn get_pos(&self) -> TextUnit {
97 (self.pos as u32).into()
98 }
99
100 // Char parsing methods
101
102 fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent {
103 // Note: validation of UnicodeEscape will be done elsewhere:
104 // * Only hex digits or underscores allowed
105 // * Max 6 chars
106 // * Within allowed range (must be at most 10FFFF)
107 match self.peek() {
108 Some('{') => {
109 self.advance();
110
111 // Parse anything until we reach `}`
112 while let Some(next) = self.peek() {
113 self.advance();
114 if next == '}' {
115 break;
116 }
117 }
118
119 let end = self.get_pos();
120 CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
121 }
122 Some(_) | None => {
123 let end = self.get_pos();
124 CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
125 }
126 }
127 }
128
129 fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent {
130 // Note: validation of AsciiCodeEscape will be done elsewhere:
131 // * First digit is octal
132 // * Second digit is hex
133 let code_start = self.get_pos();
134 while let Some(next) = self.peek() {
135 if next == '\'' || (self.get_pos() - code_start == 2.into()) {
136 break;
137 }
138
139 self.advance();
140 }
141
142 let end = self.get_pos();
143 CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape)
144 }
145
146 fn parse_escape(&mut self, start: TextUnit) -> CharComponent {
147 // Note: validation of AsciiEscape will be done elsewhere:
148 // * The escape sequence is non-empty
149 // * The escape sequence is valid
150 if self.peek().is_none() {
151 return CharComponent::new(TextRange::from_to(start, start), AsciiEscape);
152 }
153
154 let next = self.advance();
155 let end = self.get_pos();
156 let range = TextRange::from_to(start, end);
157 match next {
158 'x' => self.parse_ascii_code_escape(start),
159 'u' => self.parse_unicode_escape(start),
160 _ => CharComponent::new(range, AsciiEscape),
161 }
162 }
163
164 pub fn parse_char_component(&mut self) -> Option<CharComponent> {
165 let next = self.peek()?;
166
167 // Ignore character close
168 if next == '\'' {
169 return None;
170 }
171
172 let start = self.get_pos();
173 self.advance();
174
175 if next == '\\' {
176 Some(self.parse_escape(start))
177 } else {
178 let end = self.get_pos();
179 Some(CharComponent::new(
180 TextRange::from_to(start, end),
181 CodePoint,
182 ))
183 }
184 }
185}
186
187#[cfg(test)]
188mod tests {
189 use super::*;
190
191 fn parse(src: &str) -> (bool, Vec<CharComponent>) {
192 let component_iterator = &mut super::parse_char_literal(src);
193 let components: Vec<_> = component_iterator.collect();
194 (component_iterator.has_closing_quote, components)
195 }
196
197 fn unclosed_char_component(src: &str) -> CharComponent {
198 let (has_closing_quote, components) = parse(src);
199 assert!(!has_closing_quote, "char should not have closing quote");
200 assert!(components.len() == 1);
201 components[0].clone()
202 }
203
204 fn closed_char_component(src: &str) -> CharComponent {
205 let (has_closing_quote, components) = parse(src);
206 assert!(has_closing_quote, "char should have closing quote");
207 assert!(
208 components.len() == 1,
209 "Literal: {}\nComponents: {:#?}",
210 src,
211 components
212 );
213 components[0].clone()
214 }
215
216 fn closed_char_components(src: &str) -> Vec<CharComponent> {
217 let (has_closing_quote, components) = parse(src);
218 assert!(has_closing_quote, "char should have closing quote");
219 components
220 }
221
222 fn range_closed(src: &str) -> TextRange {
223 TextRange::from_to(1.into(), (src.len() as u32 - 1).into())
224 }
225
226 fn range_unclosed(src: &str) -> TextRange {
227 TextRange::from_to(1.into(), (src.len() as u32).into())
228 }
229
230 #[test]
231 fn test_unicode_escapes() {
232 let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""];
233 for escape in unicode_escapes {
234 let escape_sequence = format!(r"'\u{}'", escape);
235 let component = closed_char_component(&escape_sequence);
236 let expected_range = range_closed(&escape_sequence);
237 assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
238 assert_eq!(component.range, expected_range);
239 }
240 }
241
242 #[test]
243 fn test_unicode_escapes_unclosed() {
244 let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
245 for escape in unicode_escapes {
246 let escape_sequence = format!(r"'\u{}'", escape);
247 let component = unclosed_char_component(&escape_sequence);
248 let expected_range = range_unclosed(&escape_sequence);
249 assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
250 assert_eq!(component.range, expected_range);
251 }
252 }
253
254 #[test]
255 fn test_empty_char() {
256 let (has_closing_quote, components) = parse("''");
257 assert!(has_closing_quote, "char should have closing quote");
258 assert!(components.len() == 0);
259 }
260
261 #[test]
262 fn test_unclosed_char() {
263 let component = unclosed_char_component("'a");
264 assert!(component.kind == CodePoint);
265 assert!(component.range == TextRange::from_to(1.into(), 2.into()));
266 }
267
268 #[test]
269 fn test_digit_escapes() {
270 let literals = &[r"", r"5", r"55"];
271
272 for literal in literals {
273 let lit_text = format!(r"'\x{}'", literal);
274 let component = closed_char_component(&lit_text);
275 assert!(component.kind == CharComponentKind::AsciiCodeEscape);
276 assert!(component.range == range_closed(&lit_text));
277 }
278
279 // More than 2 digits starts a new codepoint
280 let components = closed_char_components(r"'\x555'");
281 assert!(components.len() == 2);
282 assert!(components[1].kind == CharComponentKind::CodePoint);
283 }
284
285 #[test]
286 fn test_ascii_escapes() {
287 let literals = &[
288 r"\'", "\\\"", // equivalent to \"
289 r"\n", r"\r", r"\t", r"\\", r"\0",
290 ];
291
292 for literal in literals {
293 let lit_text = format!("'{}'", literal);
294 let component = closed_char_component(&lit_text);
295 assert!(component.kind == CharComponentKind::AsciiEscape);
296 assert!(component.range == range_closed(&lit_text));
297 }
298 }
299
300 #[test]
301 fn test_no_escapes() {
302 let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];
303
304 for &literal in literals {
305 let lit_text = format!("'{}'", literal);
306 let component = closed_char_component(&lit_text);
307 assert!(component.kind == CharComponentKind::CodePoint);
308 assert!(component.range == range_closed(&lit_text));
309 }
310 }
311}
diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs
new file mode 100644
index 000000000..03d98eff4
--- /dev/null
+++ b/crates/ra_syntax/src/validation.rs
@@ -0,0 +1,40 @@
1use crate::{
2 ast::{self, AstNode},
3 File,
4 string_lexing,
5 yellow::{
6 SyntaxError,
7 },
8};
9
10pub(crate) fn validate(file: &File) -> Vec<SyntaxError> {
11 let mut errors = Vec::new();
12 for d in file.root.borrowed().descendants() {
13 if let Some(c) = ast::Char::cast(d) {
14 let components = &mut string_lexing::parse_char_literal(c.text());
15 let len = components.count();
16
17 if !components.has_closing_quote {
18 errors.push(SyntaxError {
19 msg: "Unclosed char literal".to_string(),
20 offset: d.range().start(),
21 });
22 }
23
24 if len == 0 {
25 errors.push(SyntaxError {
26 msg: "Empty char literal".to_string(),
27 offset: d.range().start(),
28 });
29 }
30
31 if len > 1 {
32 errors.push(SyntaxError {
33 msg: "Character literal should be only one character long".to_string(),
34 offset: d.range().start(),
35 });
36 }
37 }
38 }
39 errors
40}