diff options
Diffstat (limited to 'crates')
-rw-r--r-- | crates/ra_syntax/src/string_lexing.rs | 513 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/byte.rs | 51 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/byte_string.rs | 51 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/char.rs | 176 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/mod.rs | 13 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/parser.rs | 201 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/string.rs | 46 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte.rs | 9 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 4 | ||||
-rw-r--r-- | crates/ra_syntax/src/yellow/syntax_error.rs | 5 |
10 files changed, 548 insertions, 521 deletions
diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs deleted file mode 100644 index d253c97e7..000000000 --- a/crates/ra_syntax/src/string_lexing.rs +++ /dev/null | |||
@@ -1,513 +0,0 @@ | |||
1 | use self::CharComponentKind::*; | ||
2 | use rowan::{TextRange, TextUnit}; | ||
3 | |||
4 | pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { | ||
5 | ByteStringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct ByteStringComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for ByteStringComponentIterator<'a> { | ||
17 | type Item = StringComponent; | ||
18 | fn next(&mut self) -> Option<StringComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == 'b', | ||
22 | "byte string literal should start with a `b`" | ||
23 | ); | ||
24 | |||
25 | assert!( | ||
26 | self.parser.advance() == '"', | ||
27 | "byte string literal should start with a `b`, followed by double quotes" | ||
28 | ); | ||
29 | } | ||
30 | |||
31 | if let Some(component) = self.parser.parse_string_component() { | ||
32 | return Some(component); | ||
33 | } | ||
34 | |||
35 | // We get here when there are no char components left to parse | ||
36 | if self.parser.peek() == Some('"') { | ||
37 | self.parser.advance(); | ||
38 | self.has_closing_quote = true; | ||
39 | } | ||
40 | |||
41 | assert!( | ||
42 | self.parser.peek() == None, | ||
43 | "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
44 | self.parser.src, | ||
45 | self.parser.pos, | ||
46 | self.parser.src.len() | ||
47 | ); | ||
48 | |||
49 | None | ||
50 | } | ||
51 | } | ||
52 | |||
53 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { | ||
54 | StringComponentIterator { | ||
55 | parser: Parser::new(src), | ||
56 | has_closing_quote: false, | ||
57 | } | ||
58 | } | ||
59 | |||
60 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
61 | pub struct StringComponent { | ||
62 | pub range: TextRange, | ||
63 | pub kind: StringComponentKind, | ||
64 | } | ||
65 | |||
66 | impl StringComponent { | ||
67 | fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { | ||
68 | StringComponent { range, kind } | ||
69 | } | ||
70 | } | ||
71 | |||
72 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
73 | pub enum StringComponentKind { | ||
74 | IgnoreNewline, | ||
75 | Char(CharComponentKind), | ||
76 | } | ||
77 | |||
78 | pub struct StringComponentIterator<'a> { | ||
79 | parser: Parser<'a>, | ||
80 | pub has_closing_quote: bool, | ||
81 | } | ||
82 | |||
83 | impl<'a> Iterator for StringComponentIterator<'a> { | ||
84 | type Item = StringComponent; | ||
85 | fn next(&mut self) -> Option<StringComponent> { | ||
86 | if self.parser.pos == 0 { | ||
87 | assert!( | ||
88 | self.parser.advance() == '"', | ||
89 | "string literal should start with double quotes" | ||
90 | ); | ||
91 | } | ||
92 | |||
93 | if let Some(component) = self.parser.parse_string_component() { | ||
94 | return Some(component); | ||
95 | } | ||
96 | |||
97 | // We get here when there are no char components left to parse | ||
98 | if self.parser.peek() == Some('"') { | ||
99 | self.parser.advance(); | ||
100 | self.has_closing_quote = true; | ||
101 | } | ||
102 | |||
103 | assert!( | ||
104 | self.parser.peek() == None, | ||
105 | "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
106 | self.parser.src, | ||
107 | self.parser.pos, | ||
108 | self.parser.src.len() | ||
109 | ); | ||
110 | |||
111 | None | ||
112 | } | ||
113 | } | ||
114 | |||
115 | pub fn parse_byte_literal(src: &str) -> ByteComponentIterator { | ||
116 | ByteComponentIterator { | ||
117 | parser: Parser::new(src), | ||
118 | has_closing_quote: false, | ||
119 | } | ||
120 | } | ||
121 | |||
122 | pub struct ByteComponentIterator<'a> { | ||
123 | parser: Parser<'a>, | ||
124 | pub has_closing_quote: bool, | ||
125 | } | ||
126 | |||
127 | impl<'a> Iterator for ByteComponentIterator<'a> { | ||
128 | type Item = CharComponent; | ||
129 | fn next(&mut self) -> Option<CharComponent> { | ||
130 | if self.parser.pos == 0 { | ||
131 | assert!( | ||
132 | self.parser.advance() == 'b', | ||
133 | "Byte literal should start with a `b`" | ||
134 | ); | ||
135 | |||
136 | assert!( | ||
137 | self.parser.advance() == '\'', | ||
138 | "Byte literal should start with a `b`, followed by a quote" | ||
139 | ); | ||
140 | } | ||
141 | |||
142 | |||
143 | if let Some(component) = self.parser.parse_char_component() { | ||
144 | return Some(component); | ||
145 | } | ||
146 | |||
147 | // We get here when there are no char components left to parse | ||
148 | if self.parser.peek() == Some('\'') { | ||
149 | self.parser.advance(); | ||
150 | self.has_closing_quote = true; | ||
151 | } | ||
152 | |||
153 | assert!( | ||
154 | self.parser.peek() == None, | ||
155 | "byte literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
156 | self.parser.src, | ||
157 | self.parser.pos, | ||
158 | self.parser.src.len() | ||
159 | ); | ||
160 | |||
161 | None | ||
162 | } | ||
163 | } | ||
164 | |||
165 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { | ||
166 | CharComponentIterator { | ||
167 | parser: Parser::new(src), | ||
168 | has_closing_quote: false, | ||
169 | } | ||
170 | } | ||
171 | |||
172 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
173 | pub struct CharComponent { | ||
174 | pub range: TextRange, | ||
175 | pub kind: CharComponentKind, | ||
176 | } | ||
177 | |||
178 | impl CharComponent { | ||
179 | fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { | ||
180 | CharComponent { range, kind } | ||
181 | } | ||
182 | } | ||
183 | |||
/// Classification of a single character-like component of a literal.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CharComponentKind {
    /// A character written as-is, without a leading backslash.
    CodePoint,
    /// A backslash escape other than `\x` and `\u` (for example `\n`); also
    /// produced for a backslash that is the last character of the input.
    AsciiEscape,
    /// An escape introduced by `\x`.
    AsciiCodeEscape,
    /// An escape introduced by `\u`.
    UnicodeEscape,
}
191 | |||
192 | pub struct CharComponentIterator<'a> { | ||
193 | parser: Parser<'a>, | ||
194 | pub has_closing_quote: bool, | ||
195 | } | ||
196 | |||
197 | impl<'a> Iterator for CharComponentIterator<'a> { | ||
198 | type Item = CharComponent; | ||
199 | fn next(&mut self) -> Option<CharComponent> { | ||
200 | if self.parser.pos == 0 { | ||
201 | assert!( | ||
202 | self.parser.advance() == '\'', | ||
203 | "char literal should start with a quote" | ||
204 | ); | ||
205 | } | ||
206 | |||
207 | if let Some(component) = self.parser.parse_char_component() { | ||
208 | return Some(component); | ||
209 | } | ||
210 | |||
211 | // We get here when there are no char components left to parse | ||
212 | if self.parser.peek() == Some('\'') { | ||
213 | self.parser.advance(); | ||
214 | self.has_closing_quote = true; | ||
215 | } | ||
216 | |||
217 | assert!( | ||
218 | self.parser.peek() == None, | ||
219 | "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
220 | self.parser.src, | ||
221 | self.parser.pos, | ||
222 | self.parser.src.len() | ||
223 | ); | ||
224 | |||
225 | None | ||
226 | } | ||
227 | } | ||
228 | |||
229 | pub struct Parser<'a> { | ||
230 | src: &'a str, | ||
231 | pos: usize, | ||
232 | } | ||
233 | |||
234 | impl<'a> Parser<'a> { | ||
235 | pub fn new(src: &'a str) -> Parser<'a> { | ||
236 | Parser { src, pos: 0 } | ||
237 | } | ||
238 | |||
239 | // Utility methods | ||
240 | |||
241 | pub fn peek(&self) -> Option<char> { | ||
242 | if self.pos == self.src.len() { | ||
243 | return None; | ||
244 | } | ||
245 | |||
246 | self.src[self.pos..].chars().next() | ||
247 | } | ||
248 | |||
249 | pub fn advance(&mut self) -> char { | ||
250 | let next = self | ||
251 | .peek() | ||
252 | .expect("cannot advance if end of input is reached"); | ||
253 | self.pos += next.len_utf8(); | ||
254 | next | ||
255 | } | ||
256 | |||
257 | pub fn skip_whitespace(&mut self) { | ||
258 | while self.peek().map(|c| c.is_whitespace()) == Some(true) { | ||
259 | self.advance(); | ||
260 | } | ||
261 | } | ||
262 | |||
263 | pub fn get_pos(&self) -> TextUnit { | ||
264 | (self.pos as u32).into() | ||
265 | } | ||
266 | |||
267 | // Char parsing methods | ||
268 | |||
269 | fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { | ||
270 | match self.peek() { | ||
271 | Some('{') => { | ||
272 | self.advance(); | ||
273 | |||
274 | // Parse anything until we reach `}` | ||
275 | while let Some(next) = self.peek() { | ||
276 | self.advance(); | ||
277 | if next == '}' { | ||
278 | break; | ||
279 | } | ||
280 | } | ||
281 | |||
282 | let end = self.get_pos(); | ||
283 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
284 | } | ||
285 | Some(_) | None => { | ||
286 | let end = self.get_pos(); | ||
287 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
288 | } | ||
289 | } | ||
290 | } | ||
291 | |||
292 | fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { | ||
293 | let code_start = self.get_pos(); | ||
294 | while let Some(next) = self.peek() { | ||
295 | if next == '\'' || (self.get_pos() - code_start == 2.into()) { | ||
296 | break; | ||
297 | } | ||
298 | |||
299 | self.advance(); | ||
300 | } | ||
301 | |||
302 | let end = self.get_pos(); | ||
303 | CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) | ||
304 | } | ||
305 | |||
306 | fn parse_escape(&mut self, start: TextUnit) -> CharComponent { | ||
307 | if self.peek().is_none() { | ||
308 | return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); | ||
309 | } | ||
310 | |||
311 | let next = self.advance(); | ||
312 | let end = self.get_pos(); | ||
313 | let range = TextRange::from_to(start, end); | ||
314 | match next { | ||
315 | 'x' => self.parse_ascii_code_escape(start), | ||
316 | 'u' => self.parse_unicode_escape(start), | ||
317 | _ => CharComponent::new(range, AsciiEscape), | ||
318 | } | ||
319 | } | ||
320 | |||
321 | pub fn parse_char_component(&mut self) -> Option<CharComponent> { | ||
322 | let next = self.peek()?; | ||
323 | |||
324 | // Ignore character close | ||
325 | if next == '\'' { | ||
326 | return None; | ||
327 | } | ||
328 | |||
329 | let start = self.get_pos(); | ||
330 | self.advance(); | ||
331 | |||
332 | if next == '\\' { | ||
333 | Some(self.parse_escape(start)) | ||
334 | } else { | ||
335 | let end = self.get_pos(); | ||
336 | Some(CharComponent::new( | ||
337 | TextRange::from_to(start, end), | ||
338 | CodePoint, | ||
339 | )) | ||
340 | } | ||
341 | } | ||
342 | |||
343 | pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { | ||
344 | // In string literals, when a `\` occurs immediately before the newline, the `\`, | ||
345 | // the newline, and all whitespace at the beginning of the next line are ignored | ||
346 | match self.peek() { | ||
347 | Some('\n') | Some('\r') => { | ||
348 | self.skip_whitespace(); | ||
349 | Some(StringComponent::new( | ||
350 | TextRange::from_to(start, self.get_pos()), | ||
351 | StringComponentKind::IgnoreNewline, | ||
352 | )) | ||
353 | } | ||
354 | _ => None, | ||
355 | } | ||
356 | } | ||
357 | |||
358 | pub fn parse_string_component(&mut self) -> Option<StringComponent> { | ||
359 | let next = self.peek()?; | ||
360 | |||
361 | // Ignore string close | ||
362 | if next == '"' { | ||
363 | return None; | ||
364 | } | ||
365 | |||
366 | let start = self.get_pos(); | ||
367 | self.advance(); | ||
368 | |||
369 | if next == '\\' { | ||
370 | // Strings can use `\` to ignore newlines, so we first try to parse one of those | ||
371 | // before falling back to parsing char escapes | ||
372 | self.parse_ignore_newline(start).or_else(|| { | ||
373 | let char_component = self.parse_escape(start); | ||
374 | Some(StringComponent::new( | ||
375 | char_component.range, | ||
376 | StringComponentKind::Char(char_component.kind), | ||
377 | )) | ||
378 | }) | ||
379 | } else { | ||
380 | let end = self.get_pos(); | ||
381 | Some(StringComponent::new( | ||
382 | TextRange::from_to(start, end), | ||
383 | StringComponentKind::Char(CodePoint), | ||
384 | )) | ||
385 | } | ||
386 | } | ||
387 | } | ||
388 | |||
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` as a char literal and returns whether a closing quote was
    /// seen, together with every component produced.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut iter = super::parse_char_literal(src);
        let components: Vec<_> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Returns the single component of an unclosed char literal.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert!(components.len() == 1);
        components.remove(0)
    }

    /// Returns the single component of a closed char literal.
    fn closed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert!(
            components.len() == 1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components.remove(0)
    }

    /// Returns all components of a closed char literal.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range between the opening and the closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range from just after the opening quote to the end of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        for escape in [r"{DEAD}", "{BEEF}", "{FF}", "{}", ""].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        for escape in ["{DEAD", "{BEEF", "{FF"].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&literal));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert!(component.kind == CodePoint);
        assert!(component.range == TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        for digits in [r"", r"5", r"55"].iter() {
            let literal = format!(r"'\x{}'", digits);
            let component = closed_char_component(&literal);
            assert!(component.kind == CharComponentKind::AsciiCodeEscape);
            assert!(component.range == range_closed(&literal));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert!(components.len() == 2);
        assert!(components[1].kind == CharComponentKind::CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let escapes = [
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for escape in escapes.iter() {
            let literal = format!("'{}'", escape);
            let component = closed_char_component(&literal);
            assert!(component.kind == CharComponentKind::AsciiEscape);
            assert!(component.range == range_closed(&literal));
        }
    }

    #[test]
    fn test_no_escapes() {
        for &ch in ['"', 'n', 'r', 't', '0', 'x', 'u'].iter() {
            let literal = format!("'{}'", ch);
            let component = closed_char_component(&literal);
            assert!(component.kind == CharComponentKind::CodePoint);
            assert!(component.range == range_closed(&literal));
        }
    }
}
diff --git a/crates/ra_syntax/src/string_lexing/byte.rs b/crates/ra_syntax/src/string_lexing/byte.rs new file mode 100644 index 000000000..24424349c --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte.rs | |||
@@ -0,0 +1,51 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::CharComponent; | ||
3 | |||
4 | pub fn parse_byte_literal(src: &str) -> ByteComponentIterator { | ||
5 | ByteComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct ByteComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for ByteComponentIterator<'a> { | ||
17 | type Item = CharComponent; | ||
18 | fn next(&mut self) -> Option<CharComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == 'b', | ||
22 | "Byte literal should start with a `b`" | ||
23 | ); | ||
24 | |||
25 | assert!( | ||
26 | self.parser.advance() == '\'', | ||
27 | "Byte literal should start with a `b`, followed by a quote" | ||
28 | ); | ||
29 | } | ||
30 | |||
31 | if let Some(component) = self.parser.parse_char_component() { | ||
32 | return Some(component); | ||
33 | } | ||
34 | |||
35 | // We get here when there are no char components left to parse | ||
36 | if self.parser.peek() == Some('\'') { | ||
37 | self.parser.advance(); | ||
38 | self.has_closing_quote = true; | ||
39 | } | ||
40 | |||
41 | assert!( | ||
42 | self.parser.peek() == None, | ||
43 | "byte literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
44 | self.parser.src, | ||
45 | self.parser.pos, | ||
46 | self.parser.src.len() | ||
47 | ); | ||
48 | |||
49 | None | ||
50 | } | ||
51 | } | ||
diff --git a/crates/ra_syntax/src/string_lexing/byte_string.rs b/crates/ra_syntax/src/string_lexing/byte_string.rs new file mode 100644 index 000000000..5b6dda760 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte_string.rs | |||
@@ -0,0 +1,51 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::StringComponent; | ||
3 | |||
4 | pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { | ||
5 | ByteStringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct ByteStringComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for ByteStringComponentIterator<'a> { | ||
17 | type Item = StringComponent; | ||
18 | fn next(&mut self) -> Option<StringComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == 'b', | ||
22 | "byte string literal should start with a `b`" | ||
23 | ); | ||
24 | |||
25 | assert!( | ||
26 | self.parser.advance() == '"', | ||
27 | "byte string literal should start with a `b`, followed by double quotes" | ||
28 | ); | ||
29 | } | ||
30 | |||
31 | if let Some(component) = self.parser.parse_string_component() { | ||
32 | return Some(component); | ||
33 | } | ||
34 | |||
35 | // We get here when there are no char components left to parse | ||
36 | if self.parser.peek() == Some('"') { | ||
37 | self.parser.advance(); | ||
38 | self.has_closing_quote = true; | ||
39 | } | ||
40 | |||
41 | assert!( | ||
42 | self.parser.peek() == None, | ||
43 | "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
44 | self.parser.src, | ||
45 | self.parser.pos, | ||
46 | self.parser.src.len() | ||
47 | ); | ||
48 | |||
49 | None | ||
50 | } | ||
51 | } | ||
diff --git a/crates/ra_syntax/src/string_lexing/char.rs b/crates/ra_syntax/src/string_lexing/char.rs new file mode 100644 index 000000000..885c03b14 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/char.rs | |||
@@ -0,0 +1,176 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::CharComponent; | ||
3 | |||
4 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { | ||
5 | CharComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct CharComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for CharComponentIterator<'a> { | ||
17 | type Item = CharComponent; | ||
18 | fn next(&mut self) -> Option<CharComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == '\'', | ||
22 | "char literal should start with a quote" | ||
23 | ); | ||
24 | } | ||
25 | |||
26 | if let Some(component) = self.parser.parse_char_component() { | ||
27 | return Some(component); | ||
28 | } | ||
29 | |||
30 | // We get here when there are no char components left to parse | ||
31 | if self.parser.peek() == Some('\'') { | ||
32 | self.parser.advance(); | ||
33 | self.has_closing_quote = true; | ||
34 | } | ||
35 | |||
36 | assert!( | ||
37 | self.parser.peek() == None, | ||
38 | "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
39 | self.parser.src, | ||
40 | self.parser.pos, | ||
41 | self.parser.src.len() | ||
42 | ); | ||
43 | |||
44 | None | ||
45 | } | ||
46 | } | ||
47 | |||
#[cfg(test)]
mod tests {
    use rowan::TextRange;
    use crate::string_lexing::{
        CharComponent,
        CharComponentKind::*,
    };

    /// Lexes `src` as a char literal and returns whether a closing quote was
    /// seen, together with every component produced.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut iter = super::parse_char_literal(src);
        let components: Vec<_> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Returns the single component of an unclosed char literal.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert!(components.len() == 1);
        components.remove(0)
    }

    /// Returns the single component of a closed char literal.
    fn closed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert!(
            components.len() == 1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components.remove(0)
    }

    /// Returns all components of a closed char literal.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range between the opening and the closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range from just after the opening quote to the end of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        for escape in [r"{DEAD}", "{BEEF}", "{FF}", "{}", ""].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        for escape in ["{DEAD", "{BEEF", "{FF"].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&literal);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&literal));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert!(component.kind == CodePoint);
        assert!(component.range == TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        for digits in [r"", r"5", r"55"].iter() {
            let literal = format!(r"'\x{}'", digits);
            let component = closed_char_component(&literal);
            assert!(component.kind == AsciiCodeEscape);
            assert!(component.range == range_closed(&literal));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert!(components.len() == 2);
        assert!(components[1].kind == CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let escapes = [
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for escape in escapes.iter() {
            let literal = format!("'{}'", escape);
            let component = closed_char_component(&literal);
            assert!(component.kind == AsciiEscape);
            assert!(component.range == range_closed(&literal));
        }
    }

    #[test]
    fn test_no_escapes() {
        for &ch in ['"', 'n', 'r', 't', '0', 'x', 'u'].iter() {
            let literal = format!("'{}'", ch);
            let component = closed_char_component(&literal);
            assert!(component.kind == CodePoint);
            assert!(component.range == range_closed(&literal));
        }
    }
}
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs new file mode 100644 index 000000000..94853331f --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/mod.rs | |||
@@ -0,0 +1,13 @@ | |||
1 | mod parser; | ||
2 | mod byte; | ||
3 | mod byte_string; | ||
4 | mod char; | ||
5 | mod string; | ||
6 | |||
7 | pub use self::{ | ||
8 | byte::parse_byte_literal, | ||
9 | byte_string::parse_byte_string_literal, | ||
10 | char::parse_char_literal, | ||
11 | parser::{CharComponent, CharComponentKind, StringComponent, StringComponentKind}, | ||
12 | string::parse_string_literal, | ||
13 | }; | ||
diff --git a/crates/ra_syntax/src/string_lexing/parser.rs b/crates/ra_syntax/src/string_lexing/parser.rs new file mode 100644 index 000000000..4a6d5bc93 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/parser.rs | |||
@@ -0,0 +1,201 @@ | |||
1 | use rowan::{TextRange, TextUnit}; | ||
2 | |||
3 | use self::CharComponentKind::*; | ||
4 | |||
5 | pub struct Parser<'a> { | ||
6 | pub(super) src: &'a str, | ||
7 | pub(super) pos: usize, | ||
8 | } | ||
9 | |||
10 | impl<'a> Parser<'a> { | ||
11 | pub fn new(src: &'a str) -> Parser<'a> { | ||
12 | Parser { src, pos: 0 } | ||
13 | } | ||
14 | |||
15 | // Utility methods | ||
16 | |||
17 | pub fn peek(&self) -> Option<char> { | ||
18 | if self.pos == self.src.len() { | ||
19 | return None; | ||
20 | } | ||
21 | |||
22 | self.src[self.pos..].chars().next() | ||
23 | } | ||
24 | |||
25 | pub fn advance(&mut self) -> char { | ||
26 | let next = self | ||
27 | .peek() | ||
28 | .expect("cannot advance if end of input is reached"); | ||
29 | self.pos += next.len_utf8(); | ||
30 | next | ||
31 | } | ||
32 | |||
33 | pub fn skip_whitespace(&mut self) { | ||
34 | while self.peek().map(|c| c.is_whitespace()) == Some(true) { | ||
35 | self.advance(); | ||
36 | } | ||
37 | } | ||
38 | |||
39 | pub fn get_pos(&self) -> TextUnit { | ||
40 | (self.pos as u32).into() | ||
41 | } | ||
42 | |||
43 | // Char parsing methods | ||
44 | |||
45 | fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { | ||
46 | match self.peek() { | ||
47 | Some('{') => { | ||
48 | self.advance(); | ||
49 | |||
50 | // Parse anything until we reach `}` | ||
51 | while let Some(next) = self.peek() { | ||
52 | self.advance(); | ||
53 | if next == '}' { | ||
54 | break; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | let end = self.get_pos(); | ||
59 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
60 | } | ||
61 | Some(_) | None => { | ||
62 | let end = self.get_pos(); | ||
63 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
64 | } | ||
65 | } | ||
66 | } | ||
67 | |||
68 | fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { | ||
69 | let code_start = self.get_pos(); | ||
70 | while let Some(next) = self.peek() { | ||
71 | if next == '\'' || (self.get_pos() - code_start == 2.into()) { | ||
72 | break; | ||
73 | } | ||
74 | |||
75 | self.advance(); | ||
76 | } | ||
77 | |||
78 | let end = self.get_pos(); | ||
79 | CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) | ||
80 | } | ||
81 | |||
82 | fn parse_escape(&mut self, start: TextUnit) -> CharComponent { | ||
83 | if self.peek().is_none() { | ||
84 | return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); | ||
85 | } | ||
86 | |||
87 | let next = self.advance(); | ||
88 | let end = self.get_pos(); | ||
89 | let range = TextRange::from_to(start, end); | ||
90 | match next { | ||
91 | 'x' => self.parse_ascii_code_escape(start), | ||
92 | 'u' => self.parse_unicode_escape(start), | ||
93 | _ => CharComponent::new(range, AsciiEscape), | ||
94 | } | ||
95 | } | ||
96 | |||
97 | pub fn parse_char_component(&mut self) -> Option<CharComponent> { | ||
98 | let next = self.peek()?; | ||
99 | |||
100 | // Ignore character close | ||
101 | if next == '\'' { | ||
102 | return None; | ||
103 | } | ||
104 | |||
105 | let start = self.get_pos(); | ||
106 | self.advance(); | ||
107 | |||
108 | if next == '\\' { | ||
109 | Some(self.parse_escape(start)) | ||
110 | } else { | ||
111 | let end = self.get_pos(); | ||
112 | Some(CharComponent::new( | ||
113 | TextRange::from_to(start, end), | ||
114 | CodePoint, | ||
115 | )) | ||
116 | } | ||
117 | } | ||
118 | |||
119 | pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { | ||
120 | // In string literals, when a `\` occurs immediately before the newline, the `\`, | ||
121 | // the newline, and all whitespace at the beginning of the next line are ignored | ||
122 | match self.peek() { | ||
123 | Some('\n') | Some('\r') => { | ||
124 | self.skip_whitespace(); | ||
125 | Some(StringComponent::new( | ||
126 | TextRange::from_to(start, self.get_pos()), | ||
127 | StringComponentKind::IgnoreNewline, | ||
128 | )) | ||
129 | } | ||
130 | _ => None, | ||
131 | } | ||
132 | } | ||
133 | |||
134 | pub fn parse_string_component(&mut self) -> Option<StringComponent> { | ||
135 | let next = self.peek()?; | ||
136 | |||
137 | // Ignore string close | ||
138 | if next == '"' { | ||
139 | return None; | ||
140 | } | ||
141 | |||
142 | let start = self.get_pos(); | ||
143 | self.advance(); | ||
144 | |||
145 | if next == '\\' { | ||
146 | // Strings can use `\` to ignore newlines, so we first try to parse one of those | ||
147 | // before falling back to parsing char escapes | ||
148 | self.parse_ignore_newline(start).or_else(|| { | ||
149 | let char_component = self.parse_escape(start); | ||
150 | Some(StringComponent::new( | ||
151 | char_component.range, | ||
152 | StringComponentKind::Char(char_component.kind), | ||
153 | )) | ||
154 | }) | ||
155 | } else { | ||
156 | let end = self.get_pos(); | ||
157 | Some(StringComponent::new( | ||
158 | TextRange::from_to(start, end), | ||
159 | StringComponentKind::Char(CodePoint), | ||
160 | )) | ||
161 | } | ||
162 | } | ||
163 | } | ||
164 | |||
165 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
166 | pub struct StringComponent { | ||
167 | pub range: TextRange, | ||
168 | pub kind: StringComponentKind, | ||
169 | } | ||
170 | |||
171 | impl StringComponent { | ||
172 | fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { | ||
173 | StringComponent { range, kind } | ||
174 | } | ||
175 | } | ||
176 | |||
177 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
178 | pub enum StringComponentKind { | ||
179 | IgnoreNewline, | ||
180 | Char(CharComponentKind), | ||
181 | } | ||
182 | |||
183 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
184 | pub struct CharComponent { | ||
185 | pub range: TextRange, | ||
186 | pub kind: CharComponentKind, | ||
187 | } | ||
188 | |||
189 | impl CharComponent { | ||
190 | fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { | ||
191 | CharComponent { range, kind } | ||
192 | } | ||
193 | } | ||
194 | |||
/// Classification of a single character-like component of a literal.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CharComponentKind {
    /// A character written as-is, without a leading backslash.
    CodePoint,
    /// A backslash escape other than `\x` and `\u` (for example `\n`); also
    /// produced for a backslash that is the last character of the input.
    AsciiEscape,
    /// An escape introduced by `\x`.
    AsciiCodeEscape,
    /// An escape introduced by `\u`.
    UnicodeEscape,
}
diff --git a/crates/ra_syntax/src/string_lexing/string.rs b/crates/ra_syntax/src/string_lexing/string.rs new file mode 100644 index 000000000..1b23029c6 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/string.rs | |||
@@ -0,0 +1,46 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::StringComponent; | ||
3 | |||
4 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { | ||
5 | StringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct StringComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for StringComponentIterator<'a> { | ||
17 | type Item = StringComponent; | ||
18 | fn next(&mut self) -> Option<StringComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == '"', | ||
22 | "string literal should start with double quotes" | ||
23 | ); | ||
24 | } | ||
25 | |||
26 | if let Some(component) = self.parser.parse_string_component() { | ||
27 | return Some(component); | ||
28 | } | ||
29 | |||
30 | // We get here when there are no char components left to parse | ||
31 | if self.parser.peek() == Some('"') { | ||
32 | self.parser.advance(); | ||
33 | self.has_closing_quote = true; | ||
34 | } | ||
35 | |||
36 | assert!( | ||
37 | self.parser.peek() == None, | ||
38 | "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
39 | self.parser.src, | ||
40 | self.parser.pos, | ||
41 | self.parser.src.len() | ||
42 | ); | ||
43 | |||
44 | None | ||
45 | } | ||
46 | } | ||
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs index 7baf3c1d7..43c0d7edd 100644 --- a/crates/ra_syntax/src/validation/byte.rs +++ b/crates/ra_syntax/src/validation/byte.rs | |||
@@ -48,7 +48,10 @@ pub(super) fn validate_byte_component( | |||
48 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), | 48 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), |
49 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), | 49 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), |
50 | CodePoint => { | 50 | CodePoint => { |
51 | let c = text.chars().next().expect("Code points should be one character long"); | 51 | let c = text |
52 | .chars() | ||
53 | .next() | ||
54 | .expect("Code points should be one character long"); | ||
52 | 55 | ||
53 | // These bytes must always be escaped | 56 | // These bytes must always be escaped |
54 | if c == '\t' || c == '\r' || c == '\n' { | 57 | if c == '\t' || c == '\r' || c == '\n' { |
@@ -148,9 +151,7 @@ mod test { | |||
148 | 151 | ||
149 | #[test] | 152 | #[test] |
150 | fn test_valid_byte_escape() { | 153 | fn test_valid_byte_escape() { |
151 | let valid = [ | 154 | let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; |
152 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", | ||
153 | ]; | ||
154 | for c in &valid { | 155 | for c in &valid { |
155 | assert_valid_byte(c); | 156 | assert_valid_byte(c); |
156 | } | 157 | } |
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 622b2efdc..4728c85e6 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs | |||
@@ -213,9 +213,7 @@ mod test { | |||
213 | 213 | ||
214 | #[test] | 214 | #[test] |
215 | fn test_valid_ascii_escape() { | 215 | fn test_valid_ascii_escape() { |
216 | let valid = [ | 216 | let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; |
217 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", | ||
218 | ]; | ||
219 | for c in &valid { | 217 | for c in &valid { |
220 | assert_valid_char(c); | 218 | assert_valid_char(c); |
221 | } | 219 | } |
diff --git a/crates/ra_syntax/src/yellow/syntax_error.rs b/crates/ra_syntax/src/yellow/syntax_error.rs index df230293b..c32ee650d 100644 --- a/crates/ra_syntax/src/yellow/syntax_error.rs +++ b/crates/ra_syntax/src/yellow/syntax_error.rs | |||
@@ -117,7 +117,10 @@ impl fmt::Display for SyntaxErrorKind { | |||
117 | InvalidByteEscape => write!(f, "Invalid escape sequence"), | 117 | InvalidByteEscape => write!(f, "Invalid escape sequence"), |
118 | TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"), | 118 | TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"), |
119 | MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), | 119 | MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), |
120 | UnicodeEscapeForbidden => write!(f, "Unicode escapes are not allowed in byte literals or byte strings"), | 120 | UnicodeEscapeForbidden => write!( |
121 | f, | ||
122 | "Unicode escapes are not allowed in byte literals or byte strings" | ||
123 | ), | ||
121 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), | 124 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), |
122 | AsciiCodeEscapeOutOfRange => { | 125 | AsciiCodeEscapeOutOfRange => { |
123 | write!(f, "Escape sequence should be between \\x00 and \\x7F") | 126 | write!(f, "Escape sequence should be between \\x00 and \\x7F") |