diff options
author | Adolfo Ochagavía <[email protected]> | 2018-11-08 14:42:00 +0000 |
---|---|---|
committer | Adolfo Ochagavía <[email protected]> | 2018-11-09 13:52:17 +0000 |
commit | 3b4c02c19e4af645fd37e8bff774b05d546dc0b6 (patch) | |
tree | 42c40e9201adf64d1c06bc1c69524f5688ee6e9f /crates/ra_syntax/src/string_lexing.rs | |
parent | 5a9150df9bcdaf5faed5b500c22333f1f7c99f32 (diff) |
Validate string literals
Diffstat (limited to 'crates/ra_syntax/src/string_lexing.rs')
-rw-r--r-- | crates/ra_syntax/src/string_lexing.rs | 414 |
1 files changed, 414 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs new file mode 100644 index 000000000..d613bb042 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing.rs | |||
@@ -0,0 +1,414 @@ | |||
1 | use self::CharComponentKind::*; | ||
2 | use rowan::{TextRange, TextUnit}; | ||
3 | |||
4 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { | ||
5 | StringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
12 | pub struct StringComponent { | ||
13 | pub range: TextRange, | ||
14 | pub kind: StringComponentKind, | ||
15 | } | ||
16 | |||
17 | impl StringComponent { | ||
18 | fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { | ||
19 | StringComponent { range, kind } | ||
20 | } | ||
21 | } | ||
22 | |||
23 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
24 | pub enum StringComponentKind { | ||
25 | IgnoreNewline, | ||
26 | Char(CharComponentKind), | ||
27 | } | ||
28 | |||
29 | pub struct StringComponentIterator<'a> { | ||
30 | parser: Parser<'a>, | ||
31 | pub has_closing_quote: bool, | ||
32 | } | ||
33 | |||
34 | impl<'a> Iterator for StringComponentIterator<'a> { | ||
35 | type Item = StringComponent; | ||
36 | fn next(&mut self) -> Option<StringComponent> { | ||
37 | if self.parser.pos == 0 { | ||
38 | assert!( | ||
39 | self.parser.advance() == '"', | ||
40 | "string literal should start with double quotes" | ||
41 | ); | ||
42 | } | ||
43 | |||
44 | if let Some(component) = self.parser.parse_string_component() { | ||
45 | return Some(component); | ||
46 | } | ||
47 | |||
48 | // We get here when there are no char components left to parse | ||
49 | if self.parser.peek() == Some('"') { | ||
50 | self.parser.advance(); | ||
51 | self.has_closing_quote = true; | ||
52 | } | ||
53 | |||
54 | assert!( | ||
55 | self.parser.peek() == None, | ||
56 | "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
57 | self.parser.src, | ||
58 | self.parser.pos, | ||
59 | self.parser.src.len() | ||
60 | ); | ||
61 | |||
62 | None | ||
63 | } | ||
64 | } | ||
65 | |||
66 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { | ||
67 | CharComponentIterator { | ||
68 | parser: Parser::new(src), | ||
69 | has_closing_quote: false, | ||
70 | } | ||
71 | } | ||
72 | |||
73 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
74 | pub struct CharComponent { | ||
75 | pub range: TextRange, | ||
76 | pub kind: CharComponentKind, | ||
77 | } | ||
78 | |||
79 | impl CharComponent { | ||
80 | fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { | ||
81 | CharComponent { range, kind } | ||
82 | } | ||
83 | } | ||
84 | |||
/// The kinds of component that can occur inside a char literal.
///
/// The lexer only splits the text into components; it does not check that an
/// escape is well formed.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CharComponentKind {
    /// A single character that is not part of an escape.
    CodePoint,
    /// A `\` followed by one character other than `x` or `u` (e.g. `\n`).
    AsciiEscape,
    /// A `\x` escape together with the characters taken as its code.
    AsciiCodeEscape,
    /// A `\u` escape together with its `{...}` part, if there is one.
    UnicodeEscape,
}
92 | |||
93 | pub struct CharComponentIterator<'a> { | ||
94 | parser: Parser<'a>, | ||
95 | pub has_closing_quote: bool, | ||
96 | } | ||
97 | |||
98 | impl<'a> Iterator for CharComponentIterator<'a> { | ||
99 | type Item = CharComponent; | ||
100 | fn next(&mut self) -> Option<CharComponent> { | ||
101 | if self.parser.pos == 0 { | ||
102 | assert!( | ||
103 | self.parser.advance() == '\'', | ||
104 | "char literal should start with a quote" | ||
105 | ); | ||
106 | } | ||
107 | |||
108 | if let Some(component) = self.parser.parse_char_component() { | ||
109 | return Some(component); | ||
110 | } | ||
111 | |||
112 | // We get here when there are no char components left to parse | ||
113 | if self.parser.peek() == Some('\'') { | ||
114 | self.parser.advance(); | ||
115 | self.has_closing_quote = true; | ||
116 | } | ||
117 | |||
118 | assert!( | ||
119 | self.parser.peek() == None, | ||
120 | "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
121 | self.parser.src, | ||
122 | self.parser.pos, | ||
123 | self.parser.src.len() | ||
124 | ); | ||
125 | |||
126 | None | ||
127 | } | ||
128 | } | ||
129 | |||
130 | pub struct Parser<'a> { | ||
131 | src: &'a str, | ||
132 | pos: usize, | ||
133 | } | ||
134 | |||
135 | impl<'a> Parser<'a> { | ||
136 | pub fn new(src: &'a str) -> Parser<'a> { | ||
137 | Parser { src, pos: 0 } | ||
138 | } | ||
139 | |||
140 | // Utility methods | ||
141 | |||
142 | pub fn peek(&self) -> Option<char> { | ||
143 | if self.pos == self.src.len() { | ||
144 | return None; | ||
145 | } | ||
146 | |||
147 | self.src[self.pos..].chars().next() | ||
148 | } | ||
149 | |||
150 | pub fn advance(&mut self) -> char { | ||
151 | let next = self | ||
152 | .peek() | ||
153 | .expect("cannot advance if end of input is reached"); | ||
154 | self.pos += next.len_utf8(); | ||
155 | next | ||
156 | } | ||
157 | |||
158 | pub fn skip_whitespace(&mut self) { | ||
159 | while self.peek().map(|c| c.is_whitespace()) == Some(true) { | ||
160 | self.advance(); | ||
161 | } | ||
162 | } | ||
163 | |||
164 | pub fn get_pos(&self) -> TextUnit { | ||
165 | (self.pos as u32).into() | ||
166 | } | ||
167 | |||
168 | // Char parsing methods | ||
169 | |||
170 | fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { | ||
171 | match self.peek() { | ||
172 | Some('{') => { | ||
173 | self.advance(); | ||
174 | |||
175 | // Parse anything until we reach `}` | ||
176 | while let Some(next) = self.peek() { | ||
177 | self.advance(); | ||
178 | if next == '}' { | ||
179 | break; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | let end = self.get_pos(); | ||
184 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
185 | } | ||
186 | Some(_) | None => { | ||
187 | let end = self.get_pos(); | ||
188 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
189 | } | ||
190 | } | ||
191 | } | ||
192 | |||
193 | fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { | ||
194 | let code_start = self.get_pos(); | ||
195 | while let Some(next) = self.peek() { | ||
196 | if next == '\'' || (self.get_pos() - code_start == 2.into()) { | ||
197 | break; | ||
198 | } | ||
199 | |||
200 | self.advance(); | ||
201 | } | ||
202 | |||
203 | let end = self.get_pos(); | ||
204 | CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) | ||
205 | } | ||
206 | |||
207 | fn parse_escape(&mut self, start: TextUnit) -> CharComponent { | ||
208 | if self.peek().is_none() { | ||
209 | return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); | ||
210 | } | ||
211 | |||
212 | let next = self.advance(); | ||
213 | let end = self.get_pos(); | ||
214 | let range = TextRange::from_to(start, end); | ||
215 | match next { | ||
216 | 'x' => self.parse_ascii_code_escape(start), | ||
217 | 'u' => self.parse_unicode_escape(start), | ||
218 | _ => CharComponent::new(range, AsciiEscape), | ||
219 | } | ||
220 | } | ||
221 | |||
222 | pub fn parse_char_component(&mut self) -> Option<CharComponent> { | ||
223 | let next = self.peek()?; | ||
224 | |||
225 | // Ignore character close | ||
226 | if next == '\'' { | ||
227 | return None; | ||
228 | } | ||
229 | |||
230 | let start = self.get_pos(); | ||
231 | self.advance(); | ||
232 | |||
233 | if next == '\\' { | ||
234 | Some(self.parse_escape(start)) | ||
235 | } else { | ||
236 | let end = self.get_pos(); | ||
237 | Some(CharComponent::new( | ||
238 | TextRange::from_to(start, end), | ||
239 | CodePoint, | ||
240 | )) | ||
241 | } | ||
242 | } | ||
243 | |||
244 | pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { | ||
245 | // In string literals, when a `\` occurs immediately before the newline, the `\`, | ||
246 | // the newline, and all whitespace at the beginning of the next line are ignored | ||
247 | match self.peek() { | ||
248 | Some('\n') | Some('\r') => { | ||
249 | self.skip_whitespace(); | ||
250 | Some(StringComponent::new( | ||
251 | TextRange::from_to(start, self.get_pos()), | ||
252 | StringComponentKind::IgnoreNewline, | ||
253 | )) | ||
254 | } | ||
255 | _ => None, | ||
256 | } | ||
257 | } | ||
258 | |||
259 | pub fn parse_string_component(&mut self) -> Option<StringComponent> { | ||
260 | let next = self.peek()?; | ||
261 | |||
262 | // Ignore string close | ||
263 | if next == '"' { | ||
264 | return None; | ||
265 | } | ||
266 | |||
267 | let start = self.get_pos(); | ||
268 | self.advance(); | ||
269 | |||
270 | if next == '\\' { | ||
271 | // Strings can use `\` to ignore newlines, so we first try to parse one of those | ||
272 | // before falling back to parsing char escapes | ||
273 | self.parse_ignore_newline(start).or_else(|| { | ||
274 | let char_component = self.parse_escape(start); | ||
275 | Some(StringComponent::new( | ||
276 | char_component.range, | ||
277 | StringComponentKind::Char(char_component.kind), | ||
278 | )) | ||
279 | }) | ||
280 | } else { | ||
281 | let end = self.get_pos(); | ||
282 | Some(StringComponent::new( | ||
283 | TextRange::from_to(start, end), | ||
284 | StringComponentKind::Char(CodePoint), | ||
285 | )) | ||
286 | } | ||
287 | } | ||
288 | } | ||
289 | |||
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes the char literal `src` to the end. Returns whether a closing
    /// quote was seen, and all components in order.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut iter = super::parse_char_literal(src);
        let components: Vec<_> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Lexes `src`, which must yield exactly one component and no closing quote.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(!has_closing_quote, "char should not have closing quote");
        assert!(components.len() == 1);
        components[0].clone()
    }

    /// Lexes `src`, which must yield exactly one component and a closing quote.
    fn closed_char_component(src: &str) -> CharComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        assert!(
            components.len() == 1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components[0].clone()
    }

    /// Lexes `src`, which must have a closing quote, and returns every component.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and the closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range of everything after the opening quote of `src`, up to its end.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = closed_char_component(&escape_sequence);
            let expected_range = range_closed(&escape_sequence);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        // Without a `}` the escape runs to the end of the input, closing quote included
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&escape_sequence);
            let expected_range = range_unclosed(&escape_sequence);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_empty_char() {
        let (has_closing_quote, components) = parse("''");
        assert!(has_closing_quote, "char should have closing quote");
        assert!(components.len() == 0);
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert!(component.kind == CodePoint);
        assert!(component.range == TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let lit_text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&lit_text);
            assert!(component.kind == CharComponentKind::AsciiCodeEscape);
            assert!(component.range == range_closed(&lit_text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert!(components.len() == 2);
        assert!(components[1].kind == CharComponentKind::CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert!(component.kind == CharComponentKind::AsciiEscape);
            assert!(component.range == range_closed(&lit_text));
        }
    }

    #[test]
    fn test_no_escapes() {
        // Without a leading `\` these are all plain code points
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert!(component.kind == CharComponentKind::CodePoint);
            assert!(component.range == range_closed(&lit_text));
        }
    }
}