aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/validation
diff options
context:
space:
mode:
authorAleksey Kladov <[email protected]>2019-07-24 09:47:28 +0100
committerAleksey Kladov <[email protected]>2019-07-24 09:47:28 +0100
commit2473cb6a5cc2af6d703129adc01942b00c059810 (patch)
tree0b51a9d40266d0b89083199191573a8a4a194d99 /crates/ra_syntax/src/validation
parentc79eea9fc1d7c3400031674b7ebb2b0671aa05e8 (diff)
switch to upstream unescape
Diffstat (limited to 'crates/ra_syntax/src/validation')
-rw-r--r--crates/ra_syntax/src/validation/unescape.rs521
1 files changed, 0 insertions, 521 deletions
diff --git a/crates/ra_syntax/src/validation/unescape.rs b/crates/ra_syntax/src/validation/unescape.rs
deleted file mode 100644
index 7eed6c663..000000000
--- a/crates/ra_syntax/src/validation/unescape.rs
+++ /dev/null
@@ -1,521 +0,0 @@
1//! Utilities for validating string and char literals and turning them into
2//! values they represent.
3//!
4//! This file is copy-pasted from the compiler
5//!
6//! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
7//!
8//! Hopefully, we'll share this code in a proper way some day
9
10use std::ops::Range;
11use std::str::Chars;
12
13#[derive(Debug, PartialEq, Eq, Clone, Hash)]
14pub enum EscapeError {
15 ZeroChars,
16 MoreThanOneChar,
17
18 LoneSlash,
19 InvalidEscape,
20 BareCarriageReturn,
21 EscapeOnlyChar,
22
23 TooShortHexEscape,
24 InvalidCharInHexEscape,
25 OutOfRangeHexEscape,
26
27 NoBraceInUnicodeEscape,
28 InvalidCharInUnicodeEscape,
29 EmptyUnicodeEscape,
30 UnclosedUnicodeEscape,
31 LeadingUnderscoreUnicodeEscape,
32 OverlongUnicodeEscape,
33 LoneSurrogateUnicodeEscape,
34 OutOfRangeUnicodeEscape,
35
36 UnicodeEscapeInByte,
37 NonAsciiCharInByte,
38}
39
40/// Takes a contents of a char literal (without quotes), and returns an
41/// unescaped char or an error
42pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
43 let mut chars = literal_text.chars();
44 unescape_char_or_byte(&mut chars, Mode::Char)
45 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
46}
47
48/// Takes a contents of a string literal (without quotes) and produces a
49/// sequence of escaped characters or errors.
50pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
51where
52 F: FnMut(Range<usize>, Result<char, EscapeError>),
53{
54 unescape_str_or_byte_str(literal_text, Mode::Str, callback)
55}
56
57pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
58 let mut chars = literal_text.chars();
59 unescape_char_or_byte(&mut chars, Mode::Byte)
60 .map(byte_from_char)
61 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
62}
63
64/// Takes a contents of a string literal (without quotes) and produces a
65/// sequence of escaped characters or errors.
66pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
67where
68 F: FnMut(Range<usize>, Result<u8, EscapeError>),
69{
70 unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
71 callback(range, char.map(byte_from_char))
72 })
73}
74
75#[derive(Debug, Clone, Copy)]
76pub(crate) enum Mode {
77 Char,
78 Str,
79 Byte,
80 ByteStr,
81}
82
83impl Mode {
84 fn in_single_quotes(self) -> bool {
85 match self {
86 Mode::Char | Mode::Byte => true,
87 Mode::Str | Mode::ByteStr => false,
88 }
89 }
90
91 pub(crate) fn in_double_quotes(self) -> bool {
92 !self.in_single_quotes()
93 }
94
95 pub(crate) fn is_bytes(self) -> bool {
96 match self {
97 Mode::Byte | Mode::ByteStr => true,
98 Mode::Char | Mode::Str => false,
99 }
100 }
101}
102
103fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
104 if first_char != '\\' {
105 return match first_char {
106 '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
107 '\r' => Err(if chars.clone().next() == Some('\n') {
108 EscapeError::EscapeOnlyChar
109 } else {
110 EscapeError::BareCarriageReturn
111 }),
112 '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
113 '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
114 _ => {
115 if mode.is_bytes() && !first_char.is_ascii() {
116 return Err(EscapeError::NonAsciiCharInByte);
117 }
118 Ok(first_char)
119 }
120 };
121 }
122
123 let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
124
125 let res = match second_char {
126 '"' => '"',
127 'n' => '\n',
128 'r' => '\r',
129 't' => '\t',
130 '\\' => '\\',
131 '\'' => '\'',
132 '0' => '\0',
133
134 'x' => {
135 let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
136 let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
137
138 let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
139 let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
140
141 let value = hi * 16 + lo;
142
143 if !mode.is_bytes() && !is_ascii(value) {
144 return Err(EscapeError::OutOfRangeHexEscape);
145 }
146 let value = value as u8;
147
148 value as char
149 }
150
151 'u' => {
152 if chars.next() != Some('{') {
153 return Err(EscapeError::NoBraceInUnicodeEscape);
154 }
155
156 let mut n_digits = 1;
157 let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
158 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
159 '}' => return Err(EscapeError::EmptyUnicodeEscape),
160 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
161 };
162
163 loop {
164 match chars.next() {
165 None => return Err(EscapeError::UnclosedUnicodeEscape),
166 Some('_') => continue,
167 Some('}') => {
168 if n_digits > 6 {
169 return Err(EscapeError::OverlongUnicodeEscape);
170 }
171 if mode.is_bytes() {
172 return Err(EscapeError::UnicodeEscapeInByte);
173 }
174
175 break std::char::from_u32(value).ok_or_else(|| {
176 if value > 0x0010_FFFF {
177 EscapeError::OutOfRangeUnicodeEscape
178 } else {
179 EscapeError::LoneSurrogateUnicodeEscape
180 }
181 })?;
182 }
183 Some(c) => {
184 let digit =
185 c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
186 n_digits += 1;
187 if n_digits > 6 {
188 continue;
189 }
190 let digit = digit as u32;
191 value = value * 16 + digit;
192 }
193 };
194 }
195 }
196 _ => return Err(EscapeError::InvalidEscape),
197 };
198 Ok(res)
199}
200
201fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
202 let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
203 let res = scan_escape(first_char, chars, mode)?;
204 if chars.next().is_some() {
205 return Err(EscapeError::MoreThanOneChar);
206 }
207 Ok(res)
208}
209
210/// Takes a contents of a string literal (without quotes) and produces a
211/// sequence of escaped characters or errors.
212fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
213where
214 F: FnMut(Range<usize>, Result<char, EscapeError>),
215{
216 assert!(mode.in_double_quotes());
217 let initial_len = src.len();
218 let mut chars = src.chars();
219 while let Some(first_char) = chars.next() {
220 let start = initial_len - chars.as_str().len() - first_char.len_utf8();
221
222 let unescaped_char = match first_char {
223 '\\' => {
224 let (second_char, third_char) = {
225 let mut chars = chars.clone();
226 (chars.next(), chars.next())
227 };
228 match (second_char, third_char) {
229 (Some('\n'), _) | (Some('\r'), Some('\n')) => {
230 skip_ascii_whitespace(&mut chars);
231 continue;
232 }
233 _ => scan_escape(first_char, &mut chars, mode),
234 }
235 }
236 '\r' => {
237 let second_char = chars.clone().next();
238 if second_char == Some('\n') {
239 chars.next();
240 Ok('\n')
241 } else {
242 scan_escape(first_char, &mut chars, mode)
243 }
244 }
245 '\n' => Ok('\n'),
246 '\t' => Ok('\t'),
247 _ => scan_escape(first_char, &mut chars, mode),
248 };
249 let end = initial_len - chars.as_str().len();
250 callback(start..end, unescaped_char);
251 }
252
253 fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
254 let str = chars.as_str();
255 let first_non_space = str
256 .bytes()
257 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
258 .unwrap_or_else(|| str.len());
259 *chars = str[first_non_space..].chars()
260 }
261}
262
263fn byte_from_char(c: char) -> u8 {
264 let res = c as u32;
265 assert!(res <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
266 res as u8
267}
268
269fn is_ascii(x: u32) -> bool {
270 x <= 0x7F
271}
272
273#[cfg(test)]
274mod tests {
275 use super::*;
276
277 #[test]
278 fn test_unescape_char_bad() {
279 fn check(literal_text: &str, expected_error: EscapeError) {
280 let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
281 assert_eq!(actual_result, Err(expected_error));
282 }
283
284 check("", EscapeError::ZeroChars);
285 check(r"\", EscapeError::LoneSlash);
286
287 check("\n", EscapeError::EscapeOnlyChar);
288 check("\r\n", EscapeError::EscapeOnlyChar);
289 check("\t", EscapeError::EscapeOnlyChar);
290 check("'", EscapeError::EscapeOnlyChar);
291 check("\r", EscapeError::BareCarriageReturn);
292
293 check("spam", EscapeError::MoreThanOneChar);
294 check(r"\x0ff", EscapeError::MoreThanOneChar);
295 check(r#"\"a"#, EscapeError::MoreThanOneChar);
296 check(r"\na", EscapeError::MoreThanOneChar);
297 check(r"\ra", EscapeError::MoreThanOneChar);
298 check(r"\ta", EscapeError::MoreThanOneChar);
299 check(r"\\a", EscapeError::MoreThanOneChar);
300 check(r"\'a", EscapeError::MoreThanOneChar);
301 check(r"\0a", EscapeError::MoreThanOneChar);
302 check(r"\u{0}x", EscapeError::MoreThanOneChar);
303 check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
304
305 check(r"\v", EscapeError::InvalidEscape);
306 check(r"\💩", EscapeError::InvalidEscape);
307 check(r"\●", EscapeError::InvalidEscape);
308
309 check(r"\x", EscapeError::TooShortHexEscape);
310 check(r"\x0", EscapeError::TooShortHexEscape);
311 check(r"\xf", EscapeError::TooShortHexEscape);
312 check(r"\xa", EscapeError::TooShortHexEscape);
313 check(r"\xx", EscapeError::InvalidCharInHexEscape);
314 check(r"\xы", EscapeError::InvalidCharInHexEscape);
315 check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
316 check(r"\xtt", EscapeError::InvalidCharInHexEscape);
317 check(r"\xff", EscapeError::OutOfRangeHexEscape);
318 check(r"\xFF", EscapeError::OutOfRangeHexEscape);
319 check(r"\x80", EscapeError::OutOfRangeHexEscape);
320
321 check(r"\u", EscapeError::NoBraceInUnicodeEscape);
322 check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
323 check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
324 check(r"\u{", EscapeError::UnclosedUnicodeEscape);
325 check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
326 check(r"\u{}", EscapeError::EmptyUnicodeEscape);
327 check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
328 check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
329 check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
330 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
331 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
332
333 check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
334 check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
335 check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
336
337 check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
338 check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
339 check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
340 }
341
342 #[test]
343 fn test_unescape_char_good() {
344 fn check(literal_text: &str, expected_char: char) {
345 let actual_result = unescape_char(literal_text);
346 assert_eq!(actual_result, Ok(expected_char));
347 }
348
349 check("a", 'a');
350 check("ы", 'ы');
351 check("🦀", '🦀');
352
353 check(r#"\""#, '"');
354 check(r"\n", '\n');
355 check(r"\r", '\r');
356 check(r"\t", '\t');
357 check(r"\\", '\\');
358 check(r"\'", '\'');
359 check(r"\0", '\0');
360
361 check(r"\x00", '\0');
362 check(r"\x5a", 'Z');
363 check(r"\x5A", 'Z');
364 check(r"\x7f", 127 as char);
365
366 check(r"\u{0}", '\0');
367 check(r"\u{000000}", '\0');
368 check(r"\u{41}", 'A');
369 check(r"\u{0041}", 'A');
370 check(r"\u{00_41}", 'A');
371 check(r"\u{4__1__}", 'A');
372 check(r"\u{1F63b}", '😻');
373 }
374
375 #[test]
376 fn test_unescape_str_good() {
377 fn check(literal_text: &str, expected: &str) {
378 let mut buf = Ok(String::with_capacity(literal_text.len()));
379 unescape_str(literal_text, &mut |range, c| {
380 if let Ok(b) = &mut buf {
381 match c {
382 Ok(c) => b.push(c),
383 Err(e) => buf = Err((range, e)),
384 }
385 }
386 });
387 let buf = buf.as_ref().map(|it| it.as_ref());
388 assert_eq!(buf, Ok(expected))
389 }
390
391 check("foo", "foo");
392 check("", "");
393 check(" \t\n\r\n", " \t\n\n");
394
395 check("hello \\\n world", "hello world");
396 check("hello \\\r\n world", "hello world");
397 check("thread's", "thread's")
398 }
399
400 #[test]
401 fn test_unescape_byte_bad() {
402 fn check(literal_text: &str, expected_error: EscapeError) {
403 let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
404 assert_eq!(actual_result, Err(expected_error));
405 }
406
407 check("", EscapeError::ZeroChars);
408 check(r"\", EscapeError::LoneSlash);
409
410 check("\n", EscapeError::EscapeOnlyChar);
411 check("\r\n", EscapeError::EscapeOnlyChar);
412 check("\t", EscapeError::EscapeOnlyChar);
413 check("'", EscapeError::EscapeOnlyChar);
414 check("\r", EscapeError::BareCarriageReturn);
415
416 check("spam", EscapeError::MoreThanOneChar);
417 check(r"\x0ff", EscapeError::MoreThanOneChar);
418 check(r#"\"a"#, EscapeError::MoreThanOneChar);
419 check(r"\na", EscapeError::MoreThanOneChar);
420 check(r"\ra", EscapeError::MoreThanOneChar);
421 check(r"\ta", EscapeError::MoreThanOneChar);
422 check(r"\\a", EscapeError::MoreThanOneChar);
423 check(r"\'a", EscapeError::MoreThanOneChar);
424 check(r"\0a", EscapeError::MoreThanOneChar);
425
426 check(r"\v", EscapeError::InvalidEscape);
427 check(r"\💩", EscapeError::InvalidEscape);
428 check(r"\●", EscapeError::InvalidEscape);
429
430 check(r"\x", EscapeError::TooShortHexEscape);
431 check(r"\x0", EscapeError::TooShortHexEscape);
432 check(r"\xa", EscapeError::TooShortHexEscape);
433 check(r"\xf", EscapeError::TooShortHexEscape);
434 check(r"\xx", EscapeError::InvalidCharInHexEscape);
435 check(r"\xы", EscapeError::InvalidCharInHexEscape);
436 check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
437 check(r"\xtt", EscapeError::InvalidCharInHexEscape);
438
439 check(r"\u", EscapeError::NoBraceInUnicodeEscape);
440 check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
441 check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
442 check(r"\u{", EscapeError::UnclosedUnicodeEscape);
443 check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
444 check(r"\u{}", EscapeError::EmptyUnicodeEscape);
445 check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
446 check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
447
448 check("ы", EscapeError::NonAsciiCharInByte);
449 check("🦀", EscapeError::NonAsciiCharInByte);
450
451 check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
452 check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
453 check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
454 check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
455 check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
456 check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
457 check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
458 check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
459 check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
460 check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
461 check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
462 check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
463 check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
464 check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
465 check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
466 check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
467 check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
468 check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
469 }
470
471 #[test]
472 fn test_unescape_byte_good() {
473 fn check(literal_text: &str, expected_byte: u8) {
474 let actual_result = unescape_byte(literal_text);
475 assert_eq!(actual_result, Ok(expected_byte));
476 }
477
478 check("a", b'a');
479
480 check(r#"\""#, b'"');
481 check(r"\n", b'\n');
482 check(r"\r", b'\r');
483 check(r"\t", b'\t');
484 check(r"\\", b'\\');
485 check(r"\'", b'\'');
486 check(r"\0", b'\0');
487
488 check(r"\x00", b'\0');
489 check(r"\x5a", b'Z');
490 check(r"\x5A", b'Z');
491 check(r"\x7f", 127);
492 check(r"\x80", 128);
493 check(r"\xff", 255);
494 check(r"\xFF", 255);
495 }
496
497 #[test]
498 fn test_unescape_byte_str_good() {
499 fn check(literal_text: &str, expected: &[u8]) {
500 let mut buf = Ok(Vec::with_capacity(literal_text.len()));
501 unescape_byte_str(literal_text, &mut |range, c| {
502 if let Ok(b) = &mut buf {
503 match c {
504 Ok(c) => b.push(c),
505 Err(e) => buf = Err((range, e)),
506 }
507 }
508 });
509 let buf = buf.as_ref().map(|it| it.as_ref());
510 assert_eq!(buf, Ok(expected))
511 }
512
513 check("foo", b"foo");
514 check("", b"");
515 check(" \t\n\r\n", b" \t\n\n");
516
517 check("hello \\\n world", b"hello world");
518 check("hello \\\r\n world", b"hello world");
519 check("thread's", b"thread's")
520 }
521}