diff options
author | bors[bot] <bors[bot]@users.noreply.github.com> | 2019-05-07 17:43:10 +0100 |
---|---|---|
committer | bors[bot] <bors[bot]@users.noreply.github.com> | 2019-05-07 17:43:10 +0100 |
commit | d3efedb752bb2198796603d8a479a5e3ee472a97 (patch) | |
tree | ca6a4aee6ad4077a869a932a18c6c8d134406f8c | |
parent | ef782adc293deb287128f005dbab2038ba3ccdc1 (diff) | |
parent | 313314e14b629ebf50389dbd2d440bda922f6ae7 (diff) |
Merge #1253
1253: Share literal validation logic with compiler r=matklad a=matklad
This is neat: the unescape module is literally what the compiler is using right now:
https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
So, yeah, code sharing via copy-paste!
Co-authored-by: Aleksey Kladov <[email protected]>
-rw-r--r-- | crates/ra_syntax/src/lib.rs | 1 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing.rs | 333 | ||||
-rw-r--r-- | crates/ra_syntax/src/syntax_error.rs | 104 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation.rs | 64 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte.rs | 199 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte_string.rs | 169 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 273 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/string.rs | 154 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/unescape.rs | 521 | ||||
-rw-r--r-- | crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt | 3 |
10 files changed, 620 insertions, 1201 deletions
diff --git a/crates/ra_syntax/src/lib.rs b/crates/ra_syntax/src/lib.rs index 9cb66b76b..39c25dbdc 100644 --- a/crates/ra_syntax/src/lib.rs +++ b/crates/ra_syntax/src/lib.rs | |||
@@ -23,7 +23,6 @@ mod syntax_node; | |||
23 | mod syntax_text; | 23 | mod syntax_text; |
24 | mod syntax_error; | 24 | mod syntax_error; |
25 | mod parsing; | 25 | mod parsing; |
26 | mod string_lexing; | ||
27 | mod validation; | 26 | mod validation; |
28 | mod ptr; | 27 | mod ptr; |
29 | 28 | ||
diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs deleted file mode 100644 index 4c3eea3d2..000000000 --- a/crates/ra_syntax/src/string_lexing.rs +++ /dev/null | |||
@@ -1,333 +0,0 @@ | |||
1 | use crate::{TextRange, TextUnit}; | ||
2 | use self::StringComponentKind::*; | ||
3 | |||
4 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
5 | pub(crate) struct StringComponent { | ||
6 | pub(crate) range: TextRange, | ||
7 | pub(crate) kind: StringComponentKind, | ||
8 | } | ||
9 | |||
10 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
11 | pub(crate) enum StringComponentKind { | ||
12 | IgnoreNewline, | ||
13 | CodePoint, | ||
14 | AsciiEscape, | ||
15 | AsciiCodeEscape, | ||
16 | UnicodeEscape, | ||
17 | } | ||
18 | |||
19 | pub(crate) fn parse_quoted_literal( | ||
20 | prefix: Option<char>, | ||
21 | quote: char, | ||
22 | src: &str, | ||
23 | ) -> StringComponentIter { | ||
24 | let prefix = prefix.map(|p| match p { | ||
25 | 'b' => b'b', | ||
26 | _ => panic!("invalid prefix"), | ||
27 | }); | ||
28 | let quote = match quote { | ||
29 | '\'' => b'\'', | ||
30 | '"' => b'"', | ||
31 | _ => panic!("invalid quote"), | ||
32 | }; | ||
33 | StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None } | ||
34 | } | ||
35 | |||
36 | pub(crate) struct StringComponentIter<'a> { | ||
37 | src: &'a str, | ||
38 | prefix: Option<u8>, | ||
39 | quote: u8, | ||
40 | pos: usize, | ||
41 | pub(crate) has_closing_quote: bool, | ||
42 | pub(crate) suffix: Option<TextRange>, | ||
43 | } | ||
44 | |||
45 | impl<'a> Iterator for StringComponentIter<'a> { | ||
46 | type Item = StringComponent; | ||
47 | fn next(&mut self) -> Option<StringComponent> { | ||
48 | if self.pos == 0 { | ||
49 | if let Some(prefix) = self.prefix { | ||
50 | assert!( | ||
51 | self.advance() == prefix as char, | ||
52 | "literal should start with a {:?}", | ||
53 | prefix as char, | ||
54 | ); | ||
55 | } | ||
56 | assert!( | ||
57 | self.advance() == self.quote as char, | ||
58 | "literal should start with a {:?}", | ||
59 | self.quote as char, | ||
60 | ); | ||
61 | } | ||
62 | |||
63 | if let Some(component) = self.parse_component() { | ||
64 | return Some(component); | ||
65 | } | ||
66 | |||
67 | // We get here when there are no char components left to parse | ||
68 | if self.peek() == Some(self.quote as char) { | ||
69 | self.advance(); | ||
70 | self.has_closing_quote = true; | ||
71 | if let Some(range) = self.parse_suffix() { | ||
72 | self.suffix = Some(range); | ||
73 | } | ||
74 | } | ||
75 | |||
76 | assert!( | ||
77 | self.peek() == None, | ||
78 | "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}", | ||
79 | self.src, | ||
80 | self.pos, | ||
81 | self.src.len() | ||
82 | ); | ||
83 | |||
84 | None | ||
85 | } | ||
86 | } | ||
87 | |||
88 | impl<'a> StringComponentIter<'a> { | ||
89 | fn peek(&self) -> Option<char> { | ||
90 | if self.pos == self.src.len() { | ||
91 | return None; | ||
92 | } | ||
93 | |||
94 | self.src[self.pos..].chars().next() | ||
95 | } | ||
96 | |||
97 | fn advance(&mut self) -> char { | ||
98 | let next = self.peek().expect("cannot advance if end of input is reached"); | ||
99 | self.pos += next.len_utf8(); | ||
100 | next | ||
101 | } | ||
102 | |||
103 | fn parse_component(&mut self) -> Option<StringComponent> { | ||
104 | let next = self.peek()?; | ||
105 | |||
106 | // Ignore string close | ||
107 | if next == self.quote as char { | ||
108 | return None; | ||
109 | } | ||
110 | |||
111 | let start = self.start_range(); | ||
112 | self.advance(); | ||
113 | |||
114 | if next == '\\' { | ||
115 | // Strings can use `\` to ignore newlines, so we first try to parse one of those | ||
116 | // before falling back to parsing char escapes | ||
117 | if self.quote == b'"' { | ||
118 | if let Some(component) = self.parse_ignore_newline(start) { | ||
119 | return Some(component); | ||
120 | } | ||
121 | } | ||
122 | |||
123 | Some(self.parse_escape(start)) | ||
124 | } else { | ||
125 | Some(self.finish_component(start, CodePoint)) | ||
126 | } | ||
127 | } | ||
128 | |||
129 | fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { | ||
130 | // In string literals, when a `\` occurs immediately before the newline, the `\`, | ||
131 | // the newline, and all whitespace at the beginning of the next line are ignored | ||
132 | match self.peek() { | ||
133 | Some('\n') | Some('\r') => { | ||
134 | self.skip_whitespace(); | ||
135 | Some(self.finish_component(start, IgnoreNewline)) | ||
136 | } | ||
137 | _ => None, | ||
138 | } | ||
139 | } | ||
140 | |||
141 | fn skip_whitespace(&mut self) { | ||
142 | while self.peek().map(|c| c.is_whitespace()) == Some(true) { | ||
143 | self.advance(); | ||
144 | } | ||
145 | } | ||
146 | |||
147 | fn parse_escape(&mut self, start: TextUnit) -> StringComponent { | ||
148 | if self.peek().is_none() { | ||
149 | return self.finish_component(start, AsciiEscape); | ||
150 | } | ||
151 | |||
152 | let next = self.advance(); | ||
153 | match next { | ||
154 | 'x' => self.parse_ascii_code_escape(start), | ||
155 | 'u' => self.parse_unicode_escape(start), | ||
156 | _ => self.finish_component(start, AsciiEscape), | ||
157 | } | ||
158 | } | ||
159 | |||
160 | fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent { | ||
161 | match self.peek() { | ||
162 | Some('{') => { | ||
163 | self.advance(); | ||
164 | |||
165 | // Parse anything until we reach `}` | ||
166 | while let Some(next) = self.peek() { | ||
167 | self.advance(); | ||
168 | if next == '}' { | ||
169 | break; | ||
170 | } | ||
171 | } | ||
172 | |||
173 | self.finish_component(start, UnicodeEscape) | ||
174 | } | ||
175 | Some(_) | None => self.finish_component(start, UnicodeEscape), | ||
176 | } | ||
177 | } | ||
178 | |||
179 | fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent { | ||
180 | let code_start = self.pos; | ||
181 | while let Some(next) = self.peek() { | ||
182 | if next == '\'' || (self.pos - code_start == 2) { | ||
183 | break; | ||
184 | } | ||
185 | |||
186 | self.advance(); | ||
187 | } | ||
188 | self.finish_component(start, AsciiCodeEscape) | ||
189 | } | ||
190 | |||
191 | fn parse_suffix(&mut self) -> Option<TextRange> { | ||
192 | let start = self.start_range(); | ||
193 | let _ = self.peek()?; | ||
194 | while let Some(_) = self.peek() { | ||
195 | self.advance(); | ||
196 | } | ||
197 | Some(self.finish_range(start)) | ||
198 | } | ||
199 | |||
200 | fn start_range(&self) -> TextUnit { | ||
201 | TextUnit::from_usize(self.pos) | ||
202 | } | ||
203 | |||
204 | fn finish_range(&self, start: TextUnit) -> TextRange { | ||
205 | TextRange::from_to(start, TextUnit::from_usize(self.pos)) | ||
206 | } | ||
207 | |||
208 | fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent { | ||
209 | let range = self.finish_range(start); | ||
210 | StringComponent { range, kind } | ||
211 | } | ||
212 | } | ||
213 | |||
214 | #[cfg(test)] | ||
215 | mod tests { | ||
216 | use super::*; | ||
217 | |||
218 | fn parse(src: &str) -> (bool, Vec<StringComponent>) { | ||
219 | let component_iterator = &mut parse_quoted_literal(None, '\'', src); | ||
220 | let components: Vec<_> = component_iterator.collect(); | ||
221 | (component_iterator.has_closing_quote, components) | ||
222 | } | ||
223 | |||
224 | fn unclosed_char_component(src: &str) -> StringComponent { | ||
225 | let (has_closing_quote, components) = parse(src); | ||
226 | assert!(!has_closing_quote, "char should not have closing quote"); | ||
227 | assert!(components.len() == 1); | ||
228 | components[0].clone() | ||
229 | } | ||
230 | |||
231 | fn closed_char_component(src: &str) -> StringComponent { | ||
232 | let (has_closing_quote, components) = parse(src); | ||
233 | assert!(has_closing_quote, "char should have closing quote"); | ||
234 | assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components); | ||
235 | components[0].clone() | ||
236 | } | ||
237 | |||
238 | fn closed_char_components(src: &str) -> Vec<StringComponent> { | ||
239 | let (has_closing_quote, components) = parse(src); | ||
240 | assert!(has_closing_quote, "char should have closing quote"); | ||
241 | components | ||
242 | } | ||
243 | |||
244 | fn range_closed(src: &str) -> TextRange { | ||
245 | TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) | ||
246 | } | ||
247 | |||
248 | fn range_unclosed(src: &str) -> TextRange { | ||
249 | TextRange::from_to(1.into(), (src.len() as u32).into()) | ||
250 | } | ||
251 | |||
252 | #[test] | ||
253 | fn test_unicode_escapes() { | ||
254 | let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; | ||
255 | for escape in unicode_escapes { | ||
256 | let escape_sequence = format!(r"'\u{}'", escape); | ||
257 | let component = closed_char_component(&escape_sequence); | ||
258 | let expected_range = range_closed(&escape_sequence); | ||
259 | assert_eq!(component.kind, UnicodeEscape); | ||
260 | assert_eq!(component.range, expected_range); | ||
261 | } | ||
262 | } | ||
263 | |||
264 | #[test] | ||
265 | fn test_unicode_escapes_unclosed() { | ||
266 | let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; | ||
267 | for escape in unicode_escapes { | ||
268 | let escape_sequence = format!(r"'\u{}'", escape); | ||
269 | let component = unclosed_char_component(&escape_sequence); | ||
270 | let expected_range = range_unclosed(&escape_sequence); | ||
271 | assert_eq!(component.kind, UnicodeEscape); | ||
272 | assert_eq!(component.range, expected_range); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | #[test] | ||
277 | fn test_empty_char() { | ||
278 | let (has_closing_quote, components) = parse("''"); | ||
279 | assert!(has_closing_quote, "char should have closing quote"); | ||
280 | assert!(components.len() == 0); | ||
281 | } | ||
282 | |||
283 | #[test] | ||
284 | fn test_unclosed_char() { | ||
285 | let component = unclosed_char_component("'a"); | ||
286 | assert!(component.kind == CodePoint); | ||
287 | assert!(component.range == TextRange::from_to(1.into(), 2.into())); | ||
288 | } | ||
289 | |||
290 | #[test] | ||
291 | fn test_digit_escapes() { | ||
292 | let literals = &[r"", r"5", r"55"]; | ||
293 | |||
294 | for literal in literals { | ||
295 | let lit_text = format!(r"'\x{}'", literal); | ||
296 | let component = closed_char_component(&lit_text); | ||
297 | assert!(component.kind == AsciiCodeEscape); | ||
298 | assert!(component.range == range_closed(&lit_text)); | ||
299 | } | ||
300 | |||
301 | // More than 2 digits starts a new codepoint | ||
302 | let components = closed_char_components(r"'\x555'"); | ||
303 | assert!(components.len() == 2); | ||
304 | assert!(components[1].kind == CodePoint); | ||
305 | } | ||
306 | |||
307 | #[test] | ||
308 | fn test_ascii_escapes() { | ||
309 | let literals = &[ | ||
310 | r"\'", "\\\"", // equivalent to \" | ||
311 | r"\n", r"\r", r"\t", r"\\", r"\0", | ||
312 | ]; | ||
313 | |||
314 | for literal in literals { | ||
315 | let lit_text = format!("'{}'", literal); | ||
316 | let component = closed_char_component(&lit_text); | ||
317 | assert!(component.kind == AsciiEscape); | ||
318 | assert!(component.range == range_closed(&lit_text)); | ||
319 | } | ||
320 | } | ||
321 | |||
322 | #[test] | ||
323 | fn test_no_escapes() { | ||
324 | let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; | ||
325 | |||
326 | for &literal in literals { | ||
327 | let lit_text = format!("'{}'", literal); | ||
328 | let component = closed_char_component(&lit_text); | ||
329 | assert!(component.kind == CodePoint); | ||
330 | assert!(component.range == range_closed(&lit_text)); | ||
331 | } | ||
332 | } | ||
333 | } | ||
diff --git a/crates/ra_syntax/src/syntax_error.rs b/crates/ra_syntax/src/syntax_error.rs index 4198eefdb..27e12293b 100644 --- a/crates/ra_syntax/src/syntax_error.rs +++ b/crates/ra_syntax/src/syntax_error.rs | |||
@@ -2,7 +2,10 @@ use std::fmt; | |||
2 | 2 | ||
3 | use ra_parser::ParseError; | 3 | use ra_parser::ParseError; |
4 | 4 | ||
5 | use crate::{TextRange, TextUnit}; | 5 | use crate::{ |
6 | TextRange, TextUnit, | ||
7 | validation::EscapeError, | ||
8 | }; | ||
6 | 9 | ||
7 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] | 10 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] |
8 | pub struct SyntaxError { | 11 | pub struct SyntaxError { |
@@ -67,32 +70,7 @@ impl fmt::Display for SyntaxError { | |||
67 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] | 70 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] |
68 | pub enum SyntaxErrorKind { | 71 | pub enum SyntaxErrorKind { |
69 | ParseError(ParseError), | 72 | ParseError(ParseError), |
70 | UnescapedCodepoint, | 73 | EscapeError(EscapeError), |
71 | EmptyChar, | ||
72 | UnclosedChar, | ||
73 | OverlongChar, | ||
74 | EmptyByte, | ||
75 | UnclosedByte, | ||
76 | OverlongByte, | ||
77 | ByteOutOfRange, | ||
78 | UnescapedByte, | ||
79 | EmptyByteEscape, | ||
80 | InvalidByteEscape, | ||
81 | TooShortByteCodeEscape, | ||
82 | MalformedByteCodeEscape, | ||
83 | UnicodeEscapeForbidden, | ||
84 | EmptyAsciiEscape, | ||
85 | InvalidAsciiEscape, | ||
86 | TooShortAsciiCodeEscape, | ||
87 | AsciiCodeEscapeOutOfRange, | ||
88 | MalformedAsciiCodeEscape, | ||
89 | UnclosedUnicodeEscape, | ||
90 | MalformedUnicodeEscape, | ||
91 | EmptyUnicodeEcape, | ||
92 | OverlongUnicodeEscape, | ||
93 | UnicodeEscapeOutOfRange, | ||
94 | UnclosedString, | ||
95 | InvalidSuffix, | ||
96 | InvalidBlockAttr, | 74 | InvalidBlockAttr, |
97 | InvalidMatchInnerAttr, | 75 | InvalidMatchInnerAttr, |
98 | InvalidTupleIndexFormat, | 76 | InvalidTupleIndexFormat, |
@@ -102,38 +80,6 @@ impl fmt::Display for SyntaxErrorKind { | |||
102 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | 80 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
103 | use self::SyntaxErrorKind::*; | 81 | use self::SyntaxErrorKind::*; |
104 | match self { | 82 | match self { |
105 | UnescapedCodepoint => write!(f, "This codepoint should always be escaped"), | ||
106 | EmptyAsciiEscape => write!(f, "Empty escape sequence"), | ||
107 | InvalidAsciiEscape => write!(f, "Invalid escape sequence"), | ||
108 | EmptyChar => write!(f, "Empty char literal"), | ||
109 | UnclosedChar => write!(f, "Unclosed char literal"), | ||
110 | OverlongChar => write!(f, "Char literal should be one character long"), | ||
111 | EmptyByte => write!(f, "Empty byte literal"), | ||
112 | UnclosedByte => write!(f, "Unclosed byte literal"), | ||
113 | OverlongByte => write!(f, "Byte literal should be one character long"), | ||
114 | ByteOutOfRange => write!(f, "Byte should be a valid ASCII character"), | ||
115 | UnescapedByte => write!(f, "This byte should always be escaped"), | ||
116 | EmptyByteEscape => write!(f, "Empty escape sequence"), | ||
117 | InvalidByteEscape => write!(f, "Invalid escape sequence"), | ||
118 | TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"), | ||
119 | MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), | ||
120 | UnicodeEscapeForbidden => { | ||
121 | write!(f, "Unicode escapes are not allowed in byte literals or byte strings") | ||
122 | } | ||
123 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), | ||
124 | AsciiCodeEscapeOutOfRange => { | ||
125 | write!(f, "Escape sequence should be between \\x00 and \\x7F") | ||
126 | } | ||
127 | MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), | ||
128 | UnclosedUnicodeEscape => write!(f, "Missing `}}`"), | ||
129 | MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"), | ||
130 | EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"), | ||
131 | OverlongUnicodeEscape => { | ||
132 | write!(f, "Unicode escape sequence should have at most 6 digits") | ||
133 | } | ||
134 | UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"), | ||
135 | UnclosedString => write!(f, "Unclosed string literal"), | ||
136 | InvalidSuffix => write!(f, "Invalid literal suffix"), | ||
137 | InvalidBlockAttr => { | 83 | InvalidBlockAttr => { |
138 | write!(f, "A block in this position cannot accept inner attributes") | 84 | write!(f, "A block in this position cannot accept inner attributes") |
139 | } | 85 | } |
@@ -144,6 +90,46 @@ impl fmt::Display for SyntaxErrorKind { | |||
144 | write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix") | 90 | write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix") |
145 | } | 91 | } |
146 | ParseError(msg) => write!(f, "{}", msg.0), | 92 | ParseError(msg) => write!(f, "{}", msg.0), |
93 | EscapeError(err) => write!(f, "{}", err), | ||
147 | } | 94 | } |
148 | } | 95 | } |
149 | } | 96 | } |
97 | |||
98 | impl fmt::Display for EscapeError { | ||
99 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
100 | let msg = match self { | ||
101 | EscapeError::ZeroChars => "Empty literal", | ||
102 | EscapeError::MoreThanOneChar => "Literal should be one character long", | ||
103 | EscapeError::LoneSlash => "Character must be escaped: '\\'", | ||
104 | EscapeError::InvalidEscape => "Invalid escape sequence", | ||
105 | EscapeError::BareCarriageReturn => "Character must be escaped: '\r'", | ||
106 | EscapeError::EscapeOnlyChar => "Character must be escaped", | ||
107 | EscapeError::TooShortHexEscape => "Escape sequence should have two digits", | ||
108 | EscapeError::InvalidCharInHexEscape => "Escape sequence should be a hexadecimal number", | ||
109 | EscapeError::OutOfRangeHexEscape => "Escape sequence should be ASCII", | ||
110 | EscapeError::NoBraceInUnicodeEscape => "Invalid escape sequence", | ||
111 | EscapeError::InvalidCharInUnicodeEscape => "Invalid escape sequence", | ||
112 | EscapeError::EmptyUnicodeEscape => "Invalid escape sequence", | ||
113 | EscapeError::UnclosedUnicodeEscape => "Missing '}'", | ||
114 | EscapeError::LeadingUnderscoreUnicodeEscape => "Invalid escape sequence", | ||
115 | EscapeError::OverlongUnicodeEscape => { | ||
116 | "Unicode escape sequence should have at most 6 digits" | ||
117 | } | ||
118 | EscapeError::LoneSurrogateUnicodeEscape => { | ||
119 | "Unicode escape code should not be a surrogate" | ||
120 | } | ||
121 | EscapeError::OutOfRangeUnicodeEscape => { | ||
122 | "Unicode escape code should be at most 0x10FFFF" | ||
123 | } | ||
124 | EscapeError::UnicodeEscapeInByte => "Unicode escapes are not allowed in bytes", | ||
125 | EscapeError::NonAsciiCharInByte => "Non ASCII characters are not allowed in bytes", | ||
126 | }; | ||
127 | write!(f, "{}", msg) | ||
128 | } | ||
129 | } | ||
130 | |||
131 | impl From<EscapeError> for SyntaxErrorKind { | ||
132 | fn from(err: EscapeError) -> Self { | ||
133 | SyntaxErrorKind::EscapeError(err) | ||
134 | } | ||
135 | } | ||
diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs index c2f545173..11a1fb4a7 100644 --- a/crates/ra_syntax/src/validation.rs +++ b/crates/ra_syntax/src/validation.rs | |||
@@ -1,17 +1,17 @@ | |||
1 | mod byte; | 1 | mod unescape; |
2 | mod byte_string; | 2 | |
3 | mod char; | ||
4 | mod string; | ||
5 | mod block; | 3 | mod block; |
6 | mod field_expr; | 4 | mod field_expr; |
7 | 5 | ||
8 | use crate::{ | 6 | use crate::{ |
9 | SourceFile, SyntaxError, AstNode, SyntaxNode, | 7 | SourceFile, SyntaxError, AstNode, SyntaxNode, TextUnit, |
10 | SyntaxKind::{L_CURLY, R_CURLY, BYTE, BYTE_STRING, STRING, CHAR}, | 8 | SyntaxKind::{L_CURLY, R_CURLY, BYTE, BYTE_STRING, STRING, CHAR}, |
11 | ast, | 9 | ast, |
12 | algo::visit::{visitor_ctx, VisitorCtx}, | 10 | algo::visit::{visitor_ctx, VisitorCtx}, |
13 | }; | 11 | }; |
14 | 12 | ||
13 | pub(crate) use unescape::EscapeError; | ||
14 | |||
15 | pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> { | 15 | pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> { |
16 | let mut errors = Vec::new(); | 16 | let mut errors = Vec::new(); |
17 | for node in file.syntax().descendants() { | 17 | for node in file.syntax().descendants() { |
@@ -26,11 +26,55 @@ pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> { | |||
26 | 26 | ||
27 | // FIXME: kill duplication | 27 | // FIXME: kill duplication |
28 | fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) { | 28 | fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) { |
29 | match literal.token().kind() { | 29 | let token = literal.token(); |
30 | BYTE => byte::validate_byte_node(literal.token(), acc), | 30 | let text = token.text().as_str(); |
31 | BYTE_STRING => byte_string::validate_byte_string_node(literal.token(), acc), | 31 | match token.kind() { |
32 | STRING => string::validate_string_node(literal.token(), acc), | 32 | BYTE => { |
33 | CHAR => char::validate_char_node(literal.token(), acc), | 33 | if let Some(end) = text.rfind('\'') { |
34 | if let Some(without_quotes) = text.get(2..end) { | ||
35 | if let Err((off, err)) = unescape::unescape_byte(without_quotes) { | ||
36 | let off = token.range().start() + TextUnit::from_usize(off + 2); | ||
37 | acc.push(SyntaxError::new(err.into(), off)) | ||
38 | } | ||
39 | } | ||
40 | } | ||
41 | } | ||
42 | CHAR => { | ||
43 | if let Some(end) = text.rfind('\'') { | ||
44 | if let Some(without_quotes) = text.get(1..end) { | ||
45 | if let Err((off, err)) = unescape::unescape_char(without_quotes) { | ||
46 | let off = token.range().start() + TextUnit::from_usize(off + 1); | ||
47 | acc.push(SyntaxError::new(err.into(), off)) | ||
48 | } | ||
49 | } | ||
50 | } | ||
51 | } | ||
52 | BYTE_STRING => { | ||
53 | if let Some(end) = text.rfind('\"') { | ||
54 | if let Some(without_quotes) = text.get(2..end) { | ||
55 | unescape::unescape_byte_str(without_quotes, &mut |range, char| { | ||
56 | if let Err(err) = char { | ||
57 | let off = range.start; | ||
58 | let off = token.range().start() + TextUnit::from_usize(off + 2); | ||
59 | acc.push(SyntaxError::new(err.into(), off)) | ||
60 | } | ||
61 | }) | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | STRING => { | ||
66 | if let Some(end) = text.rfind('\"') { | ||
67 | if let Some(without_quotes) = text.get(1..end) { | ||
68 | unescape::unescape_str(without_quotes, &mut |range, char| { | ||
69 | if let Err(err) = char { | ||
70 | let off = range.start; | ||
71 | let off = token.range().start() + TextUnit::from_usize(off + 1); | ||
72 | acc.push(SyntaxError::new(err.into(), off)) | ||
73 | } | ||
74 | }) | ||
75 | } | ||
76 | } | ||
77 | } | ||
34 | _ => (), | 78 | _ => (), |
35 | } | 79 | } |
36 | } | 80 | } |
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs deleted file mode 100644 index f653e65d0..000000000 --- a/crates/ra_syntax/src/validation/byte.rs +++ /dev/null | |||
@@ -1,199 +0,0 @@ | |||
1 | //! Validation of byte literals | ||
2 | |||
3 | use crate::{ | ||
4 | string_lexing::{self, StringComponentKind}, | ||
5 | TextRange, | ||
6 | validation::char, | ||
7 | SyntaxError, | ||
8 | SyntaxErrorKind::*, | ||
9 | SyntaxToken, | ||
10 | }; | ||
11 | |||
12 | pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) { | ||
13 | let literal_text = node.text(); | ||
14 | let literal_range = node.range(); | ||
15 | let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text); | ||
16 | let mut len = 0; | ||
17 | for component in &mut components { | ||
18 | len += 1; | ||
19 | let text = &literal_text[component.range]; | ||
20 | let range = component.range + literal_range.start(); | ||
21 | validate_byte_component(text, component.kind, range, errors); | ||
22 | } | ||
23 | |||
24 | if !components.has_closing_quote { | ||
25 | errors.push(SyntaxError::new(UnclosedByte, literal_range)); | ||
26 | } | ||
27 | |||
28 | if let Some(range) = components.suffix { | ||
29 | errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); | ||
30 | } | ||
31 | |||
32 | if len == 0 { | ||
33 | errors.push(SyntaxError::new(EmptyByte, literal_range)); | ||
34 | } | ||
35 | |||
36 | if len > 1 { | ||
37 | errors.push(SyntaxError::new(OverlongByte, literal_range)); | ||
38 | } | ||
39 | } | ||
40 | |||
41 | pub(super) fn validate_byte_component( | ||
42 | text: &str, | ||
43 | kind: StringComponentKind, | ||
44 | range: TextRange, | ||
45 | errors: &mut Vec<SyntaxError>, | ||
46 | ) { | ||
47 | use self::StringComponentKind::*; | ||
48 | match kind { | ||
49 | AsciiEscape => validate_byte_escape(text, range, errors), | ||
50 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), | ||
51 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), | ||
52 | CodePoint => { | ||
53 | let c = text.chars().next().expect("Code points should be one character long"); | ||
54 | |||
55 | // These bytes must always be escaped | ||
56 | if c == '\t' || c == '\r' || c == '\n' { | ||
57 | errors.push(SyntaxError::new(UnescapedByte, range)); | ||
58 | } | ||
59 | |||
60 | // Only ASCII bytes are allowed | ||
61 | if c > 0x7F as char { | ||
62 | errors.push(SyntaxError::new(ByteOutOfRange, range)); | ||
63 | } | ||
64 | } | ||
65 | IgnoreNewline => { /* always valid */ } | ||
66 | } | ||
67 | } | ||
68 | |||
69 | fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
70 | if text.len() == 1 { | ||
71 | // Escape sequence consists only of leading `\` | ||
72 | errors.push(SyntaxError::new(EmptyByteEscape, range)); | ||
73 | } else { | ||
74 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
75 | if !char::is_ascii_escape(escape_code) { | ||
76 | errors.push(SyntaxError::new(InvalidByteEscape, range)); | ||
77 | } | ||
78 | } | ||
79 | } | ||
80 | |||
81 | fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
82 | // A ByteCodeEscape has 4 chars, example: `\xDD` | ||
83 | if !text.is_ascii() { | ||
84 | errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); | ||
85 | } else if text.chars().count() < 4 { | ||
86 | errors.push(SyntaxError::new(TooShortByteCodeEscape, range)); | ||
87 | } else { | ||
88 | assert!(text.chars().count() == 4, "ByteCodeEscape cannot be longer than 4 chars"); | ||
89 | |||
90 | if u8::from_str_radix(&text[2..], 16).is_err() { | ||
91 | errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); | ||
92 | } | ||
93 | } | ||
94 | } | ||
95 | |||
96 | #[cfg(test)] | ||
97 | mod test { | ||
98 | use crate::{SourceFile, TreeArc}; | ||
99 | |||
100 | fn build_file(literal: &str) -> TreeArc<SourceFile> { | ||
101 | let src = format!("const C: u8 = b'{}';", literal); | ||
102 | SourceFile::parse(&src) | ||
103 | } | ||
104 | |||
105 | fn assert_valid_byte(literal: &str) { | ||
106 | let file = build_file(literal); | ||
107 | assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); | ||
108 | } | ||
109 | |||
110 | fn assert_invalid_byte(literal: &str) { | ||
111 | let file = build_file(literal); | ||
112 | assert!(file.errors().len() > 0); | ||
113 | } | ||
114 | |||
115 | #[test] | ||
116 | fn test_ansi_codepoints() { | ||
117 | for byte in 0..128 { | ||
118 | match byte { | ||
119 | b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()), | ||
120 | b'\'' | b'\\' => { /* Ignore character close and backslash */ } | ||
121 | _ => assert_valid_byte(&(byte as char).to_string()), | ||
122 | } | ||
123 | } | ||
124 | |||
125 | for byte in 128..=255u8 { | ||
126 | assert_invalid_byte(&(byte as char).to_string()); | ||
127 | } | ||
128 | } | ||
129 | |||
130 | #[test] | ||
131 | fn test_unicode_codepoints() { | ||
132 | let invalid = ["Ƒ", "バ", "メ", "﷽"]; | ||
133 | for c in &invalid { | ||
134 | assert_invalid_byte(c); | ||
135 | } | ||
136 | } | ||
137 | |||
138 | #[test] | ||
139 | fn test_unicode_multiple_codepoints() { | ||
140 | let invalid = ["नी", "👨👨"]; | ||
141 | for c in &invalid { | ||
142 | assert_invalid_byte(c); | ||
143 | } | ||
144 | } | ||
145 | |||
146 | #[test] | ||
147 | fn test_valid_byte_escape() { | ||
148 | let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; | ||
149 | for c in &valid { | ||
150 | assert_valid_byte(c); | ||
151 | } | ||
152 | } | ||
153 | |||
154 | #[test] | ||
155 | fn test_invalid_byte_escape() { | ||
156 | let invalid = [r"\a", r"\?", r"\"]; | ||
157 | for c in &invalid { | ||
158 | assert_invalid_byte(c); | ||
159 | } | ||
160 | } | ||
161 | |||
162 | #[test] | ||
163 | fn test_valid_byte_code_escape() { | ||
164 | let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; | ||
165 | for c in &valid { | ||
166 | assert_valid_byte(c); | ||
167 | } | ||
168 | } | ||
169 | |||
170 | #[test] | ||
171 | fn test_invalid_byte_code_escape() { | ||
172 | let invalid = [r"\x", r"\x7"]; | ||
173 | for c in &invalid { | ||
174 | assert_invalid_byte(c); | ||
175 | } | ||
176 | } | ||
177 | |||
178 | #[test] | ||
179 | fn test_invalid_unicode_escape() { | ||
180 | let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; | ||
181 | for c in &well_formed { | ||
182 | assert_invalid_byte(c); | ||
183 | } | ||
184 | |||
185 | let invalid = [ | ||
186 | r"\u", | ||
187 | r"\u{}", | ||
188 | r"\u{", | ||
189 | r"\u{FF", | ||
190 | r"\u{FFFFFF}", | ||
191 | r"\u{_F}", | ||
192 | r"\u{00FFFFF}", | ||
193 | r"\u{110000}", | ||
194 | ]; | ||
195 | for c in &invalid { | ||
196 | assert_invalid_byte(c); | ||
197 | } | ||
198 | } | ||
199 | } | ||
diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs deleted file mode 100644 index 1d48c2d9b..000000000 --- a/crates/ra_syntax/src/validation/byte_string.rs +++ /dev/null | |||
@@ -1,169 +0,0 @@ | |||
1 | use crate::{ | ||
2 | string_lexing::{self, StringComponentKind}, | ||
3 | SyntaxError, | ||
4 | SyntaxErrorKind::*, | ||
5 | SyntaxToken, | ||
6 | }; | ||
7 | |||
8 | use super::byte; | ||
9 | |||
10 | pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) { | ||
11 | let literal_text = node.text(); | ||
12 | let literal_range = node.range(); | ||
13 | let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text); | ||
14 | for component in &mut components { | ||
15 | let range = component.range + literal_range.start(); | ||
16 | |||
17 | match component.kind { | ||
18 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
19 | _ => { | ||
20 | // Chars must escape \t, \n and \r codepoints, but strings don't | ||
21 | let text = &literal_text[component.range]; | ||
22 | match text { | ||
23 | "\t" | "\n" | "\r" => { /* always valid */ } | ||
24 | _ => byte::validate_byte_component(text, component.kind, range, errors), | ||
25 | } | ||
26 | } | ||
27 | } | ||
28 | } | ||
29 | |||
30 | if !components.has_closing_quote { | ||
31 | errors.push(SyntaxError::new(UnclosedString, literal_range)); | ||
32 | } | ||
33 | |||
34 | if let Some(range) = components.suffix { | ||
35 | errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); | ||
36 | } | ||
37 | } | ||
38 | |||
39 | #[cfg(test)] | ||
40 | mod test { | ||
41 | use crate::{SourceFile, TreeArc}; | ||
42 | |||
43 | fn build_file(literal: &str) -> TreeArc<SourceFile> { | ||
44 | let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal); | ||
45 | println!("Source: {}", src); | ||
46 | SourceFile::parse(&src) | ||
47 | } | ||
48 | |||
49 | fn assert_valid_str(literal: &str) { | ||
50 | let file = build_file(literal); | ||
51 | assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); | ||
52 | } | ||
53 | |||
54 | fn assert_invalid_str(literal: &str) { | ||
55 | let file = build_file(literal); | ||
56 | assert!(file.errors().len() > 0); | ||
57 | } | ||
58 | |||
59 | #[test] | ||
60 | fn test_ansi_codepoints() { | ||
61 | for byte in 0..128 { | ||
62 | match byte { | ||
63 | b'\"' | b'\\' => { /* Ignore string close and backslash */ } | ||
64 | _ => assert_valid_str(&(byte as char).to_string()), | ||
65 | } | ||
66 | } | ||
67 | |||
68 | for byte in 128..=255u8 { | ||
69 | assert_invalid_str(&(byte as char).to_string()); | ||
70 | } | ||
71 | } | ||
72 | |||
73 | #[test] | ||
74 | fn test_unicode_codepoints() { | ||
75 | let invalid = ["Ƒ", "バ", "メ", "﷽"]; | ||
76 | for c in &invalid { | ||
77 | assert_invalid_str(c); | ||
78 | } | ||
79 | } | ||
80 | |||
81 | #[test] | ||
82 | fn test_unicode_multiple_codepoints() { | ||
83 | let invalid = ["नी", "👨👨"]; | ||
84 | for c in &invalid { | ||
85 | assert_invalid_str(c); | ||
86 | } | ||
87 | } | ||
88 | |||
89 | #[test] | ||
90 | fn test_valid_ascii_escape() { | ||
91 | let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; | ||
92 | for c in &valid { | ||
93 | assert_valid_str(c); | ||
94 | } | ||
95 | } | ||
96 | |||
97 | #[test] | ||
98 | fn test_invalid_ascii_escape() { | ||
99 | let invalid = [r"\a", r"\?", r"\"]; | ||
100 | for c in &invalid { | ||
101 | assert_invalid_str(c); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | #[test] | ||
106 | fn test_valid_ascii_code_escape() { | ||
107 | let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; | ||
108 | for c in &valid { | ||
109 | assert_valid_str(c); | ||
110 | } | ||
111 | } | ||
112 | |||
113 | #[test] | ||
114 | fn test_invalid_ascii_code_escape() { | ||
115 | let invalid = [r"\x", r"\x7"]; | ||
116 | for c in &invalid { | ||
117 | assert_invalid_str(c); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | #[test] | ||
122 | fn test_invalid_unicode_escape() { | ||
123 | let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; | ||
124 | for c in &well_formed { | ||
125 | assert_invalid_str(c); | ||
126 | } | ||
127 | |||
128 | let invalid = [ | ||
129 | r"\u", | ||
130 | r"\u{}", | ||
131 | r"\u{", | ||
132 | r"\u{FF", | ||
133 | r"\u{FFFFFF}", | ||
134 | r"\u{_F}", | ||
135 | r"\u{00FFFFF}", | ||
136 | r"\u{110000}", | ||
137 | ]; | ||
138 | for c in &invalid { | ||
139 | assert_invalid_str(c); | ||
140 | } | ||
141 | } | ||
142 | |||
143 | #[test] | ||
144 | fn test_mixed_invalid() { | ||
145 | assert_invalid_str( | ||
146 | r"This is the tale of a string | ||
147 | with a newline in between, some emoji (👨👨) here and there, | ||
148 | unicode escapes like this: \u{1FFBB} and weird stuff like | ||
149 | this ﷽", | ||
150 | ); | ||
151 | } | ||
152 | |||
153 | #[test] | ||
154 | fn test_mixed_valid() { | ||
155 | assert_valid_str( | ||
156 | r"This is the tale of a string | ||
157 | with a newline in between, no emoji at all, | ||
158 | nor unicode escapes or weird stuff", | ||
159 | ); | ||
160 | } | ||
161 | |||
162 | #[test] | ||
163 | fn test_ignore_newline() { | ||
164 | assert_valid_str( | ||
165 | "Hello \ | ||
166 | World", | ||
167 | ); | ||
168 | } | ||
169 | } | ||
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs deleted file mode 100644 index 0f1885873..000000000 --- a/crates/ra_syntax/src/validation/char.rs +++ /dev/null | |||
@@ -1,273 +0,0 @@ | |||
1 | //! Validation of char literals | ||
2 | |||
3 | use std::u32; | ||
4 | |||
5 | use arrayvec::ArrayString; | ||
6 | |||
7 | use crate::{ | ||
8 | string_lexing::{self, StringComponentKind}, | ||
9 | TextRange, | ||
10 | SyntaxError, | ||
11 | SyntaxErrorKind::*, | ||
12 | SyntaxToken, | ||
13 | }; | ||
14 | |||
15 | pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) { | ||
16 | let literal_text = node.text(); | ||
17 | let literal_range = node.range(); | ||
18 | let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text); | ||
19 | let mut len = 0; | ||
20 | for component in &mut components { | ||
21 | len += 1; | ||
22 | let text = &literal_text[component.range]; | ||
23 | let range = component.range + literal_range.start(); | ||
24 | validate_char_component(text, component.kind, range, errors); | ||
25 | } | ||
26 | |||
27 | if !components.has_closing_quote { | ||
28 | errors.push(SyntaxError::new(UnclosedChar, literal_range)); | ||
29 | } | ||
30 | |||
31 | if let Some(range) = components.suffix { | ||
32 | errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); | ||
33 | } | ||
34 | |||
35 | if len == 0 { | ||
36 | errors.push(SyntaxError::new(EmptyChar, literal_range)); | ||
37 | } | ||
38 | |||
39 | if len > 1 { | ||
40 | errors.push(SyntaxError::new(OverlongChar, literal_range)); | ||
41 | } | ||
42 | } | ||
43 | |||
44 | pub(super) fn validate_char_component( | ||
45 | text: &str, | ||
46 | kind: StringComponentKind, | ||
47 | range: TextRange, | ||
48 | errors: &mut Vec<SyntaxError>, | ||
49 | ) { | ||
50 | // Validate escapes | ||
51 | use self::StringComponentKind::*; | ||
52 | match kind { | ||
53 | AsciiEscape => validate_ascii_escape(text, range, errors), | ||
54 | AsciiCodeEscape => validate_ascii_code_escape(text, range, errors), | ||
55 | UnicodeEscape => validate_unicode_escape(text, range, errors), | ||
56 | CodePoint => { | ||
57 | // These code points must always be escaped | ||
58 | if text == "\t" || text == "\r" || text == "\n" { | ||
59 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
60 | } | ||
61 | } | ||
62 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
63 | } | ||
64 | } | ||
65 | |||
66 | fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
67 | if text.len() == 1 { | ||
68 | // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is treated as an unclosed char containing a single quote `'`) | ||
69 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); | ||
70 | } else { | ||
71 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
72 | if !is_ascii_escape(escape_code) { | ||
73 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); | ||
74 | } | ||
75 | } | ||
76 | } | ||
77 | |||
78 | pub(super) fn is_ascii_escape(code: char) -> bool { | ||
79 | match code { | ||
80 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | ||
81 | _ => false, | ||
82 | } | ||
83 | } | ||
84 | |||
85 | fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
86 | // An AsciiCodeEscape has 4 chars, example: `\xDD` | ||
87 | if !text.is_ascii() { | ||
88 | // FIXME: Give a more precise error message (say what the invalid character was) | ||
89 | errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)); | ||
90 | } else if text.chars().count() < 4 { | ||
91 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); | ||
92 | } else { | ||
93 | assert_eq!( | ||
94 | text.chars().count(), | ||
95 | 4, | ||
96 | "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is", | ||
97 | text, | ||
98 | ); | ||
99 | |||
100 | match u8::from_str_radix(&text[2..], 16) { | ||
101 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
102 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
103 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
104 | } | ||
105 | } | ||
106 | } | ||
107 | |||
108 | fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
109 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); | ||
110 | |||
111 | if text.len() == 2 { | ||
112 | // No starting `{` | ||
113 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | if text.len() == 3 { | ||
118 | // Only starting `{` | ||
119 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
120 | return; | ||
121 | } | ||
122 | |||
123 | let mut code = ArrayString::<[_; 6]>::new(); | ||
124 | let mut closed = false; | ||
125 | for c in text[3..].chars() { | ||
126 | assert!(!closed, "no characters after escape is closed"); | ||
127 | |||
128 | if c.is_digit(16) { | ||
129 | if code.len() == 6 { | ||
130 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
131 | return; | ||
132 | } | ||
133 | |||
134 | code.push(c); | ||
135 | } else if c == '_' { | ||
136 | // Reject leading _ | ||
137 | if code.len() == 0 { | ||
138 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
139 | return; | ||
140 | } | ||
141 | } else if c == '}' { | ||
142 | closed = true; | ||
143 | } else { | ||
144 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
145 | return; | ||
146 | } | ||
147 | } | ||
148 | |||
149 | if !closed { | ||
150 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | ||
151 | } | ||
152 | |||
153 | if code.len() == 0 { | ||
154 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
155 | return; | ||
156 | } | ||
157 | |||
158 | match u32::from_str_radix(&code, 16) { | ||
159 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
160 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
161 | } | ||
162 | Ok(_) => { | ||
163 | // Valid escape code | ||
164 | } | ||
165 | Err(_) => { | ||
166 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
167 | } | ||
168 | } | ||
169 | } | ||
170 | |||
171 | #[cfg(test)] | ||
172 | mod test { | ||
173 | use crate::{SourceFile, TreeArc}; | ||
174 | |||
175 | fn build_file(literal: &str) -> TreeArc<SourceFile> { | ||
176 | let src = format!("const C: char = '{}';", literal); | ||
177 | SourceFile::parse(&src) | ||
178 | } | ||
179 | |||
180 | fn assert_valid_char(literal: &str) { | ||
181 | let file = build_file(literal); | ||
182 | assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); | ||
183 | } | ||
184 | |||
185 | fn assert_invalid_char(literal: &str) { | ||
186 | let file = build_file(literal); | ||
187 | assert!(file.errors().len() > 0); | ||
188 | } | ||
189 | |||
190 | #[test] | ||
191 | fn test_ansi_codepoints() { | ||
192 | for byte in 0..=255u8 { | ||
193 | match byte { | ||
194 | b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()), | ||
195 | b'\'' | b'\\' => { /* Ignore character close and backslash */ } | ||
196 | _ => assert_valid_char(&(byte as char).to_string()), | ||
197 | } | ||
198 | } | ||
199 | } | ||
200 | |||
201 | #[test] | ||
202 | fn test_unicode_codepoints() { | ||
203 | let valid = ["Ƒ", "バ", "メ", "﷽"]; | ||
204 | for c in &valid { | ||
205 | assert_valid_char(c); | ||
206 | } | ||
207 | } | ||
208 | |||
209 | #[test] | ||
210 | fn test_unicode_multiple_codepoints() { | ||
211 | let invalid = ["नी", "👨👨"]; | ||
212 | for c in &invalid { | ||
213 | assert_invalid_char(c); | ||
214 | } | ||
215 | } | ||
216 | |||
217 | #[test] | ||
218 | fn test_valid_ascii_escape() { | ||
219 | let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; | ||
220 | for c in &valid { | ||
221 | assert_valid_char(c); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | #[test] | ||
226 | fn test_invalid_ascii_escape() { | ||
227 | let invalid = [r"\a", r"\?", r"\"]; | ||
228 | for c in &invalid { | ||
229 | assert_invalid_char(c); | ||
230 | } | ||
231 | } | ||
232 | |||
233 | #[test] | ||
234 | fn test_valid_ascii_code_escape() { | ||
235 | let valid = [r"\x00", r"\x7F", r"\x55"]; | ||
236 | for c in &valid { | ||
237 | assert_valid_char(c); | ||
238 | } | ||
239 | } | ||
240 | |||
241 | #[test] | ||
242 | fn test_invalid_ascii_code_escape() { | ||
243 | let invalid = [r"\x", r"\x7", r"\xF0"]; | ||
244 | for c in &invalid { | ||
245 | assert_invalid_char(c); | ||
246 | } | ||
247 | } | ||
248 | |||
249 | #[test] | ||
250 | fn test_valid_unicode_escape() { | ||
251 | let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; | ||
252 | for c in &valid { | ||
253 | assert_valid_char(c); | ||
254 | } | ||
255 | } | ||
256 | |||
257 | #[test] | ||
258 | fn test_invalid_unicode_escape() { | ||
259 | let invalid = [ | ||
260 | r"\u", | ||
261 | r"\u{}", | ||
262 | r"\u{", | ||
263 | r"\u{FF", | ||
264 | r"\u{FFFFFF}", | ||
265 | r"\u{_F}", | ||
266 | r"\u{00FFFFF}", | ||
267 | r"\u{110000}", | ||
268 | ]; | ||
269 | for c in &invalid { | ||
270 | assert_invalid_char(c); | ||
271 | } | ||
272 | } | ||
273 | } | ||
diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs deleted file mode 100644 index fc2f1b992..000000000 --- a/crates/ra_syntax/src/validation/string.rs +++ /dev/null | |||
@@ -1,154 +0,0 @@ | |||
1 | use crate::{ | ||
2 | string_lexing, | ||
3 | SyntaxError, | ||
4 | SyntaxErrorKind::*, | ||
5 | SyntaxToken, | ||
6 | }; | ||
7 | |||
8 | use super::char; | ||
9 | |||
10 | pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) { | ||
11 | let literal_text = node.text(); | ||
12 | let literal_range = node.range(); | ||
13 | let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text); | ||
14 | for component in &mut components { | ||
15 | let range = component.range + literal_range.start(); | ||
16 | |||
17 | // Chars must escape \t, \n and \r codepoints, but strings don't | ||
18 | let text = &literal_text[component.range]; | ||
19 | match text { | ||
20 | "\t" | "\n" | "\r" => { /* always valid */ } | ||
21 | _ => char::validate_char_component(text, component.kind, range, errors), | ||
22 | } | ||
23 | } | ||
24 | |||
25 | if !components.has_closing_quote { | ||
26 | errors.push(SyntaxError::new(UnclosedString, literal_range)); | ||
27 | } | ||
28 | |||
29 | if let Some(range) = components.suffix { | ||
30 | errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); | ||
31 | } | ||
32 | } | ||
33 | |||
34 | #[cfg(test)] | ||
35 | mod test { | ||
36 | use crate::{SourceFile, TreeArc}; | ||
37 | |||
38 | fn build_file(literal: &str) -> TreeArc<SourceFile> { | ||
39 | let src = format!(r#"const S: &'static str = "{}";"#, literal); | ||
40 | println!("Source: {}", src); | ||
41 | SourceFile::parse(&src) | ||
42 | } | ||
43 | |||
44 | fn assert_valid_str(literal: &str) { | ||
45 | let file = build_file(literal); | ||
46 | assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); | ||
47 | } | ||
48 | |||
49 | fn assert_invalid_str(literal: &str) { | ||
50 | let file = build_file(literal); | ||
51 | assert!(file.errors().len() > 0); | ||
52 | } | ||
53 | |||
54 | #[test] | ||
55 | fn test_ansi_codepoints() { | ||
56 | for byte in 0..=255u8 { | ||
57 | match byte { | ||
58 | b'\"' | b'\\' => { /* Ignore string close and backslash */ } | ||
59 | _ => assert_valid_str(&(byte as char).to_string()), | ||
60 | } | ||
61 | } | ||
62 | } | ||
63 | |||
64 | #[test] | ||
65 | fn test_unicode_codepoints() { | ||
66 | let valid = ["Ƒ", "バ", "メ", "﷽"]; | ||
67 | for c in &valid { | ||
68 | assert_valid_str(c); | ||
69 | } | ||
70 | } | ||
71 | |||
72 | #[test] | ||
73 | fn test_unicode_multiple_codepoints() { | ||
74 | let valid = ["नी", "👨👨"]; | ||
75 | for c in &valid { | ||
76 | assert_valid_str(c); | ||
77 | } | ||
78 | } | ||
79 | |||
80 | #[test] | ||
81 | fn test_valid_ascii_escape() { | ||
82 | let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; | ||
83 | for c in &valid { | ||
84 | assert_valid_str(c); | ||
85 | } | ||
86 | } | ||
87 | |||
88 | #[test] | ||
89 | fn test_invalid_ascii_escape() { | ||
90 | let invalid = [r"\a", r"\?", r"\"]; | ||
91 | for c in &invalid { | ||
92 | assert_invalid_str(c); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | #[test] | ||
97 | fn test_valid_ascii_code_escape() { | ||
98 | let valid = [r"\x00", r"\x7F", r"\x55"]; | ||
99 | for c in &valid { | ||
100 | assert_valid_str(c); | ||
101 | } | ||
102 | } | ||
103 | |||
104 | #[test] | ||
105 | fn test_invalid_ascii_code_escape() { | ||
106 | let invalid = [r"\x", r"\x7", r"\xF0"]; | ||
107 | for c in &invalid { | ||
108 | assert_invalid_str(c); | ||
109 | } | ||
110 | } | ||
111 | |||
112 | #[test] | ||
113 | fn test_valid_unicode_escape() { | ||
114 | let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; | ||
115 | for c in &valid { | ||
116 | assert_valid_str(c); | ||
117 | } | ||
118 | } | ||
119 | |||
120 | #[test] | ||
121 | fn test_invalid_unicode_escape() { | ||
122 | let invalid = [ | ||
123 | r"\u", | ||
124 | r"\u{}", | ||
125 | r"\u{", | ||
126 | r"\u{FF", | ||
127 | r"\u{FFFFFF}", | ||
128 | r"\u{_F}", | ||
129 | r"\u{00FFFFF}", | ||
130 | r"\u{110000}", | ||
131 | ]; | ||
132 | for c in &invalid { | ||
133 | assert_invalid_str(c); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | #[test] | ||
138 | fn test_mixed() { | ||
139 | assert_valid_str( | ||
140 | r"This is the tale of a string | ||
141 | with a newline in between, some emoji (👨👨) here and there, | ||
142 | unicode escapes like this: \u{1FFBB} and weird stuff like | ||
143 | this ﷽", | ||
144 | ); | ||
145 | } | ||
146 | |||
147 | #[test] | ||
148 | fn test_ignore_newline() { | ||
149 | assert_valid_str( | ||
150 | "Hello \ | ||
151 | World", | ||
152 | ); | ||
153 | } | ||
154 | } | ||
diff --git a/crates/ra_syntax/src/validation/unescape.rs b/crates/ra_syntax/src/validation/unescape.rs new file mode 100644 index 000000000..2086046b6 --- /dev/null +++ b/crates/ra_syntax/src/validation/unescape.rs | |||
@@ -0,0 +1,521 @@ | |||
1 | //! Utilities for validating string and char literals and turning them into | ||
2 | //! the values they represent. | ||
3 | //! | ||
4 | //! This file is copy-pasted from the compiler | ||
5 | //! | ||
6 | //! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs | ||
7 | //! | ||
8 | //! Hopefully, we'll share this code in a proper way some day | ||
9 | |||
10 | use std::str::Chars; | ||
11 | use std::ops::Range; | ||
12 | |||
13 | #[derive(Debug, PartialEq, Eq, Clone, Hash)] | ||
14 | pub enum EscapeError { | ||
15 | ZeroChars, | ||
16 | MoreThanOneChar, | ||
17 | |||
18 | LoneSlash, | ||
19 | InvalidEscape, | ||
20 | BareCarriageReturn, | ||
21 | EscapeOnlyChar, | ||
22 | |||
23 | TooShortHexEscape, | ||
24 | InvalidCharInHexEscape, | ||
25 | OutOfRangeHexEscape, | ||
26 | |||
27 | NoBraceInUnicodeEscape, | ||
28 | InvalidCharInUnicodeEscape, | ||
29 | EmptyUnicodeEscape, | ||
30 | UnclosedUnicodeEscape, | ||
31 | LeadingUnderscoreUnicodeEscape, | ||
32 | OverlongUnicodeEscape, | ||
33 | LoneSurrogateUnicodeEscape, | ||
34 | OutOfRangeUnicodeEscape, | ||
35 | |||
36 | UnicodeEscapeInByte, | ||
37 | NonAsciiCharInByte, | ||
38 | } | ||
39 | |||
40 | /// Takes the contents of a char literal (without quotes) and returns the | ||
41 | /// unescaped char or an error. | ||
42 | pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> { | ||
43 | let mut chars = literal_text.chars(); | ||
44 | unescape_char_or_byte(&mut chars, Mode::Char) | ||
45 | .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) | ||
46 | } | ||
47 | |||
48 | /// Takes the contents of a string literal (without quotes) and produces a | ||
49 | /// sequence of unescaped characters or errors. | ||
50 | pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F) | ||
51 | where | ||
52 | F: FnMut(Range<usize>, Result<char, EscapeError>), | ||
53 | { | ||
54 | unescape_str_or_byte_str(literal_text, Mode::Str, callback) | ||
55 | } | ||
56 | |||
57 | pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> { | ||
58 | let mut chars = literal_text.chars(); | ||
59 | unescape_char_or_byte(&mut chars, Mode::Byte) | ||
60 | .map(byte_from_char) | ||
61 | .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) | ||
62 | } | ||
63 | |||
64 | /// Takes the contents of a byte string literal (without quotes) and produces a | ||
65 | /// sequence of unescaped bytes or errors. | ||
66 | pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F) | ||
67 | where | ||
68 | F: FnMut(Range<usize>, Result<u8, EscapeError>), | ||
69 | { | ||
70 | unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { | ||
71 | callback(range, char.map(byte_from_char)) | ||
72 | }) | ||
73 | } | ||
74 | |||
75 | #[derive(Debug, Clone, Copy)] | ||
76 | pub(crate) enum Mode { | ||
77 | Char, | ||
78 | Str, | ||
79 | Byte, | ||
80 | ByteStr, | ||
81 | } | ||
82 | |||
83 | impl Mode { | ||
84 | fn in_single_quotes(self) -> bool { | ||
85 | match self { | ||
86 | Mode::Char | Mode::Byte => true, | ||
87 | Mode::Str | Mode::ByteStr => false, | ||
88 | } | ||
89 | } | ||
90 | |||
91 | pub(crate) fn in_double_quotes(self) -> bool { | ||
92 | !self.in_single_quotes() | ||
93 | } | ||
94 | |||
95 | pub(crate) fn is_bytes(self) -> bool { | ||
96 | match self { | ||
97 | Mode::Byte | Mode::ByteStr => true, | ||
98 | Mode::Char | Mode::Str => false, | ||
99 | } | ||
100 | } | ||
101 | } | ||
102 | |||
103 | fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | ||
104 | if first_char != '\\' { | ||
105 | return match first_char { | ||
106 | '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), | ||
107 | '\r' => Err(if chars.clone().next() == Some('\n') { | ||
108 | EscapeError::EscapeOnlyChar | ||
109 | } else { | ||
110 | EscapeError::BareCarriageReturn | ||
111 | }), | ||
112 | '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), | ||
113 | '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), | ||
114 | _ => { | ||
115 | if mode.is_bytes() && !first_char.is_ascii() { | ||
116 | return Err(EscapeError::NonAsciiCharInByte); | ||
117 | } | ||
118 | Ok(first_char) | ||
119 | } | ||
120 | }; | ||
121 | } | ||
122 | |||
123 | let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; | ||
124 | |||
125 | let res = match second_char { | ||
126 | '"' => '"', | ||
127 | 'n' => '\n', | ||
128 | 'r' => '\r', | ||
129 | 't' => '\t', | ||
130 | '\\' => '\\', | ||
131 | '\'' => '\'', | ||
132 | '0' => '\0', | ||
133 | |||
134 | 'x' => { | ||
135 | let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; | ||
136 | let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; | ||
137 | |||
138 | let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; | ||
139 | let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; | ||
140 | |||
141 | let value = hi * 16 + lo; | ||
142 | |||
143 | if !mode.is_bytes() && !is_ascii(value) { | ||
144 | return Err(EscapeError::OutOfRangeHexEscape); | ||
145 | } | ||
146 | let value = value as u8; | ||
147 | |||
148 | value as char | ||
149 | } | ||
150 | |||
151 | 'u' => { | ||
152 | if chars.next() != Some('{') { | ||
153 | return Err(EscapeError::NoBraceInUnicodeEscape); | ||
154 | } | ||
155 | |||
156 | let mut n_digits = 1; | ||
157 | let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { | ||
158 | '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), | ||
159 | '}' => return Err(EscapeError::EmptyUnicodeEscape), | ||
160 | c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, | ||
161 | }; | ||
162 | |||
163 | loop { | ||
164 | match chars.next() { | ||
165 | None => return Err(EscapeError::UnclosedUnicodeEscape), | ||
166 | Some('_') => continue, | ||
167 | Some('}') => { | ||
168 | if n_digits > 6 { | ||
169 | return Err(EscapeError::OverlongUnicodeEscape); | ||
170 | } | ||
171 | if mode.is_bytes() { | ||
172 | return Err(EscapeError::UnicodeEscapeInByte); | ||
173 | } | ||
174 | |||
175 | break std::char::from_u32(value).ok_or_else(|| { | ||
176 | if value > 0x10FFFF { | ||
177 | EscapeError::OutOfRangeUnicodeEscape | ||
178 | } else { | ||
179 | EscapeError::LoneSurrogateUnicodeEscape | ||
180 | } | ||
181 | })?; | ||
182 | } | ||
183 | Some(c) => { | ||
184 | let digit = | ||
185 | c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; | ||
186 | n_digits += 1; | ||
187 | if n_digits > 6 { | ||
188 | continue; | ||
189 | } | ||
190 | let digit = digit as u32; | ||
191 | value = value * 16 + digit; | ||
192 | } | ||
193 | }; | ||
194 | } | ||
195 | } | ||
196 | _ => return Err(EscapeError::InvalidEscape), | ||
197 | }; | ||
198 | Ok(res) | ||
199 | } | ||
200 | |||
201 | fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | ||
202 | let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; | ||
203 | let res = scan_escape(first_char, chars, mode)?; | ||
204 | if chars.next().is_some() { | ||
205 | return Err(EscapeError::MoreThanOneChar); | ||
206 | } | ||
207 | Ok(res) | ||
208 | } | ||
209 | |||
210 | /// Takes the contents of a string or byte string literal (without quotes) and | ||
211 | /// produces a sequence of unescaped characters or errors. | ||
212 | fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F) | ||
213 | where | ||
214 | F: FnMut(Range<usize>, Result<char, EscapeError>), | ||
215 | { | ||
216 | assert!(mode.in_double_quotes()); | ||
217 | let initial_len = src.len(); | ||
218 | let mut chars = src.chars(); | ||
219 | while let Some(first_char) = chars.next() { | ||
220 | let start = initial_len - chars.as_str().len() - first_char.len_utf8(); | ||
221 | |||
222 | let unescaped_char = match first_char { | ||
223 | '\\' => { | ||
224 | let (second_char, third_char) = { | ||
225 | let mut chars = chars.clone(); | ||
226 | (chars.next(), chars.next()) | ||
227 | }; | ||
228 | match (second_char, third_char) { | ||
229 | (Some('\n'), _) | (Some('\r'), Some('\n')) => { | ||
230 | skip_ascii_whitespace(&mut chars); | ||
231 | continue; | ||
232 | } | ||
233 | _ => scan_escape(first_char, &mut chars, mode), | ||
234 | } | ||
235 | } | ||
236 | '\r' => { | ||
237 | let second_char = chars.clone().next(); | ||
238 | if second_char == Some('\n') { | ||
239 | chars.next(); | ||
240 | Ok('\n') | ||
241 | } else { | ||
242 | scan_escape(first_char, &mut chars, mode) | ||
243 | } | ||
244 | } | ||
245 | '\n' => Ok('\n'), | ||
246 | '\t' => Ok('\t'), | ||
247 | _ => scan_escape(first_char, &mut chars, mode), | ||
248 | }; | ||
249 | let end = initial_len - chars.as_str().len(); | ||
250 | callback(start..end, unescaped_char); | ||
251 | } | ||
252 | |||
253 | fn skip_ascii_whitespace(chars: &mut Chars<'_>) { | ||
254 | let str = chars.as_str(); | ||
255 | let first_non_space = str | ||
256 | .bytes() | ||
257 | .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') | ||
258 | .unwrap_or(str.len()); | ||
259 | *chars = str[first_non_space..].chars() | ||
260 | } | ||
261 | } | ||
262 | |||
263 | fn byte_from_char(c: char) -> u8 { | ||
264 | let res = c as u32; | ||
265 | assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte"); | ||
266 | res as u8 | ||
267 | } | ||
268 | |||
269 | fn is_ascii(x: u32) -> bool { | ||
270 | x <= 0x7F | ||
271 | } | ||
272 | |||
273 | #[cfg(test)] | ||
274 | mod tests { | ||
275 | use super::*; | ||
276 | |||
277 | #[test] | ||
278 | fn test_unescape_char_bad() { | ||
279 | fn check(literal_text: &str, expected_error: EscapeError) { | ||
280 | let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err); | ||
281 | assert_eq!(actual_result, Err(expected_error)); | ||
282 | } | ||
283 | |||
284 | check("", EscapeError::ZeroChars); | ||
285 | check(r"\", EscapeError::LoneSlash); | ||
286 | |||
287 | check("\n", EscapeError::EscapeOnlyChar); | ||
288 | check("\r\n", EscapeError::EscapeOnlyChar); | ||
289 | check("\t", EscapeError::EscapeOnlyChar); | ||
290 | check("'", EscapeError::EscapeOnlyChar); | ||
291 | check("\r", EscapeError::BareCarriageReturn); | ||
292 | |||
293 | check("spam", EscapeError::MoreThanOneChar); | ||
294 | check(r"\x0ff", EscapeError::MoreThanOneChar); | ||
295 | check(r#"\"a"#, EscapeError::MoreThanOneChar); | ||
296 | check(r"\na", EscapeError::MoreThanOneChar); | ||
297 | check(r"\ra", EscapeError::MoreThanOneChar); | ||
298 | check(r"\ta", EscapeError::MoreThanOneChar); | ||
299 | check(r"\\a", EscapeError::MoreThanOneChar); | ||
300 | check(r"\'a", EscapeError::MoreThanOneChar); | ||
301 | check(r"\0a", EscapeError::MoreThanOneChar); | ||
302 | check(r"\u{0}x", EscapeError::MoreThanOneChar); | ||
303 | check(r"\u{1F63b}}", EscapeError::MoreThanOneChar); | ||
304 | |||
305 | check(r"\v", EscapeError::InvalidEscape); | ||
306 | check(r"\💩", EscapeError::InvalidEscape); | ||
307 | check(r"\●", EscapeError::InvalidEscape); | ||
308 | |||
309 | check(r"\x", EscapeError::TooShortHexEscape); | ||
310 | check(r"\x0", EscapeError::TooShortHexEscape); | ||
311 | check(r"\xf", EscapeError::TooShortHexEscape); | ||
312 | check(r"\xa", EscapeError::TooShortHexEscape); | ||
313 | check(r"\xx", EscapeError::InvalidCharInHexEscape); | ||
314 | check(r"\xы", EscapeError::InvalidCharInHexEscape); | ||
315 | check(r"\x🦀", EscapeError::InvalidCharInHexEscape); | ||
316 | check(r"\xtt", EscapeError::InvalidCharInHexEscape); | ||
317 | check(r"\xff", EscapeError::OutOfRangeHexEscape); | ||
318 | check(r"\xFF", EscapeError::OutOfRangeHexEscape); | ||
319 | check(r"\x80", EscapeError::OutOfRangeHexEscape); | ||
320 | |||
321 | check(r"\u", EscapeError::NoBraceInUnicodeEscape); | ||
322 | check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); | ||
323 | check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); | ||
324 | check(r"\u{", EscapeError::UnclosedUnicodeEscape); | ||
325 | check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); | ||
326 | check(r"\u{}", EscapeError::EmptyUnicodeEscape); | ||
327 | check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); | ||
328 | check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); | ||
329 | check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape); | ||
330 | check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); | ||
331 | check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); | ||
332 | |||
333 | check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape); | ||
334 | check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape); | ||
335 | check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape); | ||
336 | |||
337 | check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape); | ||
338 | check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape); | ||
339 | check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape); | ||
340 | } | ||
341 | |||
342 | #[test] | ||
343 | fn test_unescape_char_good() { | ||
344 | fn check(literal_text: &str, expected_char: char) { | ||
345 | let actual_result = unescape_char(literal_text); | ||
346 | assert_eq!(actual_result, Ok(expected_char)); | ||
347 | } | ||
348 | |||
349 | check("a", 'a'); | ||
350 | check("ы", 'ы'); | ||
351 | check("🦀", '🦀'); | ||
352 | |||
353 | check(r#"\""#, '"'); | ||
354 | check(r"\n", '\n'); | ||
355 | check(r"\r", '\r'); | ||
356 | check(r"\t", '\t'); | ||
357 | check(r"\\", '\\'); | ||
358 | check(r"\'", '\''); | ||
359 | check(r"\0", '\0'); | ||
360 | |||
361 | check(r"\x00", '\0'); | ||
362 | check(r"\x5a", 'Z'); | ||
363 | check(r"\x5A", 'Z'); | ||
364 | check(r"\x7f", 127 as char); | ||
365 | |||
366 | check(r"\u{0}", '\0'); | ||
367 | check(r"\u{000000}", '\0'); | ||
368 | check(r"\u{41}", 'A'); | ||
369 | check(r"\u{0041}", 'A'); | ||
370 | check(r"\u{00_41}", 'A'); | ||
371 | check(r"\u{4__1__}", 'A'); | ||
372 | check(r"\u{1F63b}", '😻'); | ||
373 | } | ||
374 | |||
375 | #[test] | ||
376 | fn test_unescape_str_good() { | ||
377 | fn check(literal_text: &str, expected: &str) { | ||
378 | let mut buf = Ok(String::with_capacity(literal_text.len())); | ||
379 | unescape_str(literal_text, &mut |range, c| { | ||
380 | if let Ok(b) = &mut buf { | ||
381 | match c { | ||
382 | Ok(c) => b.push(c), | ||
383 | Err(e) => buf = Err((range, e)), | ||
384 | } | ||
385 | } | ||
386 | }); | ||
387 | let buf = buf.as_ref().map(|it| it.as_ref()); | ||
388 | assert_eq!(buf, Ok(expected)) | ||
389 | } | ||
390 | |||
391 | check("foo", "foo"); | ||
392 | check("", ""); | ||
393 | check(" \t\n\r\n", " \t\n\n"); | ||
394 | |||
395 | check("hello \\\n world", "hello world"); | ||
396 | check("hello \\\r\n world", "hello world"); | ||
397 | check("thread's", "thread's") | ||
398 | } | ||
399 | |||
400 | #[test] | ||
401 | fn test_unescape_byte_bad() { | ||
402 | fn check(literal_text: &str, expected_error: EscapeError) { | ||
403 | let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err); | ||
404 | assert_eq!(actual_result, Err(expected_error)); | ||
405 | } | ||
406 | |||
407 | check("", EscapeError::ZeroChars); | ||
408 | check(r"\", EscapeError::LoneSlash); | ||
409 | |||
410 | check("\n", EscapeError::EscapeOnlyChar); | ||
411 | check("\r\n", EscapeError::EscapeOnlyChar); | ||
412 | check("\t", EscapeError::EscapeOnlyChar); | ||
413 | check("'", EscapeError::EscapeOnlyChar); | ||
414 | check("\r", EscapeError::BareCarriageReturn); | ||
415 | |||
416 | check("spam", EscapeError::MoreThanOneChar); | ||
417 | check(r"\x0ff", EscapeError::MoreThanOneChar); | ||
418 | check(r#"\"a"#, EscapeError::MoreThanOneChar); | ||
419 | check(r"\na", EscapeError::MoreThanOneChar); | ||
420 | check(r"\ra", EscapeError::MoreThanOneChar); | ||
421 | check(r"\ta", EscapeError::MoreThanOneChar); | ||
422 | check(r"\\a", EscapeError::MoreThanOneChar); | ||
423 | check(r"\'a", EscapeError::MoreThanOneChar); | ||
424 | check(r"\0a", EscapeError::MoreThanOneChar); | ||
425 | |||
426 | check(r"\v", EscapeError::InvalidEscape); | ||
427 | check(r"\💩", EscapeError::InvalidEscape); | ||
428 | check(r"\●", EscapeError::InvalidEscape); | ||
429 | |||
430 | check(r"\x", EscapeError::TooShortHexEscape); | ||
431 | check(r"\x0", EscapeError::TooShortHexEscape); | ||
432 | check(r"\xa", EscapeError::TooShortHexEscape); | ||
433 | check(r"\xf", EscapeError::TooShortHexEscape); | ||
434 | check(r"\xx", EscapeError::InvalidCharInHexEscape); | ||
435 | check(r"\xы", EscapeError::InvalidCharInHexEscape); | ||
436 | check(r"\x🦀", EscapeError::InvalidCharInHexEscape); | ||
437 | check(r"\xtt", EscapeError::InvalidCharInHexEscape); | ||
438 | |||
439 | check(r"\u", EscapeError::NoBraceInUnicodeEscape); | ||
440 | check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); | ||
441 | check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); | ||
442 | check(r"\u{", EscapeError::UnclosedUnicodeEscape); | ||
443 | check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); | ||
444 | check(r"\u{}", EscapeError::EmptyUnicodeEscape); | ||
445 | check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); | ||
446 | check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); | ||
447 | |||
448 | check("ы", EscapeError::NonAsciiCharInByte); | ||
449 | check("🦀", EscapeError::NonAsciiCharInByte); | ||
450 | |||
451 | check(r"\u{0}", EscapeError::UnicodeEscapeInByte); | ||
452 | check(r"\u{000000}", EscapeError::UnicodeEscapeInByte); | ||
453 | check(r"\u{41}", EscapeError::UnicodeEscapeInByte); | ||
454 | check(r"\u{0041}", EscapeError::UnicodeEscapeInByte); | ||
455 | check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte); | ||
456 | check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte); | ||
457 | check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte); | ||
458 | check(r"\u{0}x", EscapeError::UnicodeEscapeInByte); | ||
459 | check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte); | ||
460 | check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte); | ||
461 | check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); | ||
462 | check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); | ||
463 | check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte); | ||
464 | check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte); | ||
465 | check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte); | ||
466 | check(r"\u{D800}", EscapeError::UnicodeEscapeInByte); | ||
467 | check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte); | ||
468 | check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte); | ||
469 | } | ||
470 | |||
471 | #[test] | ||
472 | fn test_unescape_byte_good() { | ||
473 | fn check(literal_text: &str, expected_byte: u8) { | ||
474 | let actual_result = unescape_byte(literal_text); | ||
475 | assert_eq!(actual_result, Ok(expected_byte)); | ||
476 | } | ||
477 | |||
478 | check("a", b'a'); | ||
479 | |||
480 | check(r#"\""#, b'"'); | ||
481 | check(r"\n", b'\n'); | ||
482 | check(r"\r", b'\r'); | ||
483 | check(r"\t", b'\t'); | ||
484 | check(r"\\", b'\\'); | ||
485 | check(r"\'", b'\''); | ||
486 | check(r"\0", b'\0'); | ||
487 | |||
488 | check(r"\x00", b'\0'); | ||
489 | check(r"\x5a", b'Z'); | ||
490 | check(r"\x5A", b'Z'); | ||
491 | check(r"\x7f", 127); | ||
492 | check(r"\x80", 128); | ||
493 | check(r"\xff", 255); | ||
494 | check(r"\xFF", 255); | ||
495 | } | ||
496 | |||
497 | #[test] | ||
498 | fn test_unescape_byte_str_good() { | ||
499 | fn check(literal_text: &str, expected: &[u8]) { | ||
500 | let mut buf = Ok(Vec::with_capacity(literal_text.len())); | ||
501 | unescape_byte_str(literal_text, &mut |range, c| { | ||
502 | if let Ok(b) = &mut buf { | ||
503 | match c { | ||
504 | Ok(c) => b.push(c), | ||
505 | Err(e) => buf = Err((range, e)), | ||
506 | } | ||
507 | } | ||
508 | }); | ||
509 | let buf = buf.as_ref().map(|it| it.as_ref()); | ||
510 | assert_eq!(buf, Ok(expected)) | ||
511 | } | ||
512 | |||
513 | check("foo", b"foo"); | ||
514 | check("", b""); | ||
515 | check(" \t\n\r\n", b" \t\n\n"); | ||
516 | |||
517 | check("hello \\\n world", b"hello world"); | ||
518 | check("hello \\\r\n world", b"hello world"); | ||
519 | check("thread's", b"thread's") | ||
520 | } | ||
521 | } | ||
diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt index 61a28134a..e0e38d37d 100644 --- a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt +++ b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt | |||
@@ -40,7 +40,6 @@ SOURCE_FILE@[0; 112) | |||
40 | WHITESPACE@[43; 44) " " | 40 | WHITESPACE@[43; 44) " " |
41 | LITERAL@[44; 59) | 41 | LITERAL@[44; 59) |
42 | STRING@[44; 59) "\"string\"invalid" | 42 | STRING@[44; 59) "\"string\"invalid" |
43 | err: `Invalid literal suffix` | ||
44 | SEMI@[59; 60) ";" | 43 | SEMI@[59; 60) ";" |
45 | WHITESPACE@[60; 65) "\n " | 44 | WHITESPACE@[60; 65) "\n " |
46 | LET_STMT@[65; 83) | 45 | LET_STMT@[65; 83) |
@@ -53,7 +52,6 @@ SOURCE_FILE@[0; 112) | |||
53 | WHITESPACE@[72; 73) " " | 52 | WHITESPACE@[72; 73) " " |
54 | LITERAL@[73; 82) | 53 | LITERAL@[73; 82) |
55 | BYTE@[73; 82) "b\'b\'_suff" | 54 | BYTE@[73; 82) "b\'b\'_suff" |
56 | err: `Invalid literal suffix` | ||
57 | SEMI@[82; 83) ";" | 55 | SEMI@[82; 83) ";" |
58 | WHITESPACE@[83; 88) "\n " | 56 | WHITESPACE@[83; 88) "\n " |
59 | LET_STMT@[88; 109) | 57 | LET_STMT@[88; 109) |
@@ -66,7 +64,6 @@ SOURCE_FILE@[0; 112) | |||
66 | WHITESPACE@[95; 96) " " | 64 | WHITESPACE@[95; 96) " " |
67 | LITERAL@[96; 108) | 65 | LITERAL@[96; 108) |
68 | BYTE_STRING@[96; 108) "b\"bs\"invalid" | 66 | BYTE_STRING@[96; 108) "b\"bs\"invalid" |
69 | err: `Invalid literal suffix` | ||
70 | SEMI@[108; 109) ";" | 67 | SEMI@[108; 109) ";" |
71 | WHITESPACE@[109; 110) "\n" | 68 | WHITESPACE@[109; 110) "\n" |
72 | R_CURLY@[110; 111) "}" | 69 | R_CURLY@[110; 111) "}" |