diff options
author | bors[bot] <bors[bot]@users.noreply.github.com> | 2019-05-07 17:43:10 +0100 |
---|---|---|
committer | bors[bot] <bors[bot]@users.noreply.github.com> | 2019-05-07 17:43:10 +0100 |
commit | d3efedb752bb2198796603d8a479a5e3ee472a97 (patch) | |
tree | ca6a4aee6ad4077a869a932a18c6c8d134406f8c /crates/ra_syntax/src/validation/char.rs | |
parent | ef782adc293deb287128f005dbab2038ba3ccdc1 (diff) | |
parent | 313314e14b629ebf50389dbd2d440bda922f6ae7 (diff) |
Merge #1253
1253: Share literal validation logic with compiler r=matklad a=matklad
This is neat: the unescape module is literally what the compiler is using right now:
https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
So, yeah, code sharing via copy-paste!
Co-authored-by: Aleksey Kladov <[email protected]>
Diffstat (limited to 'crates/ra_syntax/src/validation/char.rs')
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 273 |
1 files changed, 0 insertions, 273 deletions
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs deleted file mode 100644 index 0f1885873..000000000 --- a/crates/ra_syntax/src/validation/char.rs +++ /dev/null | |||
@@ -1,273 +0,0 @@ | |||
1 | //! Validation of char literals | ||
2 | |||
3 | use std::u32; | ||
4 | |||
5 | use arrayvec::ArrayString; | ||
6 | |||
7 | use crate::{ | ||
8 | string_lexing::{self, StringComponentKind}, | ||
9 | TextRange, | ||
10 | SyntaxError, | ||
11 | SyntaxErrorKind::*, | ||
12 | SyntaxToken, | ||
13 | }; | ||
14 | |||
15 | pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) { | ||
16 | let literal_text = node.text(); | ||
17 | let literal_range = node.range(); | ||
18 | let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text); | ||
19 | let mut len = 0; | ||
20 | for component in &mut components { | ||
21 | len += 1; | ||
22 | let text = &literal_text[component.range]; | ||
23 | let range = component.range + literal_range.start(); | ||
24 | validate_char_component(text, component.kind, range, errors); | ||
25 | } | ||
26 | |||
27 | if !components.has_closing_quote { | ||
28 | errors.push(SyntaxError::new(UnclosedChar, literal_range)); | ||
29 | } | ||
30 | |||
31 | if let Some(range) = components.suffix { | ||
32 | errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); | ||
33 | } | ||
34 | |||
35 | if len == 0 { | ||
36 | errors.push(SyntaxError::new(EmptyChar, literal_range)); | ||
37 | } | ||
38 | |||
39 | if len > 1 { | ||
40 | errors.push(SyntaxError::new(OverlongChar, literal_range)); | ||
41 | } | ||
42 | } | ||
43 | |||
44 | pub(super) fn validate_char_component( | ||
45 | text: &str, | ||
46 | kind: StringComponentKind, | ||
47 | range: TextRange, | ||
48 | errors: &mut Vec<SyntaxError>, | ||
49 | ) { | ||
50 | // Validate escapes | ||
51 | use self::StringComponentKind::*; | ||
52 | match kind { | ||
53 | AsciiEscape => validate_ascii_escape(text, range, errors), | ||
54 | AsciiCodeEscape => validate_ascii_code_escape(text, range, errors), | ||
55 | UnicodeEscape => validate_unicode_escape(text, range, errors), | ||
56 | CodePoint => { | ||
57 | // These code points must always be escaped | ||
58 | if text == "\t" || text == "\r" || text == "\n" { | ||
59 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
60 | } | ||
61 | } | ||
62 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
63 | } | ||
64 | } | ||
65 | |||
66 | fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
67 | if text.len() == 1 { | ||
68 | // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is treated as an unclosed char containing a single quote `'`) | ||
69 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); | ||
70 | } else { | ||
71 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
72 | if !is_ascii_escape(escape_code) { | ||
73 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); | ||
74 | } | ||
75 | } | ||
76 | } | ||
77 | |||
78 | pub(super) fn is_ascii_escape(code: char) -> bool { | ||
79 | match code { | ||
80 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | ||
81 | _ => false, | ||
82 | } | ||
83 | } | ||
84 | |||
85 | fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
86 | // An AsciiCodeEscape has 4 chars, example: `\xDD` | ||
87 | if !text.is_ascii() { | ||
88 | // FIXME: Give a more precise error message (say what the invalid character was) | ||
89 | errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)); | ||
90 | } else if text.chars().count() < 4 { | ||
91 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); | ||
92 | } else { | ||
93 | assert_eq!( | ||
94 | text.chars().count(), | ||
95 | 4, | ||
96 | "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is", | ||
97 | text, | ||
98 | ); | ||
99 | |||
100 | match u8::from_str_radix(&text[2..], 16) { | ||
101 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
102 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
103 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
104 | } | ||
105 | } | ||
106 | } | ||
107 | |||
108 | fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
109 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); | ||
110 | |||
111 | if text.len() == 2 { | ||
112 | // No starting `{` | ||
113 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | if text.len() == 3 { | ||
118 | // Only starting `{` | ||
119 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
120 | return; | ||
121 | } | ||
122 | |||
123 | let mut code = ArrayString::<[_; 6]>::new(); | ||
124 | let mut closed = false; | ||
125 | for c in text[3..].chars() { | ||
126 | assert!(!closed, "no characters after escape is closed"); | ||
127 | |||
128 | if c.is_digit(16) { | ||
129 | if code.len() == 6 { | ||
130 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
131 | return; | ||
132 | } | ||
133 | |||
134 | code.push(c); | ||
135 | } else if c == '_' { | ||
136 | // Reject leading _ | ||
137 | if code.len() == 0 { | ||
138 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
139 | return; | ||
140 | } | ||
141 | } else if c == '}' { | ||
142 | closed = true; | ||
143 | } else { | ||
144 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
145 | return; | ||
146 | } | ||
147 | } | ||
148 | |||
149 | if !closed { | ||
150 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | ||
151 | } | ||
152 | |||
153 | if code.len() == 0 { | ||
154 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
155 | return; | ||
156 | } | ||
157 | |||
158 | match u32::from_str_radix(&code, 16) { | ||
159 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
160 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
161 | } | ||
162 | Ok(_) => { | ||
163 | // Valid escape code | ||
164 | } | ||
165 | Err(_) => { | ||
166 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
167 | } | ||
168 | } | ||
169 | } | ||
170 | |||
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a source file whose only item is a `char` constant initialised with
    /// `'<literal>'`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        SourceFile::parse(&format!("const C: char = '{}';", literal))
    }

    /// Asserts that a file built around `literal` has no errors.
    fn assert_valid_char(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    /// Asserts that a file built around `literal` has at least one error.
    fn assert_invalid_char(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    /// Runs `assert_valid_char` over every literal, in order.
    fn assert_all_valid(literals: &[&str]) {
        for literal in literals.iter() {
            assert_valid_char(literal);
        }
    }

    /// Runs `assert_invalid_char` over every literal, in order.
    fn assert_all_invalid(literals: &[&str]) {
        for literal in literals.iter() {
            assert_invalid_char(literal);
        }
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\n' | b'\r' | b'\t' => assert_invalid_char(&literal),
                // Ignore character close and backslash
                b'\'' | b'\\' => {}
                _ => assert_valid_char(&literal),
            }
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        assert_all_valid(&["Ƒ", "バ", "メ", "﷽"]);
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        assert_all_invalid(&["नी", "👨👨"]);
    }

    #[test]
    fn test_valid_ascii_escape() {
        assert_all_valid(&[r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]);
    }

    #[test]
    fn test_invalid_ascii_escape() {
        assert_all_invalid(&[r"\a", r"\?", r"\"]);
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        assert_all_valid(&[r"\x00", r"\x7F", r"\x55"]);
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        assert_all_invalid(&[r"\x", r"\x7", r"\xF0"]);
    }

    #[test]
    fn test_valid_unicode_escape() {
        assert_all_valid(&[
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ]);
    }

    #[test]
    fn test_invalid_unicode_escape() {
        assert_all_invalid(&[
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ]);
    }
}