diff options
author | bors[bot] <bors[bot]@users.noreply.github.com> | 2018-11-07 11:09:40 +0000 |
---|---|---|
committer | bors[bot] <bors[bot]@users.noreply.github.com> | 2018-11-07 11:09:40 +0000 |
commit | 2e2445444abcde9fc2f50c70a8157958f7d5ddd8 (patch) | |
tree | 8c667fc65a65c0e7978add07b72ec532a4b6eb7b | |
parent | a46a07eca33f951b3d445e49dcbff3c53962a8e0 (diff) | |
parent | 433a8061910a388f777b839eb67f2582f91b6c7a (diff) |
Merge #207
207: Finish implementing char validation r=aochagavia a=aochagavia
The only things missing right now are good integration tests (and maybe more descriptive error messages).
Co-authored-by: Adolfo Ochagavía <[email protected]>
-rw-r--r-- | Cargo.lock | 1 | ||||
-rw-r--r-- | crates/ra_syntax/Cargo.toml | 1 | ||||
-rw-r--r-- | crates/ra_syntax/src/lexer/ptr.rs | 3 | ||||
-rw-r--r-- | crates/ra_syntax/src/lib.rs | 1 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/mod.rs | 2 | ||||
-rw-r--r-- | crates/ra_syntax/src/utils.rs | 1 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation.rs | 213 | ||||
-rw-r--r-- | crates/ra_syntax/src/yellow/syntax_error.rs | 26 |
8 files changed, 235 insertions, 13 deletions
diff --git a/Cargo.lock b/Cargo.lock index 80fbda23c..c1f773055 100644 --- a/Cargo.lock +++ b/Cargo.lock | |||
@@ -671,6 +671,7 @@ dependencies = [ | |||
671 | name = "ra_syntax" | 671 | name = "ra_syntax" |
672 | version = "0.1.0" | 672 | version = "0.1.0" |
673 | dependencies = [ | 673 | dependencies = [ |
674 | "arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)", | ||
674 | "drop_bomb 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", | 675 | "drop_bomb 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", |
675 | "itertools 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)", | 676 | "itertools 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)", |
676 | "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", | 677 | "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", |
diff --git a/crates/ra_syntax/Cargo.toml b/crates/ra_syntax/Cargo.toml index 97d259570..54ee72386 100644 --- a/crates/ra_syntax/Cargo.toml +++ b/crates/ra_syntax/Cargo.toml | |||
@@ -8,6 +8,7 @@ description = "Comment and whitespace preserving parser for the Rust langauge" | |||
8 | repository = "https://github.com/rust-analyzer/rust-analyzer" | 8 | repository = "https://github.com/rust-analyzer/rust-analyzer" |
9 | 9 | ||
10 | [dependencies] | 10 | [dependencies] |
11 | arrayvec = "0.4.7" | ||
11 | unicode-xid = "0.1.0" | 12 | unicode-xid = "0.1.0" |
12 | itertools = "0.7.8" | 13 | itertools = "0.7.8" |
13 | drop_bomb = "0.1.4" | 14 | drop_bomb = "0.1.4" |
diff --git a/crates/ra_syntax/src/lexer/ptr.rs b/crates/ra_syntax/src/lexer/ptr.rs index 4c291b9c4..7e4df51aa 100644 --- a/crates/ra_syntax/src/lexer/ptr.rs +++ b/crates/ra_syntax/src/lexer/ptr.rs | |||
@@ -30,8 +30,7 @@ impl<'s> Ptr<'s> { | |||
30 | /// Gets the nth character from the current. | 30 | /// Gets the nth character from the current. |
31 | /// For example, 0 will return the current token, 1 will return the next, etc. | 31 | /// For example, 0 will return the current token, 1 will return the next, etc. |
32 | pub fn nth(&self, n: u32) -> Option<char> { | 32 | pub fn nth(&self, n: u32) -> Option<char> { |
33 | let mut chars = self.chars().peekable(); | 33 | self.chars().nth(n as usize) |
34 | chars.by_ref().nth(n as usize) | ||
35 | } | 34 | } |
36 | 35 | ||
37 | /// Checks whether the current character is `c`. | 36 | /// Checks whether the current character is `c`. |
diff --git a/crates/ra_syntax/src/lib.rs b/crates/ra_syntax/src/lib.rs index aa172ba42..54012b7b6 100644 --- a/crates/ra_syntax/src/lib.rs +++ b/crates/ra_syntax/src/lib.rs | |||
@@ -20,6 +20,7 @@ | |||
20 | #![allow(missing_docs)] | 20 | #![allow(missing_docs)] |
21 | //#![warn(unreachable_pub)] // rust-lang/rust#47816 | 21 | //#![warn(unreachable_pub)] // rust-lang/rust#47816 |
22 | 22 | ||
23 | extern crate arrayvec; | ||
23 | extern crate drop_bomb; | 24 | extern crate drop_bomb; |
24 | extern crate itertools; | 25 | extern crate itertools; |
25 | extern crate parking_lot; | 26 | extern crate parking_lot; |
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs index f0812ff28..cc53e0aba 100644 --- a/crates/ra_syntax/src/string_lexing/mod.rs +++ b/crates/ra_syntax/src/string_lexing/mod.rs | |||
@@ -219,7 +219,7 @@ mod tests { | |||
219 | 219 | ||
220 | #[test] | 220 | #[test] |
221 | fn test_unicode_escapes() { | 221 | fn test_unicode_escapes() { |
222 | let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""]; | 222 | let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; |
223 | for escape in unicode_escapes { | 223 | for escape in unicode_escapes { |
224 | let escape_sequence = format!(r"'\u{}'", escape); | 224 | let escape_sequence = format!(r"'\u{}'", escape); |
225 | let component = closed_char_component(&escape_sequence); | 225 | let component = closed_char_component(&escape_sequence); |
diff --git a/crates/ra_syntax/src/utils.rs b/crates/ra_syntax/src/utils.rs index 288d7edd4..cad9544be 100644 --- a/crates/ra_syntax/src/utils.rs +++ b/crates/ra_syntax/src/utils.rs | |||
@@ -1,5 +1,6 @@ | |||
1 | use crate::{File, SyntaxKind, SyntaxNodeRef, WalkEvent}; | 1 | use crate::{File, SyntaxKind, SyntaxNodeRef, WalkEvent}; |
2 | use std::fmt::Write; | 2 | use std::fmt::Write; |
3 | use std::str; | ||
3 | 4 | ||
4 | /// Parse a file and create a string representation of the resulting parse tree. | 5 | /// Parse a file and create a string representation of the resulting parse tree. |
5 | pub fn dump_tree(syntax: SyntaxNodeRef) -> String { | 6 | pub fn dump_tree(syntax: SyntaxNodeRef) -> String { |
diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs index 2b26e388d..f345dbd6e 100644 --- a/crates/ra_syntax/src/validation.rs +++ b/crates/ra_syntax/src/validation.rs | |||
@@ -1,3 +1,7 @@ | |||
1 | use std::u32; | ||
2 | |||
3 | use arrayvec::ArrayString; | ||
4 | |||
1 | use crate::{ | 5 | use crate::{ |
2 | algo::visit::{visitor_ctx, VisitorCtx}, | 6 | algo::visit::{visitor_ctx, VisitorCtx}, |
3 | ast::{self, AstNode}, | 7 | ast::{self, AstNode}, |
@@ -42,18 +46,90 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) { | |||
42 | } | 46 | } |
43 | } | 47 | } |
44 | AsciiCodeEscape => { | 48 | AsciiCodeEscape => { |
45 | // TODO: | 49 | // An AsciiCodeEscape has 4 chars, example: `\xDD` |
46 | // * First digit is octal | 50 | if text.len() < 4 { |
47 | // * Second digit is hex | 51 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); |
52 | } else { | ||
53 | assert!( | ||
54 | text.chars().count() == 4, | ||
55 | "AsciiCodeEscape cannot be longer than 4 chars" | ||
56 | ); | ||
57 | |||
58 | match u8::from_str_radix(&text[2..], 16) { | ||
59 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
60 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
61 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
62 | } | ||
63 | } | ||
48 | } | 64 | } |
49 | UnicodeEscape => { | 65 | UnicodeEscape => { |
50 | // TODO: | 66 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); |
51 | // * Only hex digits or underscores allowed | 67 | |
52 | // * Max 6 chars | 68 | if text.len() == 2 { |
53 | // * Within allowed range (must be at most 10FFFF) | 69 | // No starting `{` |
70 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
71 | return; | ||
72 | } | ||
73 | |||
74 | if text.len() == 3 { | ||
75 | // Only starting `{` | ||
76 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
77 | return; | ||
78 | } | ||
79 | |||
80 | let mut code = ArrayString::<[_; 6]>::new(); | ||
81 | let mut closed = false; | ||
82 | for c in text[3..].chars() { | ||
83 | assert!(!closed, "no characters after escape is closed"); | ||
84 | |||
85 | if c.is_digit(16) { | ||
86 | if code.len() == 6 { | ||
87 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
88 | return; | ||
89 | } | ||
90 | |||
91 | code.push(c); | ||
92 | } else if c == '_' { | ||
93 | // Reject leading _ | ||
94 | if code.len() == 0 { | ||
95 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
96 | return; | ||
97 | } | ||
98 | } else if c == '}' { | ||
99 | closed = true; | ||
100 | } else { | ||
101 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
102 | return; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | if !closed { | ||
107 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | ||
108 | } | ||
109 | |||
110 | if code.len() == 0 { | ||
111 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | match u32::from_str_radix(&code, 16) { | ||
116 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
117 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
118 | } | ||
119 | Ok(_) => { | ||
120 | // Valid escape code | ||
121 | } | ||
122 | Err(_) => { | ||
123 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
124 | } | ||
125 | } | ||
126 | } | ||
127 | CodePoint => { | ||
128 | // These code points must always be escaped | ||
129 | if text == "\t" || text == "\r" { | ||
130 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
131 | } | ||
54 | } | 132 | } |
55 | // Code points are always valid | ||
56 | CodePoint => (), | ||
57 | } | 133 | } |
58 | } | 134 | } |
59 | 135 | ||
@@ -72,7 +148,124 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) { | |||
72 | 148 | ||
73 | fn is_ascii_escape(code: char) -> bool { | 149 | fn is_ascii_escape(code: char) -> bool { |
74 | match code { | 150 | match code { |
75 | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | 151 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, |
76 | _ => false, | 152 | _ => false, |
77 | } | 153 | } |
78 | } | 154 | } |
155 | |||
156 | #[cfg(test)] | ||
157 | mod test { | ||
158 | use crate::File; | ||
159 | |||
160 | fn build_file(literal: &str) -> File { | ||
161 | let src = format!("const C: char = '{}';", literal); | ||
162 | File::parse(&src) | ||
163 | } | ||
164 | |||
165 | fn assert_valid_char(literal: &str) { | ||
166 | let file = build_file(literal); | ||
167 | assert!( | ||
168 | file.errors().len() == 0, | ||
169 | "Errors for literal '{}': {:?}", | ||
170 | literal, | ||
171 | file.errors() | ||
172 | ); | ||
173 | } | ||
174 | |||
175 | fn assert_invalid_char(literal: &str) { | ||
176 | let file = build_file(literal); | ||
177 | assert!(file.errors().len() > 0); | ||
178 | } | ||
179 | |||
180 | #[test] | ||
181 | fn test_ansi_codepoints() { | ||
182 | for byte in 0..=255u8 { | ||
183 | match byte { | ||
184 | b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()), | ||
185 | b'\'' | b'\\' => { /* Ignore character close and backslash */ } | ||
186 | _ => assert_valid_char(&(byte as char).to_string()), | ||
187 | } | ||
188 | } | ||
189 | } | ||
190 | |||
191 | #[test] | ||
192 | fn test_unicode_codepoints() { | ||
193 | let valid = ["Ƒ", "バ", "メ", "﷽"]; | ||
194 | for c in &valid { | ||
195 | assert_valid_char(c); | ||
196 | } | ||
197 | } | ||
198 | |||
199 | #[test] | ||
200 | fn test_unicode_multiple_codepoints() { | ||
201 | let invalid = ["नी", "👨👨"]; | ||
202 | for c in &invalid { | ||
203 | assert_invalid_char(c); | ||
204 | } | ||
205 | } | ||
206 | |||
207 | #[test] | ||
208 | fn test_valid_ascii_escape() { | ||
209 | let valid = [ | ||
210 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", | ||
211 | ]; | ||
212 | for c in &valid { | ||
213 | assert_valid_char(c); | ||
214 | } | ||
215 | } | ||
216 | |||
217 | #[test] | ||
218 | fn test_invalid_ascii_escape() { | ||
219 | let invalid = [r"\a", r"\?", r"\"]; | ||
220 | for c in &invalid { | ||
221 | assert_invalid_char(c); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | #[test] | ||
226 | fn test_valid_ascii_code_escape() { | ||
227 | let valid = [r"\x00", r"\x7F", r"\x55"]; | ||
228 | for c in &valid { | ||
229 | assert_valid_char(c); | ||
230 | } | ||
231 | } | ||
232 | |||
233 | #[test] | ||
234 | fn test_invalid_ascii_code_escape() { | ||
235 | let invalid = [r"\x", r"\x7", r"\xF0"]; | ||
236 | for c in &invalid { | ||
237 | assert_invalid_char(c); | ||
238 | } | ||
239 | } | ||
240 | |||
241 | #[test] | ||
242 | fn test_valid_unicode_escape() { | ||
243 | let valid = [ | ||
244 | r"\u{FF}", | ||
245 | r"\u{0}", | ||
246 | r"\u{F}", | ||
247 | r"\u{10FFFF}", | ||
248 | r"\u{1_0__FF___FF_____}", | ||
249 | ]; | ||
250 | for c in &valid { | ||
251 | assert_valid_char(c); | ||
252 | } | ||
253 | } | ||
254 | |||
255 | #[test] | ||
256 | fn test_invalid_unicode_escape() { | ||
257 | let invalid = [ | ||
258 | r"\u", | ||
259 | r"\u{}", | ||
260 | r"\u{", | ||
261 | r"\u{FF", | ||
262 | r"\u{FFFFFF}", | ||
263 | r"\u{_F}", | ||
264 | r"\u{00FFFFF}", | ||
265 | r"\u{110000}", | ||
266 | ]; | ||
267 | for c in &invalid { | ||
268 | assert_invalid_char(c); | ||
269 | } | ||
270 | } | ||
271 | } | ||
diff --git a/crates/ra_syntax/src/yellow/syntax_error.rs b/crates/ra_syntax/src/yellow/syntax_error.rs index f3df6bc15..c524adf39 100644 --- a/crates/ra_syntax/src/yellow/syntax_error.rs +++ b/crates/ra_syntax/src/yellow/syntax_error.rs | |||
@@ -34,6 +34,10 @@ impl SyntaxError { | |||
34 | } | 34 | } |
35 | } | 35 | } |
36 | 36 | ||
37 | pub fn kind(&self) -> SyntaxErrorKind { | ||
38 | self.kind.clone() | ||
39 | } | ||
40 | |||
37 | pub fn location(&self) -> Location { | 41 | pub fn location(&self) -> Location { |
38 | self.location.clone() | 42 | self.location.clone() |
39 | } | 43 | } |
@@ -64,11 +68,20 @@ impl fmt::Display for SyntaxError { | |||
64 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] | 68 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] |
65 | pub enum SyntaxErrorKind { | 69 | pub enum SyntaxErrorKind { |
66 | ParseError(ParseError), | 70 | ParseError(ParseError), |
71 | UnescapedCodepoint, | ||
67 | EmptyChar, | 72 | EmptyChar, |
68 | UnclosedChar, | 73 | UnclosedChar, |
69 | LongChar, | 74 | LongChar, |
70 | EmptyAsciiEscape, | 75 | EmptyAsciiEscape, |
71 | InvalidAsciiEscape, | 76 | InvalidAsciiEscape, |
77 | TooShortAsciiCodeEscape, | ||
78 | AsciiCodeEscapeOutOfRange, | ||
79 | MalformedAsciiCodeEscape, | ||
80 | UnclosedUnicodeEscape, | ||
81 | MalformedUnicodeEscape, | ||
82 | EmptyUnicodeEcape, | ||
83 | OverlongUnicodeEscape, | ||
84 | UnicodeEscapeOutOfRange, | ||
72 | } | 85 | } |
73 | 86 | ||
74 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] | 87 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] |
@@ -78,11 +91,24 @@ impl fmt::Display for SyntaxErrorKind { | |||
78 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | 91 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
79 | use self::SyntaxErrorKind::*; | 92 | use self::SyntaxErrorKind::*; |
80 | match self { | 93 | match self { |
94 | UnescapedCodepoint => write!(f, "This codepoint should always be escaped"), | ||
81 | EmptyAsciiEscape => write!(f, "Empty escape sequence"), | 95 | EmptyAsciiEscape => write!(f, "Empty escape sequence"), |
82 | InvalidAsciiEscape => write!(f, "Invalid escape sequence"), | 96 | InvalidAsciiEscape => write!(f, "Invalid escape sequence"), |
83 | EmptyChar => write!(f, "Empty char literal"), | 97 | EmptyChar => write!(f, "Empty char literal"), |
84 | UnclosedChar => write!(f, "Unclosed char literal"), | 98 | UnclosedChar => write!(f, "Unclosed char literal"), |
85 | LongChar => write!(f, "Char literal should be one character long"), | 99 | LongChar => write!(f, "Char literal should be one character long"), |
100 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), | ||
101 | AsciiCodeEscapeOutOfRange => { | ||
102 | write!(f, "Escape sequence should be between \\x00 and \\x7F") | ||
103 | } | ||
104 | MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), | ||
105 | UnclosedUnicodeEscape => write!(f, "Missing `}}`"), | ||
106 | MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"), | ||
107 | EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"), | ||
108 | OverlongUnicodeEscape => { | ||
109 | write!(f, "Unicode escape sequence should have at most 6 digits") | ||
110 | } | ||
111 | UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"), | ||
86 | ParseError(msg) => write!(f, "{}", msg.0), | 112 | ParseError(msg) => write!(f, "{}", msg.0), |
87 | } | 113 | } |
88 | } | 114 | } |