about summary refs log tree commit diff
diff options
context:
space:
mode:
author bors[bot] <bors[bot]@users.noreply.github.com> 2018-11-07 11:09:40 +0000
committer bors[bot] <bors[bot]@users.noreply.github.com> 2018-11-07 11:09:40 +0000
commit 2e2445444abcde9fc2f50c70a8157958f7d5ddd8 (patch)
tree 8c667fc65a65c0e7978add07b72ec532a4b6eb7b
parent a46a07eca33f951b3d445e49dcbff3c53962a8e0 (diff)
parent 433a8061910a388f777b839eb67f2582f91b6c7a (diff)
Merge #207
207: Finish implementing char validation r=aochagavia a=aochagavia

The only things missing right now are good integration tests (and maybe more descriptive error messages).

Co-authored-by: Adolfo Ochagavía <[email protected]>
-rw-r--r-- Cargo.lock 1
-rw-r--r-- crates/ra_syntax/Cargo.toml 1
-rw-r--r-- crates/ra_syntax/src/lexer/ptr.rs 3
-rw-r--r-- crates/ra_syntax/src/lib.rs 1
-rw-r--r-- crates/ra_syntax/src/string_lexing/mod.rs 2
-rw-r--r-- crates/ra_syntax/src/utils.rs 1
-rw-r--r-- crates/ra_syntax/src/validation.rs 213
-rw-r--r-- crates/ra_syntax/src/yellow/syntax_error.rs 26
8 files changed, 235 insertions, 13 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 80fbda23c..c1f773055 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -671,6 +671,7 @@ dependencies = [
671name = "ra_syntax" 671name = "ra_syntax"
672version = "0.1.0" 672version = "0.1.0"
673dependencies = [ 673dependencies = [
674 "arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)",
674 "drop_bomb 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", 675 "drop_bomb 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
675 "itertools 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)", 676 "itertools 0.7.8 (registry+https://github.com/rust-lang/crates.io-index)",
676 "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", 677 "parking_lot 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/crates/ra_syntax/Cargo.toml b/crates/ra_syntax/Cargo.toml
index 97d259570..54ee72386 100644
--- a/crates/ra_syntax/Cargo.toml
+++ b/crates/ra_syntax/Cargo.toml
@@ -8,6 +8,7 @@ description = "Comment and whitespace preserving parser for the Rust langauge"
8repository = "https://github.com/rust-analyzer/rust-analyzer" 8repository = "https://github.com/rust-analyzer/rust-analyzer"
9 9
10[dependencies] 10[dependencies]
11arrayvec = "0.4.7"
11unicode-xid = "0.1.0" 12unicode-xid = "0.1.0"
12itertools = "0.7.8" 13itertools = "0.7.8"
13drop_bomb = "0.1.4" 14drop_bomb = "0.1.4"
diff --git a/crates/ra_syntax/src/lexer/ptr.rs b/crates/ra_syntax/src/lexer/ptr.rs
index 4c291b9c4..7e4df51aa 100644
--- a/crates/ra_syntax/src/lexer/ptr.rs
+++ b/crates/ra_syntax/src/lexer/ptr.rs
@@ -30,8 +30,7 @@ impl<'s> Ptr<'s> {
30 /// Gets the nth character from the current. 30 /// Gets the nth character from the current.
31 /// For example, 0 will return the current token, 1 will return the next, etc. 31 /// For example, 0 will return the current token, 1 will return the next, etc.
32 pub fn nth(&self, n: u32) -> Option<char> { 32 pub fn nth(&self, n: u32) -> Option<char> {
33 let mut chars = self.chars().peekable(); 33 self.chars().nth(n as usize)
34 chars.by_ref().nth(n as usize)
35 } 34 }
36 35
37 /// Checks whether the current character is `c`. 36 /// Checks whether the current character is `c`.
diff --git a/crates/ra_syntax/src/lib.rs b/crates/ra_syntax/src/lib.rs
index aa172ba42..54012b7b6 100644
--- a/crates/ra_syntax/src/lib.rs
+++ b/crates/ra_syntax/src/lib.rs
@@ -20,6 +20,7 @@
20#![allow(missing_docs)] 20#![allow(missing_docs)]
21//#![warn(unreachable_pub)] // rust-lang/rust#47816 21//#![warn(unreachable_pub)] // rust-lang/rust#47816
22 22
23extern crate arrayvec;
23extern crate drop_bomb; 24extern crate drop_bomb;
24extern crate itertools; 25extern crate itertools;
25extern crate parking_lot; 26extern crate parking_lot;
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs
index f0812ff28..cc53e0aba 100644
--- a/crates/ra_syntax/src/string_lexing/mod.rs
+++ b/crates/ra_syntax/src/string_lexing/mod.rs
@@ -219,7 +219,7 @@ mod tests {
219 219
220 #[test] 220 #[test]
221 fn test_unicode_escapes() { 221 fn test_unicode_escapes() {
222 let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", ""]; 222 let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
223 for escape in unicode_escapes { 223 for escape in unicode_escapes {
224 let escape_sequence = format!(r"'\u{}'", escape); 224 let escape_sequence = format!(r"'\u{}'", escape);
225 let component = closed_char_component(&escape_sequence); 225 let component = closed_char_component(&escape_sequence);
diff --git a/crates/ra_syntax/src/utils.rs b/crates/ra_syntax/src/utils.rs
index 288d7edd4..cad9544be 100644
--- a/crates/ra_syntax/src/utils.rs
+++ b/crates/ra_syntax/src/utils.rs
@@ -1,5 +1,6 @@
1use crate::{File, SyntaxKind, SyntaxNodeRef, WalkEvent}; 1use crate::{File, SyntaxKind, SyntaxNodeRef, WalkEvent};
2use std::fmt::Write; 2use std::fmt::Write;
3use std::str;
3 4
4/// Parse a file and create a string representation of the resulting parse tree. 5/// Parse a file and create a string representation of the resulting parse tree.
5pub fn dump_tree(syntax: SyntaxNodeRef) -> String { 6pub fn dump_tree(syntax: SyntaxNodeRef) -> String {
diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs
index 2b26e388d..f345dbd6e 100644
--- a/crates/ra_syntax/src/validation.rs
+++ b/crates/ra_syntax/src/validation.rs
@@ -1,3 +1,7 @@
1use std::u32;
2
3use arrayvec::ArrayString;
4
1use crate::{ 5use crate::{
2 algo::visit::{visitor_ctx, VisitorCtx}, 6 algo::visit::{visitor_ctx, VisitorCtx},
3 ast::{self, AstNode}, 7 ast::{self, AstNode},
@@ -42,18 +46,90 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) {
42 } 46 }
43 } 47 }
44 AsciiCodeEscape => { 48 AsciiCodeEscape => {
45 // TODO: 49 // An AsciiCodeEscape has 4 chars, example: `\xDD`
46 // * First digit is octal 50 if text.len() < 4 {
47 // * Second digit is hex 51 errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
52 } else {
53 assert!(
54 text.chars().count() == 4,
55 "AsciiCodeEscape cannot be longer than 4 chars"
56 );
57
58 match u8::from_str_radix(&text[2..], 16) {
59 Ok(code) if code < 128 => { /* Escape code is valid */ }
60 Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
61 Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
62 }
63 }
48 } 64 }
49 UnicodeEscape => { 65 UnicodeEscape => {
50 // TODO: 66 assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
51 // * Only hex digits or underscores allowed 67
52 // * Max 6 chars 68 if text.len() == 2 {
53 // * Within allowed range (must be at most 10FFFF) 69 // No starting `{`
70 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
71 return;
72 }
73
74 if text.len() == 3 {
75 // Only starting `{`
76 errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
77 return;
78 }
79
80 let mut code = ArrayString::<[_; 6]>::new();
81 let mut closed = false;
82 for c in text[3..].chars() {
83 assert!(!closed, "no characters after escape is closed");
84
85 if c.is_digit(16) {
86 if code.len() == 6 {
87 errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
88 return;
89 }
90
91 code.push(c);
92 } else if c == '_' {
93 // Reject leading _
94 if code.len() == 0 {
95 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
96 return;
97 }
98 } else if c == '}' {
99 closed = true;
100 } else {
101 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
102 return;
103 }
104 }
105
106 if !closed {
107 errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
108 }
109
110 if code.len() == 0 {
111 errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
112 return;
113 }
114
115 match u32::from_str_radix(&code, 16) {
116 Ok(code_u32) if code_u32 > 0x10FFFF => {
117 errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
118 }
119 Ok(_) => {
120 // Valid escape code
121 }
122 Err(_) => {
123 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
124 }
125 }
126 }
127 CodePoint => {
128 // These code points must always be escaped
129 if text == "\t" || text == "\r" {
130 errors.push(SyntaxError::new(UnescapedCodepoint, range));
131 }
54 } 132 }
55 // Code points are always valid
56 CodePoint => (),
57 } 133 }
58 } 134 }
59 135
@@ -72,7 +148,124 @@ fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) {
72 148
73fn is_ascii_escape(code: char) -> bool { 149fn is_ascii_escape(code: char) -> bool {
74 match code { 150 match code {
75 '\'' | '"' | 'n' | 'r' | 't' | '0' => true, 151 '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
76 _ => false, 152 _ => false,
77 } 153 }
78} 154}
155
156#[cfg(test)]
157mod test {
158 use crate::File;
159
160 fn build_file(literal: &str) -> File {
161 let src = format!("const C: char = '{}';", literal);
162 File::parse(&src)
163 }
164
165 fn assert_valid_char(literal: &str) {
166 let file = build_file(literal);
167 assert!(
168 file.errors().len() == 0,
169 "Errors for literal '{}': {:?}",
170 literal,
171 file.errors()
172 );
173 }
174
175 fn assert_invalid_char(literal: &str) {
176 let file = build_file(literal);
177 assert!(file.errors().len() > 0);
178 }
179
180 #[test]
181 fn test_ansi_codepoints() {
182 for byte in 0..=255u8 {
183 match byte {
184 b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
185 b'\'' | b'\\' => { /* Ignore character close and backslash */ }
186 _ => assert_valid_char(&(byte as char).to_string()),
187 }
188 }
189 }
190
191 #[test]
192 fn test_unicode_codepoints() {
193 let valid = ["Ƒ", "バ", "メ", "﷽"];
194 for c in &valid {
195 assert_valid_char(c);
196 }
197 }
198
199 #[test]
200 fn test_unicode_multiple_codepoints() {
201 let invalid = ["नी", "👨‍👨‍"];
202 for c in &invalid {
203 assert_invalid_char(c);
204 }
205 }
206
207 #[test]
208 fn test_valid_ascii_escape() {
209 let valid = [
210 r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
211 ];
212 for c in &valid {
213 assert_valid_char(c);
214 }
215 }
216
217 #[test]
218 fn test_invalid_ascii_escape() {
219 let invalid = [r"\a", r"\?", r"\"];
220 for c in &invalid {
221 assert_invalid_char(c);
222 }
223 }
224
225 #[test]
226 fn test_valid_ascii_code_escape() {
227 let valid = [r"\x00", r"\x7F", r"\x55"];
228 for c in &valid {
229 assert_valid_char(c);
230 }
231 }
232
233 #[test]
234 fn test_invalid_ascii_code_escape() {
235 let invalid = [r"\x", r"\x7", r"\xF0"];
236 for c in &invalid {
237 assert_invalid_char(c);
238 }
239 }
240
241 #[test]
242 fn test_valid_unicode_escape() {
243 let valid = [
244 r"\u{FF}",
245 r"\u{0}",
246 r"\u{F}",
247 r"\u{10FFFF}",
248 r"\u{1_0__FF___FF_____}",
249 ];
250 for c in &valid {
251 assert_valid_char(c);
252 }
253 }
254
255 #[test]
256 fn test_invalid_unicode_escape() {
257 let invalid = [
258 r"\u",
259 r"\u{}",
260 r"\u{",
261 r"\u{FF",
262 r"\u{FFFFFF}",
263 r"\u{_F}",
264 r"\u{00FFFFF}",
265 r"\u{110000}",
266 ];
267 for c in &invalid {
268 assert_invalid_char(c);
269 }
270 }
271}
diff --git a/crates/ra_syntax/src/yellow/syntax_error.rs b/crates/ra_syntax/src/yellow/syntax_error.rs
index f3df6bc15..c524adf39 100644
--- a/crates/ra_syntax/src/yellow/syntax_error.rs
+++ b/crates/ra_syntax/src/yellow/syntax_error.rs
@@ -34,6 +34,10 @@ impl SyntaxError {
34 } 34 }
35 } 35 }
36 36
37 pub fn kind(&self) -> SyntaxErrorKind {
38 self.kind.clone()
39 }
40
37 pub fn location(&self) -> Location { 41 pub fn location(&self) -> Location {
38 self.location.clone() 42 self.location.clone()
39 } 43 }
@@ -64,11 +68,20 @@ impl fmt::Display for SyntaxError {
64#[derive(Debug, Clone, PartialEq, Eq, Hash)] 68#[derive(Debug, Clone, PartialEq, Eq, Hash)]
65pub enum SyntaxErrorKind { 69pub enum SyntaxErrorKind {
66 ParseError(ParseError), 70 ParseError(ParseError),
71 UnescapedCodepoint,
67 EmptyChar, 72 EmptyChar,
68 UnclosedChar, 73 UnclosedChar,
69 LongChar, 74 LongChar,
70 EmptyAsciiEscape, 75 EmptyAsciiEscape,
71 InvalidAsciiEscape, 76 InvalidAsciiEscape,
77 TooShortAsciiCodeEscape,
78 AsciiCodeEscapeOutOfRange,
79 MalformedAsciiCodeEscape,
80 UnclosedUnicodeEscape,
81 MalformedUnicodeEscape,
82 EmptyUnicodeEcape,
83 OverlongUnicodeEscape,
84 UnicodeEscapeOutOfRange,
72} 85}
73 86
74#[derive(Debug, Clone, PartialEq, Eq, Hash)] 87#[derive(Debug, Clone, PartialEq, Eq, Hash)]
@@ -78,11 +91,24 @@ impl fmt::Display for SyntaxErrorKind {
78 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 91 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
79 use self::SyntaxErrorKind::*; 92 use self::SyntaxErrorKind::*;
80 match self { 93 match self {
94 UnescapedCodepoint => write!(f, "This codepoint should always be escaped"),
81 EmptyAsciiEscape => write!(f, "Empty escape sequence"), 95 EmptyAsciiEscape => write!(f, "Empty escape sequence"),
82 InvalidAsciiEscape => write!(f, "Invalid escape sequence"), 96 InvalidAsciiEscape => write!(f, "Invalid escape sequence"),
83 EmptyChar => write!(f, "Empty char literal"), 97 EmptyChar => write!(f, "Empty char literal"),
84 UnclosedChar => write!(f, "Unclosed char literal"), 98 UnclosedChar => write!(f, "Unclosed char literal"),
85 LongChar => write!(f, "Char literal should be one character long"), 99 LongChar => write!(f, "Char literal should be one character long"),
100 TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"),
101 AsciiCodeEscapeOutOfRange => {
102 write!(f, "Escape sequence should be between \\x00 and \\x7F")
103 }
104 MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
105 UnclosedUnicodeEscape => write!(f, "Missing `}}`"),
106 MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"),
107 EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"),
108 OverlongUnicodeEscape => {
109 write!(f, "Unicode escape sequence should have at most 6 digits")
110 }
111 UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"),
86 ParseError(msg) => write!(f, "{}", msg.0), 112 ParseError(msg) => write!(f, "{}", msg.0),
87 } 113 }
88 } 114 }