diff options
author | Adolfo Ochagavía <[email protected]> | 2018-11-11 19:27:00 +0000 |
---|---|---|
committer | Adolfo Ochagavía <[email protected]> | 2018-11-11 19:27:00 +0000 |
commit | c258b4fdb0e421813330c2428985c4537c787582 (patch) | |
tree | e53263f28c0cd07911a1e9c9ef6538c8ff0227fd /crates/ra_syntax/src/validation | |
parent | a4f7d7a7cd85a5b9b64a935dd84ad493b6860236 (diff) |
Add validator for byte
Diffstat (limited to 'crates/ra_syntax/src/validation')
-rw-r--r-- | crates/ra_syntax/src/validation/byte.rs | 202 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 188 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/mod.rs | 2 |
3 files changed, 302 insertions, 90 deletions
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs new file mode 100644 index 000000000..3d2806c4e --- /dev/null +++ b/crates/ra_syntax/src/validation/byte.rs | |||
@@ -0,0 +1,202 @@ | |||
1 | //! Validation of byte literals | ||
2 | |||
3 | use crate::{ | ||
4 | ast::{self, AstNode}, | ||
5 | string_lexing::{self, CharComponentKind}, | ||
6 | TextRange, | ||
7 | validation::char, | ||
8 | yellow::{ | ||
9 | SyntaxError, | ||
10 | SyntaxErrorKind::*, | ||
11 | }, | ||
12 | }; | ||
13 | |||
14 | pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec<SyntaxError>) { | ||
15 | let literal_text = node.text(); | ||
16 | let literal_range = node.syntax().range(); | ||
17 | let mut components = string_lexing::parse_byte_literal(literal_text); | ||
18 | let mut len = 0; | ||
19 | for component in &mut components { | ||
20 | len += 1; | ||
21 | let text = &literal_text[component.range]; | ||
22 | let range = component.range + literal_range.start(); | ||
23 | |||
24 | use self::CharComponentKind::*; | ||
25 | match component.kind { | ||
26 | AsciiEscape => validate_byte_escape(text, range, errors), | ||
27 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), | ||
28 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), | ||
29 | CodePoint => { | ||
30 | let c = text.chars().next().expect("Code points should be one character long"); | ||
31 | |||
32 | // These bytes must always be escaped | ||
33 | if c == '\t' || c == '\r' || c == '\n' { | ||
34 | errors.push(SyntaxError::new(UnescapedByte, range)); | ||
35 | } | ||
36 | |||
37 | // Only ASCII bytes are allowed | ||
38 | if c > 0x7F as char { | ||
39 | errors.push(SyntaxError::new(ByteOutOfRange, range)); | ||
40 | } | ||
41 | } | ||
42 | } | ||
43 | } | ||
44 | |||
45 | if !components.has_closing_quote { | ||
46 | errors.push(SyntaxError::new(UnclosedByte, literal_range)); | ||
47 | } | ||
48 | |||
49 | if len == 0 { | ||
50 | errors.push(SyntaxError::new(EmptyByte, literal_range)); | ||
51 | } | ||
52 | |||
53 | if len > 1 { | ||
54 | errors.push(SyntaxError::new(OverlongByte, literal_range)); | ||
55 | } | ||
56 | } | ||
57 | |||
58 | fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
59 | if text.len() == 1 { | ||
60 | // Escape sequence consists only of leading `\` | ||
61 | errors.push(SyntaxError::new(EmptyByteEscape, range)); | ||
62 | } else { | ||
63 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
64 | if !char::is_ascii_escape(escape_code) { | ||
65 | errors.push(SyntaxError::new(InvalidByteEscape, range)); | ||
66 | } | ||
67 | } | ||
68 | } | ||
69 | |||
70 | fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
71 | // A ByteCodeEscape has 4 chars, example: `\xDD` | ||
72 | if text.len() < 4 { | ||
73 | errors.push(SyntaxError::new(TooShortByteCodeEscape, range)); | ||
74 | } else { | ||
75 | assert!( | ||
76 | text.chars().count() == 4, | ||
77 | "ByteCodeEscape cannot be longer than 4 chars" | ||
78 | ); | ||
79 | |||
80 | if u8::from_str_radix(&text[2..], 16).is_err() { | ||
81 | errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); | ||
82 | } | ||
83 | } | ||
84 | } | ||
85 | |||
#[cfg(test)]
mod test {
    use crate::SourceFileNode;

    /// Parses a small source file whose only item is a `u8` constant initialised with
    /// the byte literal `b'<literal>'`.
    fn build_file(literal: &str) -> SourceFileNode {
        let src = format!("const C: u8 = b'{}';", literal);
        SourceFileNode::parse(&src)
    }

    /// Asserts that the file built around `literal` has no errors.
    fn assert_valid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(
            file.errors().is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            file.errors()
        );
    }

    /// Asserts that the file built around `literal` has at least one error.
    ///
    /// The failure message names the literal, since the helper is mostly called from
    /// loops where a bare assertion failure would not say which input was accepted.
    fn assert_invalid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(
            !file.errors().is_empty(),
            "Expected errors for literal '{}', but found none",
            literal
        );
    }

    /// Every ASCII code point is accepted unescaped except tab, CR and LF; code points
    /// from 128 to 255 are rejected.
    #[test]
    fn test_ascii_codepoints() {
        for byte in 0..128 {
            match byte {
                b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()),
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                _ => assert_valid_byte(&(byte as char).to_string()),
            }
        }

        for byte in 128..=255u8 {
            assert_invalid_byte(&(byte as char).to_string());
        }
    }

    /// Single non-ASCII code points are rejected.
    #[test]
    fn test_unicode_codepoints() {
        let invalid = ["Ƒ", "バ", "メ", "﷽"];
        for c in &invalid {
            assert_invalid_byte(c);
        }
    }

    /// Sequences of several non-ASCII code points are rejected.
    #[test]
    fn test_unicode_multiple_codepoints() {
        let invalid = ["नी", "👨👨"];
        for c in &invalid {
            assert_invalid_byte(c);
        }
    }

    /// The supported backslash escapes, and a few plain characters, are accepted.
    #[test]
    fn test_valid_byte_escape() {
        let valid = [
            r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
        ];
        for c in &valid {
            assert_valid_byte(c);
        }
    }

    /// Unknown escapes and a lone backslash are rejected.
    #[test]
    fn test_invalid_byte_escape() {
        let invalid = [r"\a", r"\?", r"\"];
        for c in &invalid {
            assert_invalid_byte(c);
        }
    }

    /// Two-digit hexadecimal escapes are accepted over the whole byte range.
    #[test]
    fn test_valid_byte_code_escape() {
        let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
        for c in &valid {
            assert_valid_byte(c);
        }
    }

    /// Hexadecimal escapes with fewer than two digits are rejected.
    #[test]
    fn test_invalid_byte_code_escape() {
        let invalid = [r"\x", r"\x7"];
        for c in &invalid {
            assert_invalid_byte(c);
        }
    }

    /// Unicode escapes are rejected in byte literals, whether or not the escape itself
    /// is well formed.
    #[test]
    fn test_invalid_unicode_escape() {
        let well_formed = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        for c in &well_formed {
            assert_invalid_byte(c);
        }

        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        for c in &invalid {
            assert_invalid_byte(c);
        }
    }
}
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 63f9bad24..793539b3a 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs | |||
@@ -1,3 +1,5 @@ | |||
1 | //! Validation of char literals | ||
2 | |||
1 | use std::u32; | 3 | use std::u32; |
2 | 4 | ||
3 | use arrayvec::ArrayString; | 5 | use arrayvec::ArrayString; |
@@ -12,7 +14,7 @@ use crate::{ | |||
12 | }, | 14 | }, |
13 | }; | 15 | }; |
14 | 16 | ||
15 | pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) { | 17 | pub(super) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) { |
16 | let literal_text = node.text(); | 18 | let literal_text = node.text(); |
17 | let literal_range = node.syntax().range(); | 19 | let literal_range = node.syntax().range(); |
18 | let mut components = string_lexing::parse_char_literal(literal_text); | 20 | let mut components = string_lexing::parse_char_literal(literal_text); |
@@ -37,7 +39,7 @@ pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) | |||
37 | } | 39 | } |
38 | } | 40 | } |
39 | 41 | ||
40 | pub(crate) fn validate_char_component( | 42 | pub(super) fn validate_char_component( |
41 | text: &str, | 43 | text: &str, |
42 | kind: CharComponentKind, | 44 | kind: CharComponentKind, |
43 | range: TextRange, | 45 | range: TextRange, |
@@ -46,109 +48,115 @@ pub(crate) fn validate_char_component( | |||
46 | // Validate escapes | 48 | // Validate escapes |
47 | use self::CharComponentKind::*; | 49 | use self::CharComponentKind::*; |
48 | match kind { | 50 | match kind { |
49 | AsciiEscape => { | 51 | AsciiEscape => validate_ascii_escape(text, range, errors), |
50 | if text.len() == 1 { | 52 | AsciiCodeEscape => validate_ascii_code_escape(text, range, errors), |
51 | // Escape sequence consists only of leading `\` | 53 | UnicodeEscape => validate_unicode_escape(text, range, errors), |
52 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); | 54 | CodePoint => { |
53 | } else { | 55 | // These code points must always be escaped |
54 | let escape_code = text.chars().skip(1).next().unwrap(); | 56 | if text == "\t" || text == "\r" || text == "\n" { |
55 | if !is_ascii_escape(escape_code) { | 57 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); |
56 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); | ||
57 | } | ||
58 | } | 58 | } |
59 | } | 59 | } |
60 | AsciiCodeEscape => { | 60 | } |
61 | // An AsciiCodeEscape has 4 chars, example: `\xDD` | 61 | } |
62 | if text.len() < 4 { | 62 | |
63 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); | 63 | fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { |
64 | } else { | 64 | if text.len() == 1 { |
65 | assert!( | 65 | // Escape sequence consists only of leading `\` |
66 | text.chars().count() == 4, | 66 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); |
67 | "AsciiCodeEscape cannot be longer than 4 chars" | 67 | } else { |
68 | ); | 68 | let escape_code = text.chars().skip(1).next().unwrap(); |
69 | 69 | if !is_ascii_escape(escape_code) { | |
70 | match u8::from_str_radix(&text[2..], 16) { | 70 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); |
71 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
72 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
73 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
74 | } | ||
75 | } | ||
76 | } | 71 | } |
77 | UnicodeEscape => { | 72 | } |
78 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); | 73 | } |
79 | 74 | ||
80 | if text.len() == 2 { | 75 | pub(super) fn is_ascii_escape(code: char) -> bool { |
81 | // No starting `{` | 76 | match code { |
82 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | 77 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, |
83 | return; | 78 | _ => false, |
84 | } | 79 | } |
80 | } | ||
85 | 81 | ||
86 | if text.len() == 3 { | 82 | fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { |
87 | // Only starting `{` | 83 | // An AsciiCodeEscape has 4 chars, example: `\xDD` |
88 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | 84 | if text.len() < 4 { |
89 | return; | 85 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); |
90 | } | 86 | } else { |
87 | assert!( | ||
88 | text.chars().count() == 4, | ||
89 | "AsciiCodeEscape cannot be longer than 4 chars" | ||
90 | ); | ||
91 | 91 | ||
92 | let mut code = ArrayString::<[_; 6]>::new(); | 92 | match u8::from_str_radix(&text[2..], 16) { |
93 | let mut closed = false; | 93 | Ok(code) if code < 128 => { /* Escape code is valid */ } |
94 | for c in text[3..].chars() { | 94 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), |
95 | assert!(!closed, "no characters after escape is closed"); | 95 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), |
96 | 96 | } | |
97 | if c.is_digit(16) { | 97 | } |
98 | if code.len() == 6 { | 98 | } |
99 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
100 | return; | ||
101 | } | ||
102 | |||
103 | code.push(c); | ||
104 | } else if c == '_' { | ||
105 | // Reject leading _ | ||
106 | if code.len() == 0 { | ||
107 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
108 | return; | ||
109 | } | ||
110 | } else if c == '}' { | ||
111 | closed = true; | ||
112 | } else { | ||
113 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
114 | return; | ||
115 | } | ||
116 | } | ||
117 | 99 | ||
118 | if !closed { | 100 | fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { |
119 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | 101 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); |
120 | } | ||
121 | 102 | ||
122 | if code.len() == 0 { | 103 | if text.len() == 2 { |
123 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | 104 | // No starting `{` |
105 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
106 | return; | ||
107 | } | ||
108 | |||
109 | if text.len() == 3 { | ||
110 | // Only starting `{` | ||
111 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | let mut code = ArrayString::<[_; 6]>::new(); | ||
116 | let mut closed = false; | ||
117 | for c in text[3..].chars() { | ||
118 | assert!(!closed, "no characters after escape is closed"); | ||
119 | |||
120 | if c.is_digit(16) { | ||
121 | if code.len() == 6 { | ||
122 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
124 | return; | 123 | return; |
125 | } | 124 | } |
126 | 125 | ||
127 | match u32::from_str_radix(&code, 16) { | 126 | code.push(c); |
128 | Ok(code_u32) if code_u32 > 0x10FFFF => { | 127 | } else if c == '_' { |
129 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | 128 | // Reject leading _ |
130 | } | 129 | if code.len() == 0 { |
131 | Ok(_) => { | 130 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); |
132 | // Valid escape code | 131 | return; |
133 | } | ||
134 | Err(_) => { | ||
135 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
136 | } | ||
137 | } | ||
138 | } | ||
139 | CodePoint => { | ||
140 | // These code points must always be escaped | ||
141 | if text == "\t" || text == "\r" { | ||
142 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
143 | } | 132 | } |
133 | } else if c == '}' { | ||
134 | closed = true; | ||
135 | } else { | ||
136 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
137 | return; | ||
144 | } | 138 | } |
145 | } | 139 | } |
146 | } | ||
147 | 140 | ||
148 | fn is_ascii_escape(code: char) -> bool { | 141 | if !closed { |
149 | match code { | 142 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) |
150 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | 143 | } |
151 | _ => false, | 144 | |
145 | if code.len() == 0 { | ||
146 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
147 | return; | ||
148 | } | ||
149 | |||
150 | match u32::from_str_radix(&code, 16) { | ||
151 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
152 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
153 | } | ||
154 | Ok(_) => { | ||
155 | // Valid escape code | ||
156 | } | ||
157 | Err(_) => { | ||
158 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
159 | } | ||
152 | } | 160 | } |
153 | } | 161 | } |
154 | 162 | ||
diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs index 2ff0bc26d..acad7cb7f 100644 --- a/crates/ra_syntax/src/validation/mod.rs +++ b/crates/ra_syntax/src/validation/mod.rs | |||
@@ -5,6 +5,7 @@ use crate::{ | |||
5 | yellow::SyntaxError, | 5 | yellow::SyntaxError, |
6 | }; | 6 | }; |
7 | 7 | ||
8 | mod byte; | ||
8 | mod char; | 9 | mod char; |
9 | mod string; | 10 | mod string; |
10 | 11 | ||
@@ -12,6 +13,7 @@ pub(crate) fn validate(file: &SourceFileNode) -> Vec<SyntaxError> { | |||
12 | let mut errors = Vec::new(); | 13 | let mut errors = Vec::new(); |
13 | for node in file.syntax().descendants() { | 14 | for node in file.syntax().descendants() { |
14 | let _ = visitor_ctx(&mut errors) | 15 | let _ = visitor_ctx(&mut errors) |
16 | .visit::<ast::Byte, _>(self::byte::validate_byte_node) | ||
15 | .visit::<ast::Char, _>(self::char::validate_char_node) | 17 | .visit::<ast::Char, _>(self::char::validate_char_node) |
16 | .visit::<ast::String, _>(self::string::validate_string_node) | 18 | .visit::<ast::String, _>(self::string::validate_string_node) |
17 | .accept(node); | 19 | .accept(node); |