diff options
Diffstat (limited to 'crates/ra_syntax/src/validation')
-rw-r--r-- | crates/ra_syntax/src/validation/byte.rs | 211 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte_string.rs | 178 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 276 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/mod.rs | 24 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/string.rs | 168 |
5 files changed, 857 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs new file mode 100644 index 000000000..43c0d7edd --- /dev/null +++ b/crates/ra_syntax/src/validation/byte.rs | |||
@@ -0,0 +1,211 @@ | |||
1 | //! Validation of byte literals | ||
2 | |||
3 | use crate::{ | ||
4 | ast::{self, AstNode}, | ||
5 | string_lexing::{self, CharComponentKind}, | ||
6 | TextRange, | ||
7 | validation::char, | ||
8 | yellow::{ | ||
9 | SyntaxError, | ||
10 | SyntaxErrorKind::*, | ||
11 | }, | ||
12 | }; | ||
13 | |||
14 | pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec<SyntaxError>) { | ||
15 | let literal_text = node.text(); | ||
16 | let literal_range = node.syntax().range(); | ||
17 | let mut components = string_lexing::parse_byte_literal(literal_text); | ||
18 | let mut len = 0; | ||
19 | for component in &mut components { | ||
20 | len += 1; | ||
21 | let text = &literal_text[component.range]; | ||
22 | let range = component.range + literal_range.start(); | ||
23 | validate_byte_component(text, component.kind, range, errors); | ||
24 | } | ||
25 | |||
26 | if !components.has_closing_quote { | ||
27 | errors.push(SyntaxError::new(UnclosedByte, literal_range)); | ||
28 | } | ||
29 | |||
30 | if len == 0 { | ||
31 | errors.push(SyntaxError::new(EmptyByte, literal_range)); | ||
32 | } | ||
33 | |||
34 | if len > 1 { | ||
35 | errors.push(SyntaxError::new(OverlongByte, literal_range)); | ||
36 | } | ||
37 | } | ||
38 | |||
39 | pub(super) fn validate_byte_component( | ||
40 | text: &str, | ||
41 | kind: CharComponentKind, | ||
42 | range: TextRange, | ||
43 | errors: &mut Vec<SyntaxError>, | ||
44 | ) { | ||
45 | use self::CharComponentKind::*; | ||
46 | match kind { | ||
47 | AsciiEscape => validate_byte_escape(text, range, errors), | ||
48 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), | ||
49 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), | ||
50 | CodePoint => { | ||
51 | let c = text | ||
52 | .chars() | ||
53 | .next() | ||
54 | .expect("Code points should be one character long"); | ||
55 | |||
56 | // These bytes must always be escaped | ||
57 | if c == '\t' || c == '\r' || c == '\n' { | ||
58 | errors.push(SyntaxError::new(UnescapedByte, range)); | ||
59 | } | ||
60 | |||
61 | // Only ASCII bytes are allowed | ||
62 | if c > 0x7F as char { | ||
63 | errors.push(SyntaxError::new(ByteOutOfRange, range)); | ||
64 | } | ||
65 | } | ||
66 | } | ||
67 | } | ||
68 | |||
69 | fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
70 | if text.len() == 1 { | ||
71 | // Escape sequence consists only of leading `\` | ||
72 | errors.push(SyntaxError::new(EmptyByteEscape, range)); | ||
73 | } else { | ||
74 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
75 | if !char::is_ascii_escape(escape_code) { | ||
76 | errors.push(SyntaxError::new(InvalidByteEscape, range)); | ||
77 | } | ||
78 | } | ||
79 | } | ||
80 | |||
81 | fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
82 | // A ByteCodeEscape has 4 chars, example: `\xDD` | ||
83 | if text.len() < 4 { | ||
84 | errors.push(SyntaxError::new(TooShortByteCodeEscape, range)); | ||
85 | } else { | ||
86 | assert!( | ||
87 | text.chars().count() == 4, | ||
88 | "ByteCodeEscape cannot be longer than 4 chars" | ||
89 | ); | ||
90 | |||
91 | if u8::from_str_radix(&text[2..], 16).is_err() { | ||
92 | errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); | ||
93 | } | ||
94 | } | ||
95 | } | ||
96 | |||
#[cfg(test)]
mod test {
    use crate::SourceFileNode;

    /// Parses a constant item whose initializer is the byte literal `b'<literal>'`.
    fn build_file(literal: &str) -> SourceFileNode {
        SourceFileNode::parse(&format!("const C: u8 = b'{}';", literal))
    }

    /// Asserts that parsing the literal reports no errors at all.
    fn assert_valid_byte(literal: &str) {
        let file = build_file(literal);
        let errors = file.errors();
        assert!(
            errors.is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            errors
        );
    }

    /// Asserts that parsing the literal reports at least one error.
    fn assert_invalid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(!file.errors().is_empty());
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..128u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\n' | b'\r' | b'\t' => assert_invalid_byte(&literal),
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                _ => assert_valid_byte(&literal),
            }
        }

        // Everything above the ASCII range must be rejected
        for byte in 128..=255u8 {
            let literal = (byte as char).to_string();
            assert_invalid_byte(&literal);
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        let invalid = ["Ƒ", "バ", "メ", "﷽"];
        invalid.iter().for_each(|literal| assert_invalid_byte(literal));
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        let invalid = ["नी", "👨👨"];
        invalid.iter().for_each(|literal| assert_invalid_byte(literal));
    }

    #[test]
    fn test_valid_byte_escape() {
        let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
        valid.iter().for_each(|literal| assert_valid_byte(literal));
    }

    #[test]
    fn test_invalid_byte_escape() {
        let invalid = [r"\a", r"\?", r"\"];
        invalid.iter().for_each(|literal| assert_invalid_byte(literal));
    }

    #[test]
    fn test_valid_byte_code_escape() {
        let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
        valid.iter().for_each(|literal| assert_valid_byte(literal));
    }

    #[test]
    fn test_invalid_byte_code_escape() {
        let invalid = [r"\x", r"\x7"];
        invalid.iter().for_each(|literal| assert_invalid_byte(literal));
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Syntactically fine unicode escapes are still forbidden in bytes
        let well_formed = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        well_formed
            .iter()
            .chain(invalid.iter())
            .for_each(|literal| assert_invalid_byte(literal));
    }
}
diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs new file mode 100644 index 000000000..7b830e97c --- /dev/null +++ b/crates/ra_syntax/src/validation/byte_string.rs | |||
@@ -0,0 +1,178 @@ | |||
1 | use crate::{ | ||
2 | ast::{self, AstNode}, | ||
3 | string_lexing::{self, StringComponentKind}, | ||
4 | yellow::{ | ||
5 | SyntaxError, | ||
6 | SyntaxErrorKind::*, | ||
7 | }, | ||
8 | }; | ||
9 | |||
10 | use super::byte; | ||
11 | |||
12 | pub(crate) fn validate_byte_string_node(node: ast::ByteString, errors: &mut Vec<SyntaxError>) { | ||
13 | let literal_text = node.text(); | ||
14 | let literal_range = node.syntax().range(); | ||
15 | let mut components = string_lexing::parse_byte_string_literal(literal_text); | ||
16 | for component in &mut components { | ||
17 | let range = component.range + literal_range.start(); | ||
18 | |||
19 | match component.kind { | ||
20 | StringComponentKind::Char(kind) => { | ||
21 | // Chars must escape \t, \n and \r codepoints, but strings don't | ||
22 | let text = &literal_text[component.range]; | ||
23 | match text { | ||
24 | "\t" | "\n" | "\r" => { /* always valid */ } | ||
25 | _ => byte::validate_byte_component(text, kind, range, errors), | ||
26 | } | ||
27 | } | ||
28 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
29 | } | ||
30 | } | ||
31 | |||
32 | if !components.has_closing_quote { | ||
33 | errors.push(SyntaxError::new(UnclosedString, literal_range)); | ||
34 | } | ||
35 | } | ||
36 | |||
#[cfg(test)]
mod test {
    use crate::SourceFileNode;

    /// Parses a constant item whose initializer is the byte string `b"<literal>"`.
    fn build_file(literal: &str) -> SourceFileNode {
        let source = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
        println!("Source: {}", source);
        SourceFileNode::parse(&source)
    }

    /// Asserts that parsing the literal reports no errors at all.
    fn assert_valid_str(literal: &str) {
        let file = build_file(literal);
        let errors = file.errors();
        assert!(
            errors.is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            errors
        );
    }

    /// Asserts that parsing the literal reports at least one error.
    fn assert_invalid_str(literal: &str) {
        let file = build_file(literal);
        assert!(!file.errors().is_empty());
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..128u8 {
            // Ignore string close and backslash
            if byte == b'\"' || byte == b'\\' {
                continue;
            }
            assert_valid_str(&(byte as char).to_string());
        }

        // Everything above the ASCII range must be rejected
        for byte in 128..=255u8 {
            let literal = (byte as char).to_string();
            assert_invalid_str(&literal);
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        let invalid = ["Ƒ", "バ", "メ", "﷽"];
        invalid.iter().for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        let invalid = ["नी", "👨👨"];
        invalid.iter().for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_valid_ascii_escape() {
        let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
        valid.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_invalid_ascii_escape() {
        let invalid = [r"\a", r"\?", r"\"];
        invalid.iter().for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
        valid.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        let invalid = [r"\x", r"\x7"];
        invalid.iter().for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Syntactically fine unicode escapes are still forbidden in byte strings
        let well_formed = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        well_formed
            .iter()
            .chain(invalid.iter())
            .for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_mixed_invalid() {
        let literal = r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ﷽";
        assert_invalid_str(literal);
    }

    #[test]
    fn test_mixed_valid() {
        let literal = r"This is the tale of a string
with a newline in between, no emoji at all,
nor unicode escapes or weird stuff";
        assert_valid_str(literal);
    }

    #[test]
    fn test_ignore_newline() {
        let literal = "Hello \
                       World";
        assert_valid_str(literal);
    }
}
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs new file mode 100644 index 000000000..4728c85e6 --- /dev/null +++ b/crates/ra_syntax/src/validation/char.rs | |||
@@ -0,0 +1,276 @@ | |||
1 | //! Validation of char literals | ||
2 | |||
3 | use std::u32; | ||
4 | |||
5 | use arrayvec::ArrayString; | ||
6 | |||
7 | use crate::{ | ||
8 | ast::{self, AstNode}, | ||
9 | string_lexing::{self, CharComponentKind}, | ||
10 | TextRange, | ||
11 | yellow::{ | ||
12 | SyntaxError, | ||
13 | SyntaxErrorKind::*, | ||
14 | }, | ||
15 | }; | ||
16 | |||
17 | pub(super) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) { | ||
18 | let literal_text = node.text(); | ||
19 | let literal_range = node.syntax().range(); | ||
20 | let mut components = string_lexing::parse_char_literal(literal_text); | ||
21 | let mut len = 0; | ||
22 | for component in &mut components { | ||
23 | len += 1; | ||
24 | let text = &literal_text[component.range]; | ||
25 | let range = component.range + literal_range.start(); | ||
26 | validate_char_component(text, component.kind, range, errors); | ||
27 | } | ||
28 | |||
29 | if !components.has_closing_quote { | ||
30 | errors.push(SyntaxError::new(UnclosedChar, literal_range)); | ||
31 | } | ||
32 | |||
33 | if len == 0 { | ||
34 | errors.push(SyntaxError::new(EmptyChar, literal_range)); | ||
35 | } | ||
36 | |||
37 | if len > 1 { | ||
38 | errors.push(SyntaxError::new(OverlongChar, literal_range)); | ||
39 | } | ||
40 | } | ||
41 | |||
42 | pub(super) fn validate_char_component( | ||
43 | text: &str, | ||
44 | kind: CharComponentKind, | ||
45 | range: TextRange, | ||
46 | errors: &mut Vec<SyntaxError>, | ||
47 | ) { | ||
48 | // Validate escapes | ||
49 | use self::CharComponentKind::*; | ||
50 | match kind { | ||
51 | AsciiEscape => validate_ascii_escape(text, range, errors), | ||
52 | AsciiCodeEscape => validate_ascii_code_escape(text, range, errors), | ||
53 | UnicodeEscape => validate_unicode_escape(text, range, errors), | ||
54 | CodePoint => { | ||
55 | // These code points must always be escaped | ||
56 | if text == "\t" || text == "\r" || text == "\n" { | ||
57 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
58 | } | ||
59 | } | ||
60 | } | ||
61 | } | ||
62 | |||
63 | fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
64 | if text.len() == 1 { | ||
65 | // Escape sequence consists only of leading `\` | ||
66 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); | ||
67 | } else { | ||
68 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
69 | if !is_ascii_escape(escape_code) { | ||
70 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); | ||
71 | } | ||
72 | } | ||
73 | } | ||
74 | |||
75 | pub(super) fn is_ascii_escape(code: char) -> bool { | ||
76 | match code { | ||
77 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | ||
78 | _ => false, | ||
79 | } | ||
80 | } | ||
81 | |||
82 | fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
83 | // An AsciiCodeEscape has 4 chars, example: `\xDD` | ||
84 | if text.len() < 4 { | ||
85 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); | ||
86 | } else { | ||
87 | assert!( | ||
88 | text.chars().count() == 4, | ||
89 | "AsciiCodeEscape cannot be longer than 4 chars" | ||
90 | ); | ||
91 | |||
92 | match u8::from_str_radix(&text[2..], 16) { | ||
93 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
94 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
95 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
96 | } | ||
97 | } | ||
98 | } | ||
99 | |||
100 | fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
101 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); | ||
102 | |||
103 | if text.len() == 2 { | ||
104 | // No starting `{` | ||
105 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
106 | return; | ||
107 | } | ||
108 | |||
109 | if text.len() == 3 { | ||
110 | // Only starting `{` | ||
111 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | let mut code = ArrayString::<[_; 6]>::new(); | ||
116 | let mut closed = false; | ||
117 | for c in text[3..].chars() { | ||
118 | assert!(!closed, "no characters after escape is closed"); | ||
119 | |||
120 | if c.is_digit(16) { | ||
121 | if code.len() == 6 { | ||
122 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
123 | return; | ||
124 | } | ||
125 | |||
126 | code.push(c); | ||
127 | } else if c == '_' { | ||
128 | // Reject leading _ | ||
129 | if code.len() == 0 { | ||
130 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
131 | return; | ||
132 | } | ||
133 | } else if c == '}' { | ||
134 | closed = true; | ||
135 | } else { | ||
136 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
137 | return; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | if !closed { | ||
142 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | ||
143 | } | ||
144 | |||
145 | if code.len() == 0 { | ||
146 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
147 | return; | ||
148 | } | ||
149 | |||
150 | match u32::from_str_radix(&code, 16) { | ||
151 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
152 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
153 | } | ||
154 | Ok(_) => { | ||
155 | // Valid escape code | ||
156 | } | ||
157 | Err(_) => { | ||
158 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
159 | } | ||
160 | } | ||
161 | } | ||
162 | |||
#[cfg(test)]
mod test {
    use crate::SourceFileNode;

    /// Parses a constant item whose initializer is the char literal `'<literal>'`.
    fn build_file(literal: &str) -> SourceFileNode {
        SourceFileNode::parse(&format!("const C: char = '{}';", literal))
    }

    /// Asserts that parsing the literal reports no errors at all.
    fn assert_valid_char(literal: &str) {
        let file = build_file(literal);
        let errors = file.errors();
        assert!(
            errors.is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            errors
        );
    }

    /// Asserts that parsing the literal reports at least one error.
    fn assert_invalid_char(literal: &str) {
        let file = build_file(literal);
        assert!(!file.errors().is_empty());
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\n' | b'\r' | b'\t' => assert_invalid_char(&literal),
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                _ => assert_valid_char(&literal),
            }
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        let valid = ["Ƒ", "バ", "メ", "﷽"];
        valid.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        let invalid = ["नी", "👨👨"];
        invalid.iter().for_each(|literal| assert_invalid_char(literal));
    }

    #[test]
    fn test_valid_ascii_escape() {
        let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
        valid.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_invalid_ascii_escape() {
        let invalid = [r"\a", r"\?", r"\"];
        invalid.iter().for_each(|literal| assert_invalid_char(literal));
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        let valid = [r"\x00", r"\x7F", r"\x55"];
        valid.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        let invalid = [r"\x", r"\x7", r"\xF0"];
        invalid.iter().for_each(|literal| assert_invalid_char(literal));
    }

    #[test]
    fn test_valid_unicode_escape() {
        let valid = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        valid.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_invalid_unicode_escape() {
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        invalid.iter().for_each(|literal| assert_invalid_char(literal));
    }
}
diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs new file mode 100644 index 000000000..bdee8120c --- /dev/null +++ b/crates/ra_syntax/src/validation/mod.rs | |||
@@ -0,0 +1,24 @@ | |||
1 | use crate::{ | ||
2 | algo::visit::{visitor_ctx, VisitorCtx}, | ||
3 | ast, | ||
4 | SourceFileNode, | ||
5 | yellow::SyntaxError, | ||
6 | }; | ||
7 | |||
8 | mod byte; | ||
9 | mod byte_string; | ||
10 | mod char; | ||
11 | mod string; | ||
12 | |||
13 | pub(crate) fn validate(file: &SourceFileNode) -> Vec<SyntaxError> { | ||
14 | let mut errors = Vec::new(); | ||
15 | for node in file.syntax().descendants() { | ||
16 | let _ = visitor_ctx(&mut errors) | ||
17 | .visit::<ast::Byte, _>(self::byte::validate_byte_node) | ||
18 | .visit::<ast::ByteString, _>(self::byte_string::validate_byte_string_node) | ||
19 | .visit::<ast::Char, _>(self::char::validate_char_node) | ||
20 | .visit::<ast::String, _>(self::string::validate_string_node) | ||
21 | .accept(node); | ||
22 | } | ||
23 | errors | ||
24 | } | ||
diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs new file mode 100644 index 000000000..089879d15 --- /dev/null +++ b/crates/ra_syntax/src/validation/string.rs | |||
@@ -0,0 +1,168 @@ | |||
1 | use crate::{ | ||
2 | ast::{self, AstNode}, | ||
3 | string_lexing::{self, StringComponentKind}, | ||
4 | yellow::{ | ||
5 | SyntaxError, | ||
6 | SyntaxErrorKind::*, | ||
7 | }, | ||
8 | }; | ||
9 | |||
10 | use super::char; | ||
11 | |||
12 | pub(crate) fn validate_string_node(node: ast::String, errors: &mut Vec<SyntaxError>) { | ||
13 | let literal_text = node.text(); | ||
14 | let literal_range = node.syntax().range(); | ||
15 | let mut components = string_lexing::parse_string_literal(literal_text); | ||
16 | for component in &mut components { | ||
17 | let range = component.range + literal_range.start(); | ||
18 | |||
19 | match component.kind { | ||
20 | StringComponentKind::Char(kind) => { | ||
21 | // Chars must escape \t, \n and \r codepoints, but strings don't | ||
22 | let text = &literal_text[component.range]; | ||
23 | match text { | ||
24 | "\t" | "\n" | "\r" => { /* always valid */ } | ||
25 | _ => char::validate_char_component(text, kind, range, errors), | ||
26 | } | ||
27 | } | ||
28 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
29 | } | ||
30 | } | ||
31 | |||
32 | if !components.has_closing_quote { | ||
33 | errors.push(SyntaxError::new(UnclosedString, literal_range)); | ||
34 | } | ||
35 | } | ||
36 | |||
#[cfg(test)]
mod test {
    use crate::SourceFileNode;

    /// Parses a constant item whose initializer is the string `"<literal>"`.
    fn build_file(literal: &str) -> SourceFileNode {
        let source = format!(r#"const S: &'static str = "{}";"#, literal);
        println!("Source: {}", source);
        SourceFileNode::parse(&source)
    }

    /// Asserts that parsing the literal reports no errors at all.
    fn assert_valid_str(literal: &str) {
        let file = build_file(literal);
        let errors = file.errors();
        assert!(
            errors.is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            errors
        );
    }

    /// Asserts that parsing the literal reports at least one error.
    fn assert_invalid_str(literal: &str) {
        let file = build_file(literal);
        assert!(!file.errors().is_empty());
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            // Ignore string close and backslash
            if byte == b'\"' || byte == b'\\' {
                continue;
            }
            assert_valid_str(&(byte as char).to_string());
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        let valid = ["Ƒ", "バ", "メ", "﷽"];
        valid.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        let valid = ["नी", "👨👨"];
        valid.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_valid_ascii_escape() {
        let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
        valid.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_invalid_ascii_escape() {
        let invalid = [r"\a", r"\?", r"\"];
        invalid.iter().for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        let valid = [r"\x00", r"\x7F", r"\x55"];
        valid.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        let invalid = [r"\x", r"\x7", r"\xF0"];
        invalid.iter().for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_valid_unicode_escape() {
        let valid = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        valid.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_invalid_unicode_escape() {
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        invalid.iter().for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_mixed() {
        let literal = r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ﷽";
        assert_valid_str(literal);
    }

    #[test]
    fn test_ignore_newline() {
        let literal = "Hello \
                       World";
        assert_valid_str(literal);
    }
}