aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/validation
diff options
context:
space:
mode:
author: Aleksey Kladov <[email protected]>, 2019-05-07 17:38:26 +0100
committer: Aleksey Kladov <[email protected]>, 2019-05-07 17:41:59 +0100
commit: 313314e14b629ebf50389dbd2d440bda922f6ae7 (patch)
tree: ca6a4aee6ad4077a869a932a18c6c8d134406f8c /crates/ra_syntax/src/validation
parent: ef782adc293deb287128f005dbab2038ba3ccdc1 (diff)
share literal validation logic with compiler
Diffstat (limited to 'crates/ra_syntax/src/validation')
-rw-r--r-- crates/ra_syntax/src/validation/byte.rs 199
-rw-r--r-- crates/ra_syntax/src/validation/byte_string.rs 169
-rw-r--r-- crates/ra_syntax/src/validation/char.rs 273
-rw-r--r-- crates/ra_syntax/src/validation/string.rs 154
-rw-r--r-- crates/ra_syntax/src/validation/unescape.rs 521
5 files changed, 521 insertions, 795 deletions
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs
deleted file mode 100644
index f653e65d0..000000000
--- a/crates/ra_syntax/src/validation/byte.rs
+++ /dev/null
@@ -1,199 +0,0 @@
1//! Validation of byte literals
2
3use crate::{
4 string_lexing::{self, StringComponentKind},
5 TextRange,
6 validation::char,
7 SyntaxError,
8 SyntaxErrorKind::*,
9 SyntaxToken,
10};
11
12pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
13 let literal_text = node.text();
14 let literal_range = node.range();
15 let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text);
16 let mut len = 0;
17 for component in &mut components {
18 len += 1;
19 let text = &literal_text[component.range];
20 let range = component.range + literal_range.start();
21 validate_byte_component(text, component.kind, range, errors);
22 }
23
24 if !components.has_closing_quote {
25 errors.push(SyntaxError::new(UnclosedByte, literal_range));
26 }
27
28 if let Some(range) = components.suffix {
29 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
30 }
31
32 if len == 0 {
33 errors.push(SyntaxError::new(EmptyByte, literal_range));
34 }
35
36 if len > 1 {
37 errors.push(SyntaxError::new(OverlongByte, literal_range));
38 }
39}
40
41pub(super) fn validate_byte_component(
42 text: &str,
43 kind: StringComponentKind,
44 range: TextRange,
45 errors: &mut Vec<SyntaxError>,
46) {
47 use self::StringComponentKind::*;
48 match kind {
49 AsciiEscape => validate_byte_escape(text, range, errors),
50 AsciiCodeEscape => validate_byte_code_escape(text, range, errors),
51 UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)),
52 CodePoint => {
53 let c = text.chars().next().expect("Code points should be one character long");
54
55 // These bytes must always be escaped
56 if c == '\t' || c == '\r' || c == '\n' {
57 errors.push(SyntaxError::new(UnescapedByte, range));
58 }
59
60 // Only ASCII bytes are allowed
61 if c > 0x7F as char {
62 errors.push(SyntaxError::new(ByteOutOfRange, range));
63 }
64 }
65 IgnoreNewline => { /* always valid */ }
66 }
67}
68
69fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
70 if text.len() == 1 {
71 // Escape sequence consists only of leading `\`
72 errors.push(SyntaxError::new(EmptyByteEscape, range));
73 } else {
74 let escape_code = text.chars().skip(1).next().unwrap();
75 if !char::is_ascii_escape(escape_code) {
76 errors.push(SyntaxError::new(InvalidByteEscape, range));
77 }
78 }
79}
80
81fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
82 // A ByteCodeEscape has 4 chars, example: `\xDD`
83 if !text.is_ascii() {
84 errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
85 } else if text.chars().count() < 4 {
86 errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
87 } else {
88 assert!(text.chars().count() == 4, "ByteCodeEscape cannot be longer than 4 chars");
89
90 if u8::from_str_radix(&text[2..], 16).is_err() {
91 errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
92 }
93 }
94}
95
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a small source file whose initializer is the byte literal `b'<literal>'`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        SourceFile::parse(&format!("const C: u8 = b'{}';", literal))
    }

    fn assert_valid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    fn assert_invalid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    /// Asserts that every literal in `literals` parses without errors.
    fn assert_all_valid(literals: &[&str]) {
        for literal in literals {
            assert_valid_byte(literal);
        }
    }

    /// Asserts that every literal in `literals` produces at least one error.
    fn assert_all_invalid(literals: &[&str]) {
        for literal in literals {
            assert_invalid_byte(literal);
        }
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..128u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                b'\n' | b'\r' | b'\t' => assert_invalid_byte(&literal),
                _ => assert_valid_byte(&literal),
            }
        }

        for byte in 128..=255u8 {
            assert_invalid_byte(&(byte as char).to_string());
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        assert_all_invalid(&["Ƒ", "バ", "メ", "﷽"]);
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        assert_all_invalid(&["नी", "👨‍👨‍"]);
    }

    #[test]
    fn test_valid_byte_escape() {
        assert_all_valid(&[r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]);
    }

    #[test]
    fn test_invalid_byte_escape() {
        assert_all_invalid(&[r"\a", r"\?", r"\"]);
    }

    #[test]
    fn test_valid_byte_code_escape() {
        assert_all_valid(&[r"\x00", r"\x7F", r"\x55", r"\xF0"]);
    }

    #[test]
    fn test_invalid_byte_code_escape() {
        assert_all_invalid(&[r"\x", r"\x7"]);
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Well-formed unicode escapes are still rejected inside byte literals.
        assert_all_invalid(&[
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ]);

        // Malformed ones are rejected as well.
        assert_all_invalid(&[
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ]);
    }
}
diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs
deleted file mode 100644
index 1d48c2d9b..000000000
--- a/crates/ra_syntax/src/validation/byte_string.rs
+++ /dev/null
@@ -1,169 +0,0 @@
1use crate::{
2 string_lexing::{self, StringComponentKind},
3 SyntaxError,
4 SyntaxErrorKind::*,
5 SyntaxToken,
6};
7
8use super::byte;
9
10pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
11 let literal_text = node.text();
12 let literal_range = node.range();
13 let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text);
14 for component in &mut components {
15 let range = component.range + literal_range.start();
16
17 match component.kind {
18 StringComponentKind::IgnoreNewline => { /* always valid */ }
19 _ => {
20 // Chars must escape \t, \n and \r codepoints, but strings don't
21 let text = &literal_text[component.range];
22 match text {
23 "\t" | "\n" | "\r" => { /* always valid */ }
24 _ => byte::validate_byte_component(text, component.kind, range, errors),
25 }
26 }
27 }
28 }
29
30 if !components.has_closing_quote {
31 errors.push(SyntaxError::new(UnclosedString, literal_range));
32 }
33
34 if let Some(range) = components.suffix {
35 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
36 }
37}
38
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a small source file whose initializer is the byte string `b"<literal>"`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
        println!("Source: {}", src);
        SourceFile::parse(&src)
    }

    fn assert_valid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    fn assert_invalid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    /// Asserts that every literal in `literals` parses without errors.
    fn assert_all_valid(literals: &[&str]) {
        for literal in literals {
            assert_valid_str(literal);
        }
    }

    /// Asserts that every literal in `literals` produces at least one error.
    fn assert_all_invalid(literals: &[&str]) {
        for literal in literals {
            assert_invalid_str(literal);
        }
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..128u8 {
            match byte {
                b'\"' | b'\\' => { /* Ignore string close and backslash */ }
                _ => assert_valid_str(&(byte as char).to_string()),
            }
        }

        for byte in 128..=255u8 {
            assert_invalid_str(&(byte as char).to_string());
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        assert_all_invalid(&["Ƒ", "バ", "メ", "﷽"]);
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        assert_all_invalid(&["नी", "👨‍👨‍"]);
    }

    #[test]
    fn test_valid_ascii_escape() {
        assert_all_valid(&[r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]);
    }

    #[test]
    fn test_invalid_ascii_escape() {
        assert_all_invalid(&[r"\a", r"\?", r"\"]);
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        assert_all_valid(&[r"\x00", r"\x7F", r"\x55", r"\xF0"]);
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        assert_all_invalid(&[r"\x", r"\x7"]);
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Well-formed unicode escapes are still rejected inside byte strings.
        assert_all_invalid(&[
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ]);

        // Malformed ones are rejected as well.
        assert_all_invalid(&[
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ]);
    }

    #[test]
    fn test_mixed_invalid() {
        assert_invalid_str(
            r"This is the tale of a string
with a newline in between, some emoji (👨‍👨‍) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ﷽",
        );
    }

    #[test]
    fn test_mixed_valid() {
        assert_valid_str(
            r"This is the tale of a string
with a newline in between, no emoji at all,
nor unicode escapes or weird stuff",
        );
    }

    #[test]
    fn test_ignore_newline() {
        assert_valid_str(
            "Hello \
             World",
        );
    }
}
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs
deleted file mode 100644
index 0f1885873..000000000
--- a/crates/ra_syntax/src/validation/char.rs
+++ /dev/null
@@ -1,273 +0,0 @@
1//! Validation of char literals
2
3use std::u32;
4
5use arrayvec::ArrayString;
6
7use crate::{
8 string_lexing::{self, StringComponentKind},
9 TextRange,
10 SyntaxError,
11 SyntaxErrorKind::*,
12 SyntaxToken,
13};
14
15pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
16 let literal_text = node.text();
17 let literal_range = node.range();
18 let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text);
19 let mut len = 0;
20 for component in &mut components {
21 len += 1;
22 let text = &literal_text[component.range];
23 let range = component.range + literal_range.start();
24 validate_char_component(text, component.kind, range, errors);
25 }
26
27 if !components.has_closing_quote {
28 errors.push(SyntaxError::new(UnclosedChar, literal_range));
29 }
30
31 if let Some(range) = components.suffix {
32 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
33 }
34
35 if len == 0 {
36 errors.push(SyntaxError::new(EmptyChar, literal_range));
37 }
38
39 if len > 1 {
40 errors.push(SyntaxError::new(OverlongChar, literal_range));
41 }
42}
43
44pub(super) fn validate_char_component(
45 text: &str,
46 kind: StringComponentKind,
47 range: TextRange,
48 errors: &mut Vec<SyntaxError>,
49) {
50 // Validate escapes
51 use self::StringComponentKind::*;
52 match kind {
53 AsciiEscape => validate_ascii_escape(text, range, errors),
54 AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
55 UnicodeEscape => validate_unicode_escape(text, range, errors),
56 CodePoint => {
57 // These code points must always be escaped
58 if text == "\t" || text == "\r" || text == "\n" {
59 errors.push(SyntaxError::new(UnescapedCodepoint, range));
60 }
61 }
62 StringComponentKind::IgnoreNewline => { /* always valid */ }
63 }
64}
65
66fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
67 if text.len() == 1 {
68 // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is treated as an unclosed char containing a single quote `'`)
69 errors.push(SyntaxError::new(EmptyAsciiEscape, range));
70 } else {
71 let escape_code = text.chars().skip(1).next().unwrap();
72 if !is_ascii_escape(escape_code) {
73 errors.push(SyntaxError::new(InvalidAsciiEscape, range));
74 }
75 }
76}
77
78pub(super) fn is_ascii_escape(code: char) -> bool {
79 match code {
80 '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
81 _ => false,
82 }
83}
84
85fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
86 // An AsciiCodeEscape has 4 chars, example: `\xDD`
87 if !text.is_ascii() {
88 // FIXME: Give a more precise error message (say what the invalid character was)
89 errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
90 } else if text.chars().count() < 4 {
91 errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
92 } else {
93 assert_eq!(
94 text.chars().count(),
95 4,
96 "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
97 text,
98 );
99
100 match u8::from_str_radix(&text[2..], 16) {
101 Ok(code) if code < 128 => { /* Escape code is valid */ }
102 Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
103 Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
104 }
105 }
106}
107
108fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
109 assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
110
111 if text.len() == 2 {
112 // No starting `{`
113 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
114 return;
115 }
116
117 if text.len() == 3 {
118 // Only starting `{`
119 errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
120 return;
121 }
122
123 let mut code = ArrayString::<[_; 6]>::new();
124 let mut closed = false;
125 for c in text[3..].chars() {
126 assert!(!closed, "no characters after escape is closed");
127
128 if c.is_digit(16) {
129 if code.len() == 6 {
130 errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
131 return;
132 }
133
134 code.push(c);
135 } else if c == '_' {
136 // Reject leading _
137 if code.len() == 0 {
138 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
139 return;
140 }
141 } else if c == '}' {
142 closed = true;
143 } else {
144 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
145 return;
146 }
147 }
148
149 if !closed {
150 errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
151 }
152
153 if code.len() == 0 {
154 errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
155 return;
156 }
157
158 match u32::from_str_radix(&code, 16) {
159 Ok(code_u32) if code_u32 > 0x10FFFF => {
160 errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
161 }
162 Ok(_) => {
163 // Valid escape code
164 }
165 Err(_) => {
166 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
167 }
168 }
169}
170
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a small source file whose initializer is the char literal `'<literal>'`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        SourceFile::parse(&format!("const C: char = '{}';", literal))
    }

    fn assert_valid_char(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    fn assert_invalid_char(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    /// Asserts that every literal in `literals` parses without errors.
    fn assert_all_valid(literals: &[&str]) {
        for literal in literals {
            assert_valid_char(literal);
        }
    }

    /// Asserts that every literal in `literals` produces at least one error.
    fn assert_all_invalid(literals: &[&str]) {
        for literal in literals {
            assert_invalid_char(literal);
        }
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                b'\n' | b'\r' | b'\t' => assert_invalid_char(&literal),
                _ => assert_valid_char(&literal),
            }
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        assert_all_valid(&["Ƒ", "バ", "メ", "﷽"]);
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        assert_all_invalid(&["नी", "👨‍👨‍"]);
    }

    #[test]
    fn test_valid_ascii_escape() {
        assert_all_valid(&[r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]);
    }

    #[test]
    fn test_invalid_ascii_escape() {
        assert_all_invalid(&[r"\a", r"\?", r"\"]);
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        assert_all_valid(&[r"\x00", r"\x7F", r"\x55"]);
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        assert_all_invalid(&[r"\x", r"\x7", r"\xF0"]);
    }

    #[test]
    fn test_valid_unicode_escape() {
        assert_all_valid(&[
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ]);
    }

    #[test]
    fn test_invalid_unicode_escape() {
        assert_all_invalid(&[
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ]);
    }
}
diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs
deleted file mode 100644
index fc2f1b992..000000000
--- a/crates/ra_syntax/src/validation/string.rs
+++ /dev/null
@@ -1,154 +0,0 @@
1use crate::{
2 string_lexing,
3 SyntaxError,
4 SyntaxErrorKind::*,
5 SyntaxToken,
6};
7
8use super::char;
9
10pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
11 let literal_text = node.text();
12 let literal_range = node.range();
13 let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text);
14 for component in &mut components {
15 let range = component.range + literal_range.start();
16
17 // Chars must escape \t, \n and \r codepoints, but strings don't
18 let text = &literal_text[component.range];
19 match text {
20 "\t" | "\n" | "\r" => { /* always valid */ }
21 _ => char::validate_char_component(text, component.kind, range, errors),
22 }
23 }
24
25 if !components.has_closing_quote {
26 errors.push(SyntaxError::new(UnclosedString, literal_range));
27 }
28
29 if let Some(range) = components.suffix {
30 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
31 }
32}
33
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a small source file whose initializer is the string literal `"<literal>"`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        let src = format!(r#"const S: &'static str = "{}";"#, literal);
        println!("Source: {}", src);
        SourceFile::parse(&src)
    }

    fn assert_valid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    fn assert_invalid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    /// Asserts that every literal in `literals` parses without errors.
    fn assert_all_valid(literals: &[&str]) {
        for literal in literals {
            assert_valid_str(literal);
        }
    }

    /// Asserts that every literal in `literals` produces at least one error.
    fn assert_all_invalid(literals: &[&str]) {
        for literal in literals {
            assert_invalid_str(literal);
        }
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            match byte {
                b'\"' | b'\\' => { /* Ignore string close and backslash */ }
                _ => assert_valid_str(&(byte as char).to_string()),
            }
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        assert_all_valid(&["Ƒ", "バ", "メ", "﷽"]);
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        assert_all_valid(&["नी", "👨‍👨‍"]);
    }

    #[test]
    fn test_valid_ascii_escape() {
        assert_all_valid(&[r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]);
    }

    #[test]
    fn test_invalid_ascii_escape() {
        assert_all_invalid(&[r"\a", r"\?", r"\"]);
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        assert_all_valid(&[r"\x00", r"\x7F", r"\x55"]);
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        assert_all_invalid(&[r"\x", r"\x7", r"\xF0"]);
    }

    #[test]
    fn test_valid_unicode_escape() {
        assert_all_valid(&[
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ]);
    }

    #[test]
    fn test_invalid_unicode_escape() {
        assert_all_invalid(&[
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ]);
    }

    #[test]
    fn test_mixed() {
        assert_valid_str(
            r"This is the tale of a string
with a newline in between, some emoji (👨‍👨‍) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ﷽",
        );
    }

    #[test]
    fn test_ignore_newline() {
        assert_valid_str(
            "Hello \
             World",
        );
    }
}
diff --git a/crates/ra_syntax/src/validation/unescape.rs b/crates/ra_syntax/src/validation/unescape.rs
new file mode 100644
index 000000000..2086046b6
--- /dev/null
+++ b/crates/ra_syntax/src/validation/unescape.rs
@@ -0,0 +1,521 @@
1//! Utilities for validating string and char literals and turning them into
2//! values they represent.
3//!
4//! This file is copy-pasted from the compiler
5//!
6//! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
7//!
8//! Hopefully, we'll share this code in a proper way some day
9
10use std::str::Chars;
11use std::ops::Range;
12
/// Everything that can go wrong while unescaping a char, byte, string or
/// byte-string literal.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum EscapeError {
    /// The char or byte literal is empty.
    ZeroChars,
    /// The char or byte literal contains more than a single character or escape.
    MoreThanOneChar,

    /// A backslash with nothing after it.
    LoneSlash,
    /// A backslash followed by a character that does not start a known escape.
    InvalidEscape,
    /// A raw `\r` that is not followed by `\n`.
    BareCarriageReturn,
    /// A character that must be written as an escape appeared unescaped.
    EscapeOnlyChar,

    /// `\x` followed by fewer than two characters.
    TooShortHexEscape,
    /// A non-hexadecimal character inside a `\x` escape.
    InvalidCharInHexEscape,
    /// A `\x` escape above `0x7F` outside of byte and byte-string literals.
    OutOfRangeHexEscape,

    /// `\u` not followed by `{`.
    NoBraceInUnicodeEscape,
    /// A non-hexadecimal character inside `\u{…}`.
    InvalidCharInUnicodeEscape,
    /// `\u{}` without any digits.
    EmptyUnicodeEscape,
    /// `\u{…` without the closing `}`.
    UnclosedUnicodeEscape,
    /// `\u{_…}`: an underscore directly after the opening brace.
    LeadingUnderscoreUnicodeEscape,
    /// More than six hexadecimal digits inside `\u{…}`.
    OverlongUnicodeEscape,
    /// A value up to `0x10FFFF` that is nevertheless not a valid `char`.
    LoneSurrogateUnicodeEscape,
    /// A value above `0x10FFFF`.
    OutOfRangeUnicodeEscape,

    /// A `\u{…}` escape inside a byte or byte-string literal.
    UnicodeEscapeInByte,
    /// A raw non-ASCII character inside a byte or byte-string literal.
    NonAsciiCharInByte,
}
39
40/// Takes a contents of a char literal (without quotes), and returns an
41/// unescaped char or an error
42pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
43 let mut chars = literal_text.chars();
44 unescape_char_or_byte(&mut chars, Mode::Char)
45 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
46}
47
48/// Takes a contents of a string literal (without quotes) and produces a
49/// sequence of escaped characters or errors.
50pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
51where
52 F: FnMut(Range<usize>, Result<char, EscapeError>),
53{
54 unescape_str_or_byte_str(literal_text, Mode::Str, callback)
55}
56
57pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
58 let mut chars = literal_text.chars();
59 unescape_char_or_byte(&mut chars, Mode::Byte)
60 .map(byte_from_char)
61 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
62}
63
64/// Takes a contents of a string literal (without quotes) and produces a
65/// sequence of escaped characters or errors.
66pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
67where
68 F: FnMut(Range<usize>, Result<u8, EscapeError>),
69{
70 unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
71 callback(range, char.map(byte_from_char))
72 })
73}
74
/// The kind of literal being unescaped.
#[derive(Clone, Copy, Debug)]
pub(crate) enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// `true` for the single-quoted literal kinds (`Char` and `Byte`).
    fn in_single_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => false,
            Mode::Char | Mode::Byte => true,
        }
    }

    /// `true` for the double-quoted literal kinds (`Str` and `ByteStr`).
    pub(crate) fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// `true` for the byte-oriented literal kinds (`Byte` and `ByteStr`).
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
102
103fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
104 if first_char != '\\' {
105 return match first_char {
106 '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
107 '\r' => Err(if chars.clone().next() == Some('\n') {
108 EscapeError::EscapeOnlyChar
109 } else {
110 EscapeError::BareCarriageReturn
111 }),
112 '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
113 '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
114 _ => {
115 if mode.is_bytes() && !first_char.is_ascii() {
116 return Err(EscapeError::NonAsciiCharInByte);
117 }
118 Ok(first_char)
119 }
120 };
121 }
122
123 let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
124
125 let res = match second_char {
126 '"' => '"',
127 'n' => '\n',
128 'r' => '\r',
129 't' => '\t',
130 '\\' => '\\',
131 '\'' => '\'',
132 '0' => '\0',
133
134 'x' => {
135 let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
136 let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
137
138 let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
139 let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
140
141 let value = hi * 16 + lo;
142
143 if !mode.is_bytes() && !is_ascii(value) {
144 return Err(EscapeError::OutOfRangeHexEscape);
145 }
146 let value = value as u8;
147
148 value as char
149 }
150
151 'u' => {
152 if chars.next() != Some('{') {
153 return Err(EscapeError::NoBraceInUnicodeEscape);
154 }
155
156 let mut n_digits = 1;
157 let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
158 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
159 '}' => return Err(EscapeError::EmptyUnicodeEscape),
160 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
161 };
162
163 loop {
164 match chars.next() {
165 None => return Err(EscapeError::UnclosedUnicodeEscape),
166 Some('_') => continue,
167 Some('}') => {
168 if n_digits > 6 {
169 return Err(EscapeError::OverlongUnicodeEscape);
170 }
171 if mode.is_bytes() {
172 return Err(EscapeError::UnicodeEscapeInByte);
173 }
174
175 break std::char::from_u32(value).ok_or_else(|| {
176 if value > 0x10FFFF {
177 EscapeError::OutOfRangeUnicodeEscape
178 } else {
179 EscapeError::LoneSurrogateUnicodeEscape
180 }
181 })?;
182 }
183 Some(c) => {
184 let digit =
185 c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
186 n_digits += 1;
187 if n_digits > 6 {
188 continue;
189 }
190 let digit = digit as u32;
191 value = value * 16 + digit;
192 }
193 };
194 }
195 }
196 _ => return Err(EscapeError::InvalidEscape),
197 };
198 Ok(res)
199}
200
201fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
202 let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
203 let res = scan_escape(first_char, chars, mode)?;
204 if chars.next().is_some() {
205 return Err(EscapeError::MoreThanOneChar);
206 }
207 Ok(res)
208}
209
210/// Takes a contents of a string literal (without quotes) and produces a
211/// sequence of escaped characters or errors.
212fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
213where
214 F: FnMut(Range<usize>, Result<char, EscapeError>),
215{
216 assert!(mode.in_double_quotes());
217 let initial_len = src.len();
218 let mut chars = src.chars();
219 while let Some(first_char) = chars.next() {
220 let start = initial_len - chars.as_str().len() - first_char.len_utf8();
221
222 let unescaped_char = match first_char {
223 '\\' => {
224 let (second_char, third_char) = {
225 let mut chars = chars.clone();
226 (chars.next(), chars.next())
227 };
228 match (second_char, third_char) {
229 (Some('\n'), _) | (Some('\r'), Some('\n')) => {
230 skip_ascii_whitespace(&mut chars);
231 continue;
232 }
233 _ => scan_escape(first_char, &mut chars, mode),
234 }
235 }
236 '\r' => {
237 let second_char = chars.clone().next();
238 if second_char == Some('\n') {
239 chars.next();
240 Ok('\n')
241 } else {
242 scan_escape(first_char, &mut chars, mode)
243 }
244 }
245 '\n' => Ok('\n'),
246 '\t' => Ok('\t'),
247 _ => scan_escape(first_char, &mut chars, mode),
248 };
249 let end = initial_len - chars.as_str().len();
250 callback(start..end, unescaped_char);
251 }
252
253 fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
254 let str = chars.as_str();
255 let first_non_space = str
256 .bytes()
257 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
258 .unwrap_or(str.len());
259 *chars = str[first_non_space..].chars()
260 }
261}
262
/// Converts a char produced in one of the byte modes into the byte it stands for.
///
/// Panics if the char's code point does not fit into a `u8`.
fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    assert!(code_point <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
    code_point as u8
}
268
/// Returns `true` if `x` lies in the ASCII range `0..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}
272
273#[cfg(test)]
274mod tests {
275 use super::*;
276
277 #[test]
278 fn test_unescape_char_bad() {
279 fn check(literal_text: &str, expected_error: EscapeError) {
280 let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
281 assert_eq!(actual_result, Err(expected_error));
282 }
283
284 check("", EscapeError::ZeroChars);
285 check(r"\", EscapeError::LoneSlash);
286
287 check("\n", EscapeError::EscapeOnlyChar);
288 check("\r\n", EscapeError::EscapeOnlyChar);
289 check("\t", EscapeError::EscapeOnlyChar);
290 check("'", EscapeError::EscapeOnlyChar);
291 check("\r", EscapeError::BareCarriageReturn);
292
293 check("spam", EscapeError::MoreThanOneChar);
294 check(r"\x0ff", EscapeError::MoreThanOneChar);
295 check(r#"\"a"#, EscapeError::MoreThanOneChar);
296 check(r"\na", EscapeError::MoreThanOneChar);
297 check(r"\ra", EscapeError::MoreThanOneChar);
298 check(r"\ta", EscapeError::MoreThanOneChar);
299 check(r"\\a", EscapeError::MoreThanOneChar);
300 check(r"\'a", EscapeError::MoreThanOneChar);
301 check(r"\0a", EscapeError::MoreThanOneChar);
302 check(r"\u{0}x", EscapeError::MoreThanOneChar);
303 check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
304
305 check(r"\v", EscapeError::InvalidEscape);
306 check(r"\💩", EscapeError::InvalidEscape);
307 check(r"\●", EscapeError::InvalidEscape);
308
309 check(r"\x", EscapeError::TooShortHexEscape);
310 check(r"\x0", EscapeError::TooShortHexEscape);
311 check(r"\xf", EscapeError::TooShortHexEscape);
312 check(r"\xa", EscapeError::TooShortHexEscape);
313 check(r"\xx", EscapeError::InvalidCharInHexEscape);
314 check(r"\xы", EscapeError::InvalidCharInHexEscape);
315 check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
316 check(r"\xtt", EscapeError::InvalidCharInHexEscape);
317 check(r"\xff", EscapeError::OutOfRangeHexEscape);
318 check(r"\xFF", EscapeError::OutOfRangeHexEscape);
319 check(r"\x80", EscapeError::OutOfRangeHexEscape);
320
321 check(r"\u", EscapeError::NoBraceInUnicodeEscape);
322 check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
323 check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
324 check(r"\u{", EscapeError::UnclosedUnicodeEscape);
325 check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
326 check(r"\u{}", EscapeError::EmptyUnicodeEscape);
327 check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
328 check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
329 check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
330 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
331 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
332
333 check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
334 check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
335 check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
336
337 check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
338 check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
339 check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
340 }
341
/// Feeds well-formed char literal bodies to `unescape_char` and requires each
/// one to come back as `Ok` with the expected character.
#[test]
fn test_unescape_char_good() {
    // (literal body, expected character)
    let cases: &[(&str, char)] = &[
        // Unescaped characters: ASCII, Cyrillic, emoji.
        ("a", 'a'),
        ("ы", 'ы'),
        ("🦀", '🦀'),
        // Single-character escapes.
        (r#"\""#, '"'),
        (r"\n", '\n'),
        (r"\r", '\r'),
        (r"\t", '\t'),
        (r"\\", '\\'),
        (r"\'", '\''),
        (r"\0", '\0'),
        // Two-digit hex escapes, lower- and upper-case digits.
        (r"\x00", '\0'),
        (r"\x5a", 'Z'),
        (r"\x5A", 'Z'),
        (r"\x7f", '\x7f'),
        // Braced unicode escapes, including underscore separators.
        (r"\u{0}", '\0'),
        (r"\u{000000}", '\0'),
        (r"\u{41}", 'A'),
        (r"\u{0041}", 'A'),
        (r"\u{00_41}", 'A'),
        (r"\u{4__1__}", 'A'),
        (r"\u{1F63b}", '😻'),
    ];

    for &(literal_text, expected_char) in cases.iter() {
        assert_eq!(unescape_char(literal_text), Ok(expected_char));
    }
}
374
/// Runs `unescape_str` over string literal bodies, collects the characters it
/// reports through its callback, and compares the result with the expected text.
#[test]
fn test_unescape_str_good() {
    fn check(literal_text: &str, expected: &str) {
        // Stays `Ok` while characters are being collected; flips to `Err` on
        // the first reported error, after which further callbacks are ignored.
        let mut unescaped = Ok(String::with_capacity(literal_text.len()));
        unescape_str(literal_text, &mut |range, item| {
            if let Ok(text) = &mut unescaped {
                match item {
                    Ok(ch) => text.push(ch),
                    Err(err) => unescaped = Err((range, err)),
                }
            }
        });
        assert_eq!(unescaped.as_ref().map(String::as_str), Ok(expected))
    }

    // (literal body, expected unescaped text)
    let cases: &[(&str, &str)] = &[
        ("foo", "foo"),
        ("", ""),
        // "\r\n" is expected to come out as a single "\n".
        (" \t\n\r\n", " \t\n\n"),
        // A backslash before a line break is expected to swallow the break
        // and the whitespace that follows it.
        ("hello \\\n world", "hello world"),
        ("hello \\\r\n world", "hello world"),
        // A bare single quote is fine inside a string.
        ("thread's", "thread's"),
    ];

    for &(literal_text, expected) in cases.iter() {
        check(literal_text, expected);
    }
}
399
/// Checks that `unescape_byte` rejects malformed byte literal bodies with the
/// expected `EscapeError`.
///
/// Only the error kind is compared; the first element of the error tuple
/// (bound as `_offset` below) is discarded.
#[test]
fn test_unescape_byte_bad() {
    fn check(literal_text: &str, expected_error: EscapeError) {
        let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
        // Every case funnels through this single assertion, so include the
        // input in the failure message to make the failing case identifiable.
        assert_eq!(
            actual_result,
            Err(expected_error),
            "unexpected result when unescaping byte literal body {:?}",
            literal_text
        );
    }

    // Empty body and a dangling backslash.
    check("", EscapeError::ZeroChars);
    check(r"\", EscapeError::LoneSlash);

    // Characters that must be written as an escape.
    check("\n", EscapeError::EscapeOnlyChar);
    check("\r\n", EscapeError::EscapeOnlyChar);
    check("\t", EscapeError::EscapeOnlyChar);
    check("'", EscapeError::EscapeOnlyChar);
    check("\r", EscapeError::BareCarriageReturn);

    // More than one character or escape in the body.
    check("spam", EscapeError::MoreThanOneChar);
    check(r"\x0ff", EscapeError::MoreThanOneChar);
    check(r#"\"a"#, EscapeError::MoreThanOneChar);
    check(r"\na", EscapeError::MoreThanOneChar);
    check(r"\ra", EscapeError::MoreThanOneChar);
    check(r"\ta", EscapeError::MoreThanOneChar);
    check(r"\\a", EscapeError::MoreThanOneChar);
    check(r"\'a", EscapeError::MoreThanOneChar);
    check(r"\0a", EscapeError::MoreThanOneChar);

    // Unknown escape characters.
    check(r"\v", EscapeError::InvalidEscape);
    check(r"\💩", EscapeError::InvalidEscape);
    check(r"\●", EscapeError::InvalidEscape);

    // Malformed hex escapes.
    check(r"\x", EscapeError::TooShortHexEscape);
    check(r"\x0", EscapeError::TooShortHexEscape);
    check(r"\xa", EscapeError::TooShortHexEscape);
    check(r"\xf", EscapeError::TooShortHexEscape);
    check(r"\xx", EscapeError::InvalidCharInHexEscape);
    check(r"\xы", EscapeError::InvalidCharInHexEscape);
    check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
    check(r"\xtt", EscapeError::InvalidCharInHexEscape);

    // Syntactically malformed unicode escapes report their own error kind.
    check(r"\u", EscapeError::NoBraceInUnicodeEscape);
    check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
    check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
    check(r"\u{", EscapeError::UnclosedUnicodeEscape);
    check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
    check(r"\u{}", EscapeError::EmptyUnicodeEscape);
    check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
    check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);

    // Unescaped non-ASCII characters.
    check("ы", EscapeError::NonAsciiCharInByte);
    check("🦀", EscapeError::NonAsciiCharInByte);

    // Well-delimited unicode escapes are rejected as such, whatever value they
    // carry (valid scalar, too large, or surrogate) and whatever follows them.
    check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
    check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
    check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
}
470
/// Feeds well-formed byte literal bodies to `unescape_byte` and requires each
/// one to come back as `Ok` with the expected byte value.
#[test]
fn test_unescape_byte_good() {
    // (literal body, expected byte)
    let cases: &[(&str, u8)] = &[
        // Unescaped ASCII character.
        ("a", b'a'),
        // Single-character escapes.
        (r#"\""#, b'"'),
        (r"\n", b'\n'),
        (r"\r", b'\r'),
        (r"\t", b'\t'),
        (r"\\", b'\\'),
        (r"\'", b'\''),
        (r"\0", b'\0'),
        // Hex escapes; values above 0x7f are accepted here.
        (r"\x00", b'\0'),
        (r"\x5a", b'Z'),
        (r"\x5A", b'Z'),
        (r"\x7f", 127),
        (r"\x80", 128),
        (r"\xff", 255),
        (r"\xFF", 255),
    ];

    for &(literal_text, expected_byte) in cases.iter() {
        assert_eq!(unescape_byte(literal_text), Ok(expected_byte));
    }
}
496
/// Runs `unescape_byte_str` over byte string literal bodies, collects the bytes
/// it reports through its callback, and compares the result with the expected bytes.
#[test]
fn test_unescape_byte_str_good() {
    fn check(literal_text: &str, expected: &[u8]) {
        // Stays `Ok` while bytes are being collected; flips to `Err` on the
        // first reported error, after which further callbacks are ignored.
        let mut unescaped = Ok(Vec::with_capacity(literal_text.len()));
        unescape_byte_str(literal_text, &mut |range, item| {
            if let Ok(bytes) = &mut unescaped {
                match item {
                    Ok(byte) => bytes.push(byte),
                    Err(err) => unescaped = Err((range, err)),
                }
            }
        });
        assert_eq!(unescaped.as_ref().map(Vec::as_slice), Ok(expected))
    }

    // (literal body, expected unescaped bytes)
    let cases: &[(&str, &[u8])] = &[
        ("foo", b"foo"),
        ("", b""),
        // "\r\n" is expected to come out as a single b'\n'.
        (" \t\n\r\n", b" \t\n\n"),
        // A backslash before a line break is expected to swallow the break
        // and the whitespace that follows it.
        ("hello \\\n world", b"hello world"),
        ("hello \\\r\n world", b"hello world"),
        // A bare single quote is fine inside a byte string.
        ("thread's", b"thread's"),
    ];

    for &(literal_text, expected) in cases.iter() {
        check(literal_text, expected);
    }
}
521}