Merge #1253

1253: Share literal validation logic with compiler r=matklad a=matklad This is neat: the unescape module is literally what the compiler is using right now: https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs So, yeah, code sharing via copy-paste! Co-authored-by: Aleksey Kladov <[email protected]>
author: bors[bot] <bors[bot]@users.noreply.github.com> 2019-05-07 17:43:10 +0100
committer: bors[bot] <bors[bot]@users.noreply.github.com> 2019-05-07 17:43:10 +0100
commit: d3efedb752bb2198796603d8a479a5e3ee472a97 (patch)
tree: ca6a4aee6ad4077a869a932a18c6c8d134406f8c /crates/ra_syntax/src/validation/char.rs
parent: ef782adc293deb287128f005dbab2038ba3ccdc1 (diff)
parent: 313314e14b629ebf50389dbd2d440bda922f6ae7 (diff)
1 files changed, 0 insertions, 273 deletions
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs
deleted file mode 100644
index 0f1885873..000000000
--- a/crates/ra_syntax/src/validation/char.rs
+++ /dev/null
@@ -1,273 +0,0 @@
-//! Validation of char literals
-use std::u32;
-use arrayvec::ArrayString;
-use crate::{
-    string_lexing::{self, StringComponentKind},
-    TextRange,
-    SyntaxError,
-    SyntaxErrorKind::*,
-    SyntaxToken,
-};
-pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
-    let literal_text = node.text();
-    let literal_range = node.range();
-    let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text);
-    let mut len = 0;
-    for component in &mut components {
-        len += 1;
-        let text = &literal_text[component.range];
-        let range = component.range + literal_range.start();
-        validate_char_component(text, component.kind, range, errors);
-    }
-    if !components.has_closing_quote {
-        errors.push(SyntaxError::new(UnclosedChar, literal_range));
-    }
-    if let Some(range) = components.suffix {
-        errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
-    }
-    if len == 0 {
-        errors.push(SyntaxError::new(EmptyChar, literal_range));
-    }
-    if len > 1 {
-        errors.push(SyntaxError::new(OverlongChar, literal_range));
-    }
-}
-pub(super) fn validate_char_component(
-    text: &str,
-    kind: StringComponentKind,
-    range: TextRange,
-    errors: &mut Vec<SyntaxError>,
-) {
-    // Validate escapes
-    use self::StringComponentKind::*;
-    match kind {
-        AsciiEscape => validate_ascii_escape(text, range, errors),
-        AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
-        UnicodeEscape => validate_unicode_escape(text, range, errors),
-        CodePoint => {
-            // These code points must always be escaped
-            if text == "\t" || text == "\r" || text == "\n" {
-                errors.push(SyntaxError::new(UnescapedCodepoint, range));
-            }
-        }
-        StringComponentKind::IgnoreNewline => { /* always valid */ }
-    }
-}
-fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    if text.len() == 1 {
-        // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is treated as an unclosed char containing a single quote `'`)
-        errors.push(SyntaxError::new(EmptyAsciiEscape, range));
-    } else {
-        let escape_code = text.chars().skip(1).next().unwrap();
-        if !is_ascii_escape(escape_code) {
-            errors.push(SyntaxError::new(InvalidAsciiEscape, range));
-        }
-    }
-}
-pub(super) fn is_ascii_escape(code: char) -> bool {
-    match code {
-        '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
-        _ => false,
-    }
-}
-fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    // An AsciiCodeEscape has 4 chars, example: `\xDD`
-    if !text.is_ascii() {
-        // FIXME: Give a more precise error message (say what the invalid character was)
-        errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
-    } else if text.chars().count() < 4 {
-        errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
-    } else {
-        assert_eq!(
-            text.chars().count(),
-            4,
-            "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
-            text,
-        );
-        match u8::from_str_radix(&text[2..], 16) {
-            Ok(code) if code < 128 => { /* Escape code is valid */ }
-            Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
-            Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
-        }
-    }
-}
-fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
-    if text.len() == 2 {
-        // No starting `{`
-        errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-        return;
-    }
-    if text.len() == 3 {
-        // Only starting `{`
-        errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
-        return;
-    }
-    let mut code = ArrayString::<[_; 6]>::new();
-    let mut closed = false;
-    for c in text[3..].chars() {
-        assert!(!closed, "no characters after escape is closed");
-        if c.is_digit(16) {
-            if code.len() == 6 {
-                errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
-                return;
-            }
-            code.push(c);
-        } else if c == '_' {
-            // Reject leading _
-            if code.len() == 0 {
-                errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-                return;
-            }
-        } else if c == '}' {
-            closed = true;
-        } else {
-            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-            return;
-        }
-    }
-    if !closed {
-        errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
-    }
-    if code.len() == 0 {
-        errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
-        return;
-    }
-    match u32::from_str_radix(&code, 16) {
-        Ok(code_u32) if code_u32 > 0x10FFFF => {
-            errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
-        }
-        Ok(_) => {
-            // Valid escape code
-        }
-        Err(_) => {
-            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-        }
-    }
-}
-#[cfg(test)]
-mod test {
-    use crate::{SourceFile, TreeArc};
-    fn build_file(literal: &str) -> TreeArc<SourceFile> {
-        let src = format!("const C: char = '{}';", literal);
-        SourceFile::parse(&src)
-    }
-    fn assert_valid_char(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
-    }
-    fn assert_invalid_char(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() > 0);
-    }
-    #[test]
-    fn test_ansi_codepoints() {
-        for byte in 0..=255u8 {
-            match byte {
-                b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
-                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
-                _ => assert_valid_char(&(byte as char).to_string()),
-            }
-        }
-    }
-    #[test]
-    fn test_unicode_codepoints() {
-        let valid = ["Ƒ", "バ", "メ", "﷽"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_unicode_multiple_codepoints() {
-        let invalid = ["नी", "👨‍👨‍"];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_escape() {
-        let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_escape() {
-        let invalid = [r"\a", r"\?", r"\"];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_code_escape() {
-        let valid = [r"\x00", r"\x7F", r"\x55"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_code_escape() {
-        let invalid = [r"\x", r"\x7", r"\xF0"];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-    #[test]
-    fn test_valid_unicode_escape() {
-        let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_invalid_unicode_escape() {
-        let invalid = [
-            r"\u",
-            r"\u{}",
-            r"\u{",
-            r"\u{FF",
-            r"\u{FFFFFF}",
-            r"\u{_F}",
-            r"\u{00FFFFF}",
-            r"\u{110000}",
-        ];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-}
author	bors[bot] <bors[bot]@users.noreply.github.com>	2019-05-07 17:43:10 +0100
committer	bors[bot] <bors[bot]@users.noreply.github.com>	2019-05-07 17:43:10 +0100
commit	d3efedb752bb2198796603d8a479a5e3ee472a97 (patch)
tree	ca6a4aee6ad4077a869a932a18c6c8d134406f8c /crates/ra_syntax/src/validation/char.rs
parent	ef782adc293deb287128f005dbab2038ba3ccdc1 (diff)
parent	313314e14b629ebf50389dbd2d440bda922f6ae7 (diff)

diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs deleted file mode 100644 index 0f1885873..000000000 --- a/crates/ra_syntax/src/validation/char.rs +++ /dev/null
@@ -1,273 +0,0 @@
1	//! Validation of char literals
2
3	use std::u32;
4
5	use arrayvec::ArrayString;
6
7	use crate::{
8	string_lexing::{self, StringComponentKind},
9	TextRange,
10	SyntaxError,
11	SyntaxErrorKind::*,
12	SyntaxToken,
13	};
14
15	pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
16	let literal_text = node.text();
17	let literal_range = node.range();
18	let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text);
19	let mut len = 0;
20	for component in &mut components {
21	len += 1;
22	let text = &literal_text[component.range];
23	let range = component.range + literal_range.start();
24	validate_char_component(text, component.kind, range, errors);
25	}
26
27	if !components.has_closing_quote {
28	errors.push(SyntaxError::new(UnclosedChar, literal_range));
29	}
30
31	if let Some(range) = components.suffix {
32	errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
33	}
34
35	if len == 0 {
36	errors.push(SyntaxError::new(EmptyChar, literal_range));
37	}
38
39	if len > 1 {
40	errors.push(SyntaxError::new(OverlongChar, literal_range));
41	}
42	}
43
44	pub(super) fn validate_char_component(
45	text: &str,
46	kind: StringComponentKind,
47	range: TextRange,
48	errors: &mut Vec<SyntaxError>,
49	) {
50	// Validate escapes
51	use self::StringComponentKind::*;
52	match kind {
53	AsciiEscape => validate_ascii_escape(text, range, errors),
54	AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
55	UnicodeEscape => validate_unicode_escape(text, range, errors),
56	CodePoint => {
57	// These code points must always be escaped
58	if text == "\t" \|\| text == "\r" \|\| text == "\n" {
59	errors.push(SyntaxError::new(UnescapedCodepoint, range));
60	}
61	}
62	StringComponentKind::IgnoreNewline => { /* always valid */ }
63	}
64	}
65
66	fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
67	if text.len() == 1 {
68	// Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is treated as an unclosed char containing a single quote `'`)
69	errors.push(SyntaxError::new(EmptyAsciiEscape, range));
70	} else {
71	let escape_code = text.chars().skip(1).next().unwrap();
72	if !is_ascii_escape(escape_code) {
73	errors.push(SyntaxError::new(InvalidAsciiEscape, range));
74	}
75	}
76	}
77
78	pub(super) fn is_ascii_escape(code: char) -> bool {
79	match code {
80	'\\' \| '\'' \| '"' \| 'n' \| 'r' \| 't' \| '0' => true,
81	_ => false,
82	}
83	}
84
85	fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
86	// An AsciiCodeEscape has 4 chars, example: `\xDD`
87	if !text.is_ascii() {
88	// FIXME: Give a more precise error message (say what the invalid character was)
89	errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
90	} else if text.chars().count() < 4 {
91	errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
92	} else {
93	assert_eq!(
94	text.chars().count(),
95	4,
96	"AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
97	text,
98	);
99
100	match u8::from_str_radix(&text[2..], 16) {
101	Ok(code) if code < 128 => { /* Escape code is valid */ }
102	Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
103	Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
104	}
105	}
106	}
107
108	fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
109	assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
110
111	if text.len() == 2 {
112	// No starting `{`
113	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
114	return;
115	}
116
117	if text.len() == 3 {
118	// Only starting `{`
119	errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
120	return;
121	}
122
123	let mut code = ArrayString::<[_; 6]>::new();
124	let mut closed = false;
125	for c in text[3..].chars() {
126	assert!(!closed, "no characters after escape is closed");
127
128	if c.is_digit(16) {
129	if code.len() == 6 {
130	errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
131	return;
132	}
133
134	code.push(c);
135	} else if c == '_' {
136	// Reject leading _
137	if code.len() == 0 {
138	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
139	return;
140	}
141	} else if c == '}' {
142	closed = true;
143	} else {
144	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
145	return;
146	}
147	}
148
149	if !closed {
150	errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
151	}
152
153	if code.len() == 0 {
154	errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
155	return;
156	}
157
158	match u32::from_str_radix(&code, 16) {
159	Ok(code_u32) if code_u32 > 0x10FFFF => {
160	errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
161	}
162	Ok(_) => {
163	// Valid escape code
164	}
165	Err(_) => {
166	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
167	}
168	}
169	}
170
171	#[cfg(test)]
172	mod test {
173	use crate::{SourceFile, TreeArc};
174
175	fn build_file(literal: &str) -> TreeArc<SourceFile> {
176	let src = format!("const C: char = '{}';", literal);
177	SourceFile::parse(&src)
178	}
179
180	fn assert_valid_char(literal: &str) {
181	let file = build_file(literal);
182	assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
183	}
184
185	fn assert_invalid_char(literal: &str) {
186	let file = build_file(literal);
187	assert!(file.errors().len() > 0);
188	}
189
190	#[test]
191	fn test_ansi_codepoints() {
192	for byte in 0..=255u8 {
193	match byte {
194	b'\n' \| b'\r' \| b'\t' => assert_invalid_char(&(byte as char).to_string()),
195	b'\'' \| b'\\' => { /* Ignore character close and backslash */ }
196	_ => assert_valid_char(&(byte as char).to_string()),
197	}
198	}
199	}
200
201	#[test]
202	fn test_unicode_codepoints() {
203	let valid = ["Ƒ", "バ", "メ", "﷽"];
204	for c in &valid {
205	assert_valid_char(c);
206	}
207	}
208
209	#[test]
210	fn test_unicode_multiple_codepoints() {
211	let invalid = ["नी", "👨‍👨‍"];
212	for c in &invalid {
213	assert_invalid_char(c);
214	}
215	}
216
217	#[test]
218	fn test_valid_ascii_escape() {
219	let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
220	for c in &valid {
221	assert_valid_char(c);
222	}
223	}
224
225	#[test]
226	fn test_invalid_ascii_escape() {
227	let invalid = [r"\a", r"\?", r"\"];
228	for c in &invalid {
229	assert_invalid_char(c);
230	}
231	}
232
233	#[test]
234	fn test_valid_ascii_code_escape() {
235	let valid = [r"\x00", r"\x7F", r"\x55"];
236	for c in &valid {
237	assert_valid_char(c);
238	}
239	}
240
241	#[test]
242	fn test_invalid_ascii_code_escape() {
243	let invalid = [r"\x", r"\x7", r"\xF0"];
244	for c in &invalid {
245	assert_invalid_char(c);
246	}
247	}
248
249	#[test]
250	fn test_valid_unicode_escape() {
251	let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
252	for c in &valid {
253	assert_valid_char(c);
254	}
255	}
256
257	#[test]
258	fn test_invalid_unicode_escape() {
259	let invalid = [
260	r"\u",
261	r"\u{}",
262	r"\u{",
263	r"\u{FF",
264	r"\u{FFFFFF}",
265	r"\u{_F}",
266	r"\u{00FFFFF}",
267	r"\u{110000}",
268	];
269	for c in &invalid {
270	assert_invalid_char(c);
271	}
272	}
273	}