Validate string literals

author: Adolfo Ochagavía <[email protected]> 2018-11-08 14:42:00 +0000
committer: Adolfo Ochagavía <[email protected]> 2018-11-09 13:52:17 +0000
commit: 3b4c02c19e4af645fd37e8bff774b05d546dc0b6 (patch)
tree: 42c40e9201adf64d1c06bc1c69524f5688ee6e9f /crates/ra_syntax/src/validation/char.rs
parent: 5a9150df9bcdaf5faed5b500c22333f1f7c99f32 (diff)
1 files changed, 270 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs
new file mode 100644
index 000000000..63f9bad24
--- /dev/null
+++ b/crates/ra_syntax/src/validation/char.rs
@@ -0,0 +1,270 @@
+use std::u32;
+use arrayvec::ArrayString;
+use crate::{
+    ast::{self, AstNode},
+    string_lexing::{self, CharComponentKind},
+    TextRange,
+    yellow::{
+        SyntaxError,
+        SyntaxErrorKind::*,
+    },
+};
+/// Validates a single char literal node, appending every problem found to `errors`.
+///
+/// Each component yielded by `string_lexing::parse_char_literal` is checked with
+/// `validate_char_component`; afterwards the literal as a whole is checked for a
+/// missing closing quote and for containing zero or more than one component.
+pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) {
+    let source = node.text();
+    let node_range = node.syntax().range();
+    let mut parser = string_lexing::parse_char_literal(source);
+    let mut component_count = 0;
+    // Iterate by mutable reference: the parser is inspected again after the loop.
+    for piece in &mut parser {
+        component_count += 1;
+        validate_char_component(
+            &source[piece.range],
+            piece.kind,
+            // Component ranges index into the literal's text; shift them by the
+            // literal's start before attaching them to errors.
+            piece.range + node_range.start(),
+            errors,
+        );
+    }
+    if !parser.has_closing_quote {
+        errors.push(SyntaxError::new(UnclosedChar, node_range));
+    }
+    // A char literal must consist of exactly one component.
+    match component_count {
+        0 => errors.push(SyntaxError::new(EmptyChar, node_range)),
+        1 => {}
+        _ => errors.push(SyntaxError::new(OverlongChar, node_range)),
+    }
+}
+/// Validates one component of a char literal and appends any problems to `errors`.
+///
+/// * `text` - source text of the component (for escapes, including the leading `\`).
+/// * `kind` - kind of the component, selecting which checks are applied.
+/// * `range` - range attached to every error reported for this component.
+/// * `errors` - sink receiving the detected `SyntaxError`s.
+///
+/// # Panics
+///
+/// Panics if an `AsciiCodeEscape` is longer than 4 chars, if a `UnicodeEscape`
+/// does not start with `\u`, or if characters follow the closing `}` of a
+/// `UnicodeEscape`; the asserts treat these as invariants of the caller.
+pub(crate) fn validate_char_component(
+    text: &str,
+    kind: CharComponentKind,
+    range: TextRange,
+    errors: &mut Vec<SyntaxError>,
+) {
+    // Validate escapes
+    use self::CharComponentKind::*;
+    match kind {
+        AsciiEscape => match text.chars().nth(1) {
+            // Escape sequence consists only of leading `\`
+            None => errors.push(SyntaxError::new(EmptyAsciiEscape, range)),
+            Some(escape_code) if !is_ascii_escape(escape_code) => {
+                errors.push(SyntaxError::new(InvalidAsciiEscape, range));
+            }
+            Some(_) => { /* Escape code is valid */ }
+        },
+        AsciiCodeEscape => {
+            // An AsciiCodeEscape has 4 chars, example: `\xDD`. Count chars rather
+            // than bytes: `\xé` is 4 bytes but only 3 chars and must be reported
+            // as too short instead of tripping the assertion below.
+            let char_count = text.chars().count();
+            if char_count < 4 {
+                errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
+            } else {
+                assert!(
+                    char_count == 4,
+                    "AsciiCodeEscape cannot be longer than 4 chars"
+                );
+                let digits = &text[2..];
+                // Check the digits explicitly: `from_str_radix` also accepts a
+                // leading `+`, which would otherwise let `\x+7` through.
+                if !digits.chars().all(|c| c.is_digit(16)) {
+                    errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range));
+                } else {
+                    match u8::from_str_radix(digits, 16) {
+                        Ok(code) if code < 128 => { /* Escape code is valid */ }
+                        Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
+                        Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
+                    }
+                }
+            }
+        }
+        UnicodeEscape => {
+            assert!(text.starts_with("\\u"), "UnicodeEscape always starts with \\u");
+            if text.len() == 2 {
+                // No starting `{`
+                errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
+                return;
+            }
+            if text.len() == 3 {
+                // Only starting `{`
+                errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
+                return;
+            }
+            // Collects the hex digits only; `_` separators are dropped.
+            let mut code = ArrayString::<[_; 6]>::new();
+            let mut closed = false;
+            // Skip the first three chars (`\u` plus, presumably, `{`) char-wise
+            // so that no byte-offset slice can land inside a multi-byte char.
+            for c in text.chars().skip(3) {
+                assert!(!closed, "no characters after escape is closed");
+                if c.is_digit(16) {
+                    if code.len() == 6 {
+                        errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
+                        return;
+                    }
+                    code.push(c);
+                } else if c == '_' {
+                    // Reject leading _
+                    if code.is_empty() {
+                        errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
+                        return;
+                    }
+                } else if c == '}' {
+                    closed = true;
+                } else {
+                    errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
+                    return;
+                }
+            }
+            // An unclosed escape is reported, but its digits are still validated.
+            if !closed {
+                errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
+            }
+            if code.is_empty() {
+                errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
+                return;
+            }
+            match u32::from_str_radix(&code, 16) {
+                // `char::from_u32` rejects values above 0x10FFFF as well as the
+                // surrogate range 0xD800..=0xDFFF; neither is a valid `char`.
+                Ok(code_u32) if ::std::char::from_u32(code_u32).is_none() => {
+                    errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
+                }
+                Ok(_) => {
+                    // Valid escape code
+                }
+                Err(_) => {
+                    errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
+                }
+            }
+        }
+        CodePoint => {
+            // These code points must always be escaped
+            if text == "\t" || text == "\r" {
+                errors.push(SyntaxError::new(UnescapedCodepoint, range));
+            }
+        }
+    }
+}
+/// Returns `true` when `code` is one of the characters accepted after the
+/// backslash of a simple escape: `\`, `'`, `"`, `n`, `r`, `t` or `0`.
+fn is_ascii_escape(code: char) -> bool {
+    const ESCAPE_CODES: [char; 7] = ['\\', '\'', '"', 'n', 'r', 't', '0'];
+    ESCAPE_CODES.contains(&code)
+}
+#[cfg(test)]
+mod test {
+    use crate::SourceFileNode;
+    /// Parses a small source file whose only item is a `char` constant
+    /// initialised with `'<literal>'`.
+    fn build_file(literal: &str) -> SourceFileNode {
+        let src = format!("const C: char = '{}';", literal);
+        SourceFileNode::parse(&src)
+    }
+    /// Asserts that parsing `'<literal>'` produces no errors.
+    fn assert_valid_char(literal: &str) {
+        let file = build_file(literal);
+        assert!(
+            file.errors().is_empty(),
+            "Errors for literal '{}': {:?}",
+            literal,
+            file.errors()
+        );
+    }
+    /// Asserts that parsing `'<literal>'` produces at least one error.
+    fn assert_invalid_char(literal: &str) {
+        let file = build_file(literal);
+        // Name the offending literal: the callers loop over many inputs.
+        assert!(
+            !file.errors().is_empty(),
+            "Expected errors for literal '{}', but none were reported",
+            literal
+        );
+    }
+    #[test]
+    fn test_ansi_codepoints() {
+        for byte in 0..=255u8 {
+            match byte {
+                b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
+                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
+                _ => assert_valid_char(&(byte as char).to_string()),
+            }
+        }
+    }
+    #[test]
+    fn test_unicode_codepoints() {
+        let valid = ["Ƒ", "バ", "メ", "﷽"];
+        for c in &valid {
+            assert_valid_char(c);
+        }
+    }
+    #[test]
+    fn test_unicode_multiple_codepoints() {
+        let invalid = ["नी", "👨‍👨‍"];
+        for c in &invalid {
+            assert_invalid_char(c);
+        }
+    }
+    #[test]
+    fn test_valid_ascii_escape() {
+        let valid = [
+            r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
+        ];
+        for c in &valid {
+            assert_valid_char(c);
+        }
+    }
+    #[test]
+    fn test_invalid_ascii_escape() {
+        let invalid = [r"\a", r"\?", r"\"];
+        for c in &invalid {
+            assert_invalid_char(c);
+        }
+    }
+    #[test]
+    fn test_valid_ascii_code_escape() {
+        let valid = [r"\x00", r"\x7F", r"\x55"];
+        for c in &valid {
+            assert_valid_char(c);
+        }
+    }
+    #[test]
+    fn test_invalid_ascii_code_escape() {
+        let invalid = [r"\x", r"\x7", r"\xF0"];
+        for c in &invalid {
+            assert_invalid_char(c);
+        }
+    }
+    #[test]
+    fn test_valid_unicode_escape() {
+        let valid = [
+            r"\u{FF}",
+            r"\u{0}",
+            r"\u{F}",
+            r"\u{10FFFF}",
+            r"\u{1_0__FF___FF_____}",
+        ];
+        for c in &valid {
+            assert_valid_char(c);
+        }
+    }
+    #[test]
+    fn test_invalid_unicode_escape() {
+        let invalid = [
+            r"\u",
+            r"\u{}",
+            r"\u{",
+            r"\u{FF",
+            r"\u{FFFFFF}",
+            r"\u{_F}",
+            r"\u{00FFFFF}",
+            r"\u{110000}",
+        ];
+        for c in &invalid {
+            assert_invalid_char(c);
+        }
+    }
+}
author	Adolfo Ochagavía <[email protected]>	2018-11-08 14:42:00 +0000
committer	Adolfo Ochagavía <[email protected]>	2018-11-09 13:52:17 +0000
commit	3b4c02c19e4af645fd37e8bff774b05d546dc0b6 (patch)
tree	42c40e9201adf64d1c06bc1c69524f5688ee6e9f /crates/ra_syntax/src/validation/char.rs
parent	5a9150df9bcdaf5faed5b500c22333f1f7c99f32 (diff)

diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs new file mode 100644 index 000000000..63f9bad24 --- /dev/null +++ b/crates/ra_syntax/src/validation/char.rs
@@ -0,0 +1,270 @@
	1	use std::u32;
	2
	3	use arrayvec::ArrayString;
	4
	5	use crate::{
	6	ast::{self, AstNode},
	7	string_lexing::{self, CharComponentKind},
	8	TextRange,
	9	yellow::{
	10	SyntaxError,
	11	SyntaxErrorKind::*,
	12	},
	13	};
	14
	15	pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) {
	16	let literal_text = node.text();
	17	let literal_range = node.syntax().range();
	18	let mut components = string_lexing::parse_char_literal(literal_text);
	19	let mut len = 0;
	20	for component in &mut components {
	21	len += 1;
	22	let text = &literal_text[component.range];
	23	let range = component.range + literal_range.start();
	24	validate_char_component(text, component.kind, range, errors);
	25	}
	26
	27	if !components.has_closing_quote {
	28	errors.push(SyntaxError::new(UnclosedChar, literal_range));
	29	}
	30
	31	if len == 0 {
	32	errors.push(SyntaxError::new(EmptyChar, literal_range));
	33	}
	34
	35	if len > 1 {
	36	errors.push(SyntaxError::new(OverlongChar, literal_range));
	37	}
	38	}
	39
	40	pub(crate) fn validate_char_component(
	41	text: &str,
	42	kind: CharComponentKind,
	43	range: TextRange,
	44	errors: &mut Vec<SyntaxError>,
	45	) {
	46	// Validate escapes
	47	use self::CharComponentKind::*;
	48	match kind {
	49	AsciiEscape => {
	50	if text.len() == 1 {
	51	// Escape sequence consists only of leading `\`
	52	errors.push(SyntaxError::new(EmptyAsciiEscape, range));
	53	} else {
	54	let escape_code = text.chars().skip(1).next().unwrap();
	55	if !is_ascii_escape(escape_code) {
	56	errors.push(SyntaxError::new(InvalidAsciiEscape, range));
	57	}
	58	}
	59	}
	60	AsciiCodeEscape => {
	61	// An AsciiCodeEscape has 4 chars, example: `\xDD`
	62	if text.len() < 4 {
	63	errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
	64	} else {
	65	assert!(
	66	text.chars().count() == 4,
	67	"AsciiCodeEscape cannot be longer than 4 chars"
	68	);
	69
	70	match u8::from_str_radix(&text[2..], 16) {
	71	Ok(code) if code < 128 => { /* Escape code is valid */ }
	72	Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
	73	Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
	74	}
	75	}
	76	}
	77	UnicodeEscape => {
	78	assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
	79
	80	if text.len() == 2 {
	81	// No starting `{`
	82	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
	83	return;
	84	}
	85
	86	if text.len() == 3 {
	87	// Only starting `{`
	88	errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
	89	return;
	90	}
	91
	92	let mut code = ArrayString::<[_; 6]>::new();
	93	let mut closed = false;
	94	for c in text[3..].chars() {
	95	assert!(!closed, "no characters after escape is closed");
	96
	97	if c.is_digit(16) {
	98	if code.len() == 6 {
	99	errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
	100	return;
	101	}
	102
	103	code.push(c);
	104	} else if c == '_' {
	105	// Reject leading _
	106	if code.len() == 0 {
	107	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
	108	return;
	109	}
	110	} else if c == '}' {
	111	closed = true;
	112	} else {
	113	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
	114	return;
	115	}
	116	}
	117
	118	if !closed {
	119	errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
	120	}
	121
	122	if code.len() == 0 {
	123	errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
	124	return;
	125	}
	126
	127	match u32::from_str_radix(&code, 16) {
	128	Ok(code_u32) if code_u32 > 0x10FFFF => {
	129	errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
	130	}
	131	Ok(_) => {
	132	// Valid escape code
	133	}
	134	Err(_) => {
	135	errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
	136	}
	137	}
	138	}
	139	CodePoint => {
	140	// These code points must always be escaped
	141	if text == "\t" || text == "\r" {
	142	errors.push(SyntaxError::new(UnescapedCodepoint, range));
	143	}
	144	}
	145	}
	146	}
	147
	148	fn is_ascii_escape(code: char) -> bool {
	149	match code {
	150	'\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
	151	_ => false,
	152	}
	153	}
	154
	155	#[cfg(test)]
	156	mod test {
	157	use crate::SourceFileNode;
	158
	159	fn build_file(literal: &str) -> SourceFileNode {
	160	let src = format!("const C: char = '{}';", literal);
	161	SourceFileNode::parse(&src)
	162	}
	163
	164	fn assert_valid_char(literal: &str) {
	165	let file = build_file(literal);
	166	assert!(
	167	file.errors().len() == 0,
	168	"Errors for literal '{}': {:?}",
	169	literal,
	170	file.errors()
	171	);
	172	}
	173
	174	fn assert_invalid_char(literal: &str) {
	175	let file = build_file(literal);
	176	assert!(file.errors().len() > 0);
	177	}
	178
	179	#[test]
	180	fn test_ansi_codepoints() {
	181	for byte in 0..=255u8 {
	182	match byte {
	183	b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
	184	b'\'' | b'\\' => { /* Ignore character close and backslash */ }
	185	_ => assert_valid_char(&(byte as char).to_string()),
	186	}
	187	}
	188	}
	189
	190	#[test]
	191	fn test_unicode_codepoints() {
	192	let valid = ["Ƒ", "バ", "メ", "﷽"];
	193	for c in &valid {
	194	assert_valid_char(c);
	195	}
	196	}
	197
	198	#[test]
	199	fn test_unicode_multiple_codepoints() {
	200	let invalid = ["नी", "👨‍👨‍"];
	201	for c in &invalid {
	202	assert_invalid_char(c);
	203	}
	204	}
	205
	206	#[test]
	207	fn test_valid_ascii_escape() {
	208	let valid = [
	209	r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
	210	];
	211	for c in &valid {
	212	assert_valid_char(c);
	213	}
	214	}
	215
	216	#[test]
	217	fn test_invalid_ascii_escape() {
	218	let invalid = [r"\a", r"\?", r"\"];
	219	for c in &invalid {
	220	assert_invalid_char(c);
	221	}
	222	}
	223
	224	#[test]
	225	fn test_valid_ascii_code_escape() {
	226	let valid = [r"\x00", r"\x7F", r"\x55"];
	227	for c in &valid {
	228	assert_valid_char(c);
	229	}
	230	}
	231
	232	#[test]
	233	fn test_invalid_ascii_code_escape() {
	234	let invalid = [r"\x", r"\x7", r"\xF0"];
	235	for c in &invalid {
	236	assert_invalid_char(c);
	237	}
	238	}
	239
	240	#[test]
	241	fn test_valid_unicode_escape() {
	242	let valid = [
	243	r"\u{FF}",
	244	r"\u{0}",
	245	r"\u{F}",
	246	r"\u{10FFFF}",
	247	r"\u{1_0__FF___FF_____}",
	248	];
	249	for c in &valid {
	250	assert_valid_char(c);
	251	}
	252	}
	253
	254	#[test]
	255	fn test_invalid_unicode_escape() {
	256	let invalid = [
	257	r"\u",
	258	r"\u{}",
	259	r"\u{",
	260	r"\u{FF",
	261	r"\u{FFFFFF}",
	262	r"\u{_F}",
	263	r"\u{00FFFFF}",
	264	r"\u{110000}",
	265	];
	266	for c in &invalid {
	267	assert_invalid_char(c);
	268	}
	269	}
	270	}