Add validator for byte

author: Adolfo Ochagavía <[email protected]> 2018-11-11 19:27:00 +0000
committer: Adolfo Ochagavía <[email protected]> 2018-11-11 19:27:00 +0000
commit: c258b4fdb0e421813330c2428985c4537c787582 (patch)
tree: e53263f28c0cd07911a1e9c9ef6538c8ff0227fd /crates/ra_syntax/src/validation/byte.rs
parent: a4f7d7a7cd85a5b9b64a935dd84ad493b6860236 (diff)
1 files changed, 202 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs
new file mode 100644
index 000000000..3d2806c4e
--- /dev/null
+++ b/crates/ra_syntax/src/validation/byte.rs
@@ -0,0 +1,202 @@
+//! Validation of byte literals
+use crate::{
+    ast::{self, AstNode},
+    string_lexing::{self, CharComponentKind},
+    TextRange,
+    validation::char,
+    yellow::{
+        SyntaxError,
+        SyntaxErrorKind::*,
+    },
+};
+/// Validates a single byte literal node (for example `b'a'`), appending every
+/// problem found to `errors`.
+///
+/// The literal text is split into components by
+/// `string_lexing::parse_byte_literal`. Each component is checked according to
+/// its kind; afterwards the literal as a whole is checked for a closing quote
+/// and for containing exactly one component.
+pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec<SyntaxError>) {
+    let literal_range = node.syntax().range();
+    let source = node.text();
+    let mut parser = string_lexing::parse_byte_literal(source);
+    let mut component_count = 0;
+    // Iterate through `&mut parser` so that `has_closing_quote` can still be
+    // read from it once the loop is done.
+    for piece in &mut parser {
+        use self::CharComponentKind::*;
+        component_count += 1;
+        // Component ranges are relative to the literal text, so they are
+        // offset by the start of the literal's own range before reporting.
+        let piece_range = piece.range + literal_range.start();
+        let piece_text = &source[piece.range];
+        match piece.kind {
+            AsciiEscape => validate_byte_escape(piece_text, piece_range, errors),
+            AsciiCodeEscape => validate_byte_code_escape(piece_text, piece_range, errors),
+            UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, piece_range)),
+            CodePoint => {
+                let ch = piece_text
+                    .chars()
+                    .next()
+                    .expect("Code points should be one character long");
+                match ch {
+                    // These bytes must always be escaped
+                    '\t' | '\r' | '\n' => {
+                        errors.push(SyntaxError::new(UnescapedByte, piece_range))
+                    }
+                    // Only ASCII bytes are allowed
+                    _ if !ch.is_ascii() => {
+                        errors.push(SyntaxError::new(ByteOutOfRange, piece_range))
+                    }
+                    _ => {}
+                }
+            }
+        }
+    }
+    if !parser.has_closing_quote {
+        errors.push(SyntaxError::new(UnclosedByte, literal_range));
+    }
+    // A byte literal must hold exactly one component.
+    match component_count {
+        0 => errors.push(SyntaxError::new(EmptyByte, literal_range)),
+        1 => {}
+        _ => errors.push(SyntaxError::new(OverlongByte, literal_range)),
+    }
+}
+/// Validates a simple backslash escape inside a byte literal (such as `\n`).
+///
+/// Pushes `EmptyByteEscape` when `text` is only the leading `\`, and
+/// `InvalidByteEscape` when the character after the backslash is rejected by
+/// `char::is_ascii_escape` (`char` here is the `validation::char` module
+/// imported at the top of the file).
+fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
+    // Escape sequence consists only of leading `\`
+    if text.len() == 1 {
+        errors.push(SyntaxError::new(EmptyByteEscape, range));
+        return;
+    }
+    let code = text.chars().nth(1).unwrap();
+    if char::is_ascii_escape(code) {
+        return;
+    }
+    errors.push(SyntaxError::new(InvalidByteEscape, range));
+}
+/// Validates a `\xDD` escape inside a byte literal.
+///
+/// Pushes `MalformedByteCodeEscape` when `text` contains non-ASCII characters
+/// or when the two characters after `\x` are not both hexadecimal digits, and
+/// `TooShortByteCodeEscape` when `text` is shorter than 4 characters.
+///
+/// # Panics
+///
+/// Panics if `text` is ASCII and longer than 4 characters; callers are
+/// expected never to hand over a longer escape.
+fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
+    // A ByteCodeEscape has 4 chars, example: `\xDD`
+    if !text.is_ascii() {
+        // For non-ASCII text the byte length and the character count differ,
+        // so the length checks below would be unreliable and the assertion
+        // could fire on ordinary user input. Such an escape is never valid.
+        errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
+    } else if text.len() < 4 {
+        errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
+    } else {
+        assert!(
+            text.len() == 4,
+            "ByteCodeEscape cannot be longer than 4 chars"
+        );
+        // Check the digits explicitly: `u8::from_str_radix` also accepts a
+        // leading `+`, which would let an escape like `\x+F` through.
+        let digits_ok = text[2..].bytes().all(|b| b.is_ascii_hexdigit());
+        if !digits_ok {
+            errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
+        }
+    }
+}
+#[cfg(test)]
+mod test {
+    use crate::SourceFileNode;
+    /// Parses a one-item source file whose constant is initialised with
+    /// `b'<literal>'`.
+    fn build_file(literal: &str) -> SourceFileNode {
+        SourceFileNode::parse(&format!("const C: u8 = b'{}';", literal))
+    }
+    /// Asserts that parsing `b'<literal>'` reports no errors at all.
+    fn assert_valid_byte(literal: &str) {
+        let file = build_file(literal);
+        assert!(
+            file.errors().len() == 0,
+            "Errors for literal '{}': {:?}",
+            literal,
+            file.errors()
+        );
+    }
+    /// Asserts that parsing `b'<literal>'` reports at least one error.
+    fn assert_invalid_byte(literal: &str) {
+        let file = build_file(literal);
+        assert!(file.errors().len() > 0);
+    }
+    #[test]
+    fn test_ansi_codepoints() {
+        // Every ASCII code point is tried as an unescaped literal.
+        for code in 0..128u8 {
+            let literal = (code as char).to_string();
+            match code {
+                b'\n' | b'\r' | b'\t' => assert_invalid_byte(&literal),
+                // Ignore character close and backslash
+                b'\'' | b'\\' => {}
+                _ => assert_valid_byte(&literal),
+            }
+        }
+        // Code points above the ASCII range must be rejected.
+        (128..=255u8)
+            .map(|code| (code as char).to_string())
+            .for_each(|literal| assert_invalid_byte(&literal));
+    }
+    #[test]
+    fn test_unicode_codepoints() {
+        for literal in ["Ƒ", "バ", "メ", "﷽"].iter() {
+            assert_invalid_byte(literal);
+        }
+    }
+    #[test]
+    fn test_unicode_multiple_codepoints() {
+        for literal in ["नी", "👨‍👨‍"].iter() {
+            assert_invalid_byte(literal);
+        }
+    }
+    #[test]
+    fn test_valid_byte_escape() {
+        let accepted = [
+            r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
+        ];
+        accepted.iter().for_each(|literal| assert_valid_byte(literal));
+    }
+    #[test]
+    fn test_invalid_byte_escape() {
+        for literal in [r"\a", r"\?", r"\"].iter() {
+            assert_invalid_byte(literal);
+        }
+    }
+    #[test]
+    fn test_valid_byte_code_escape() {
+        for literal in [r"\x00", r"\x7F", r"\x55", r"\xF0"].iter() {
+            assert_valid_byte(literal);
+        }
+    }
+    #[test]
+    fn test_invalid_byte_code_escape() {
+        for literal in [r"\x", r"\x7"].iter() {
+            assert_invalid_byte(literal);
+        }
+    }
+    #[test]
+    fn test_invalid_unicode_escape() {
+        // Unicode escapes are rejected in byte literals whether or not the
+        // escape itself is well formed.
+        let well_formed = [
+            r"\u{FF}",
+            r"\u{0}",
+            r"\u{F}",
+            r"\u{10FFFF}",
+            r"\u{1_0__FF___FF_____}",
+        ];
+        let malformed = [
+            r"\u",
+            r"\u{}",
+            r"\u{",
+            r"\u{FF",
+            r"\u{FFFFFF}",
+            r"\u{_F}",
+            r"\u{00FFFFF}",
+            r"\u{110000}",
+        ];
+        for literal in well_formed.iter().chain(malformed.iter()) {
+            assert_invalid_byte(literal);
+        }
+    }
+}
author	Adolfo Ochagavía <[email protected]>	2018-11-11 19:27:00 +0000
committer	Adolfo Ochagavía <[email protected]>	2018-11-11 19:27:00 +0000
commit	c258b4fdb0e421813330c2428985c4537c787582 (patch)
tree	e53263f28c0cd07911a1e9c9ef6538c8ff0227fd /crates/ra_syntax/src/validation/byte.rs
parent	a4f7d7a7cd85a5b9b64a935dd84ad493b6860236 (diff)

diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs new file mode 100644 index 000000000..3d2806c4e --- /dev/null +++ b/crates/ra_syntax/src/validation/byte.rs
@@ -0,0 +1,202 @@
	1	//! Validation of byte literals
	2
	3	use crate::{
	4	ast::{self, AstNode},
	5	string_lexing::{self, CharComponentKind},
	6	TextRange,
	7	validation::char,
	8	yellow::{
	9	SyntaxError,
	10	SyntaxErrorKind::*,
	11	},
	12	};
	13
	14	pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec<SyntaxError>) {
	15	let literal_text = node.text();
	16	let literal_range = node.syntax().range();
	17	let mut components = string_lexing::parse_byte_literal(literal_text);
	18	let mut len = 0;
	19	for component in &mut components {
	20	len += 1;
	21	let text = &literal_text[component.range];
	22	let range = component.range + literal_range.start();
	23
	24	use self::CharComponentKind::*;
	25	match component.kind {
	26	AsciiEscape => validate_byte_escape(text, range, errors),
	27	AsciiCodeEscape => validate_byte_code_escape(text, range, errors),
	28	UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)),
	29	CodePoint => {
	30	let c = text.chars().next().expect("Code points should be one character long");
	31
	32	// These bytes must always be escaped
	33	if c == '\t' \|\| c == '\r' \|\| c == '\n' {
	34	errors.push(SyntaxError::new(UnescapedByte, range));
	35	}
	36
	37	// Only ASCII bytes are allowed
	38	if c > 0x7F as char {
	39	errors.push(SyntaxError::new(ByteOutOfRange, range));
	40	}
	41	}
	42	}
	43	}
	44
	45	if !components.has_closing_quote {
	46	errors.push(SyntaxError::new(UnclosedByte, literal_range));
	47	}
	48
	49	if len == 0 {
	50	errors.push(SyntaxError::new(EmptyByte, literal_range));
	51	}
	52
	53	if len > 1 {
	54	errors.push(SyntaxError::new(OverlongByte, literal_range));
	55	}
	56	}
	57
	58	fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
	59	if text.len() == 1 {
	60	// Escape sequence consists only of leading `\`
	61	errors.push(SyntaxError::new(EmptyByteEscape, range));
	62	} else {
	63	let escape_code = text.chars().skip(1).next().unwrap();
	64	if !char::is_ascii_escape(escape_code) {
	65	errors.push(SyntaxError::new(InvalidByteEscape, range));
	66	}
	67	}
	68	}
	69
	70	fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
	71	// A ByteCodeEscape has 4 chars, example: `\xDD`
	72	if text.len() < 4 {
	73	errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
	74	} else {
	75	assert!(
	76	text.chars().count() == 4,
	77	"ByteCodeEscape cannot be longer than 4 chars"
	78	);
	79
	80	if u8::from_str_radix(&text[2..], 16).is_err() {
	81	errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
	82	}
	83	}
	84	}
	85
	86	#[cfg(test)]
	87	mod test {
	88	use crate::SourceFileNode;
	89
	90	fn build_file(literal: &str) -> SourceFileNode {
	91	let src = format!("const C: u8 = b'{}';", literal);
	92	SourceFileNode::parse(&src)
	93	}
	94
	95	fn assert_valid_byte(literal: &str) {
	96	let file = build_file(literal);
	97	assert!(
	98	file.errors().len() == 0,
	99	"Errors for literal '{}': {:?}",
	100	literal,
	101	file.errors()
	102	);
	103	}
	104
	105	fn assert_invalid_byte(literal: &str) {
	106	let file = build_file(literal);
	107	assert!(file.errors().len() > 0);
	108	}
	109
	110	#[test]
	111	fn test_ansi_codepoints() {
	112	for byte in 0..128 {
	113	match byte {
	114	b'\n' \| b'\r' \| b'\t' => assert_invalid_byte(&(byte as char).to_string()),
	115	b'\'' \| b'\\' => { /* Ignore character close and backslash */ }
	116	_ => assert_valid_byte(&(byte as char).to_string()),
	117	}
	118	}
	119
	120	for byte in 128..=255u8 {
	121	assert_invalid_byte(&(byte as char).to_string());
	122	}
	123	}
	124
	125	#[test]
	126	fn test_unicode_codepoints() {
	127	let invalid = ["Ƒ", "バ", "メ", "﷽"];
	128	for c in &invalid {
	129	assert_invalid_byte(c);
	130	}
	131	}
	132
	133	#[test]
	134	fn test_unicode_multiple_codepoints() {
	135	let invalid = ["नी", "👨‍👨‍"];
	136	for c in &invalid {
	137	assert_invalid_byte(c);
	138	}
	139	}
	140
	141	#[test]
	142	fn test_valid_byte_escape() {
	143	let valid = [
	144	r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b",
	145	];
	146	for c in &valid {
	147	assert_valid_byte(c);
	148	}
	149	}
	150
	151	#[test]
	152	fn test_invalid_byte_escape() {
	153	let invalid = [r"\a", r"\?", r"\"];
	154	for c in &invalid {
	155	assert_invalid_byte(c);
	156	}
	157	}
	158
	159	#[test]
	160	fn test_valid_byte_code_escape() {
	161	let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
	162	for c in &valid {
	163	assert_valid_byte(c);
	164	}
	165	}
	166
	167	#[test]
	168	fn test_invalid_byte_code_escape() {
	169	let invalid = [r"\x", r"\x7"];
	170	for c in &invalid {
	171	assert_invalid_byte(c);
	172	}
	173	}
	174
	175	#[test]
	176	fn test_invalid_unicode_escape() {
	177	let well_formed = [
	178	r"\u{FF}",
	179	r"\u{0}",
	180	r"\u{F}",
	181	r"\u{10FFFF}",
	182	r"\u{1_0__FF___FF_____}",
	183	];
	184	for c in &well_formed {
	185	assert_invalid_byte(c);
	186	}
	187
	188	let invalid = [
	189	r"\u",
	190	r"\u{}",
	191	r"\u{",
	192	r"\u{FF",
	193	r"\u{FFFFFF}",
	194	r"\u{_F}",
	195	r"\u{00FFFFF}",
	196	r"\u{110000}",
	197	];
	198	for c in &invalid {
	199	assert_invalid_byte(c);
	200	}
	201	}
	202	}