From 30cd4d5acb7dfd40cea264a926d1c89f0c3522c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adolfo=20Ochagav=C3=ADa?= Date: Sun, 11 Nov 2018 20:41:43 +0100 Subject: Validate byte string literals --- crates/ra_syntax/src/ast/generated.rs | 37 +++++ crates/ra_syntax/src/ast/mod.rs | 6 + crates/ra_syntax/src/grammar.ron | 1 + crates/ra_syntax/src/string_lexing.rs | 53 +++++++- crates/ra_syntax/src/validation/byte.rs | 50 ++++--- crates/ra_syntax/src/validation/byte_string.rs | 178 +++++++++++++++++++++++++ crates/ra_syntax/src/validation/char.rs | 2 +- crates/ra_syntax/src/validation/mod.rs | 2 + 8 files changed, 305 insertions(+), 24 deletions(-) create mode 100644 crates/ra_syntax/src/validation/byte_string.rs (limited to 'crates/ra_syntax') diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs index 75236153d..bf056131e 100644 --- a/crates/ra_syntax/src/ast/generated.rs +++ b/crates/ra_syntax/src/ast/generated.rs @@ -409,6 +409,43 @@ impl> ByteNode { impl<'a> Byte<'a> {} +// ByteString +#[derive(Debug, Clone, Copy,)] +pub struct ByteStringNode = OwnedRoot> { + pub(crate) syntax: SyntaxNode, +} +pub type ByteString<'a> = ByteStringNode>; + +impl, R2: TreeRoot> PartialEq> for ByteStringNode { + fn eq(&self, other: &ByteStringNode) -> bool { self.syntax == other.syntax } +} +impl> Eq for ByteStringNode {} +impl> Hash for ByteStringNode { + fn hash(&self, state: &mut H) { self.syntax.hash(state) } +} + +impl<'a> AstNode<'a> for ByteString<'a> { + fn cast(syntax: SyntaxNodeRef<'a>) -> Option { + match syntax.kind() { + BYTE_STRING => Some(ByteString { syntax }), + _ => None, + } + } + fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } +} + +impl> ByteStringNode { + pub fn borrowed(&self) -> ByteString { + ByteStringNode { syntax: self.syntax.borrowed() } + } + pub fn owned(&self) -> ByteStringNode { + ByteStringNode { syntax: self.syntax.owned() } + } +} + + +impl<'a> ByteString<'a> {} + // CallExpr #[derive(Debug, Clone, Copy,)] pub struct CallExprNode = OwnedRoot> { diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs index 686b5cf04..7077e3492 100644 --- a/crates/ra_syntax/src/ast/mod.rs +++ b/crates/ra_syntax/src/ast/mod.rs @@ -140,6 +140,12 @@ impl<'a> Byte<'a> { } } +impl<'a> ByteString<'a> { + pub fn text(&self) -> &SmolStr { + &self.syntax().leaf_text().unwrap() + } +} + impl<'a> String<'a> { pub fn text(&self) -> &SmolStr { &self.syntax().leaf_text().unwrap() diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron index 2c2ed1aeb..53cd2118f 100644 --- a/crates/ra_syntax/src/grammar.ron +++ b/crates/ra_syntax/src/grammar.ron @@ -413,6 +413,7 @@ Grammar( "BinExpr": (), "String": (), "Byte": (), + "ByteString": (), "Char": (), "Literal": (), diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs index 4e8c3a91c..d253c97e7 100644 --- a/crates/ra_syntax/src/string_lexing.rs +++ b/crates/ra_syntax/src/string_lexing.rs @@ -1,6 +1,55 @@ use self::CharComponentKind::*; use rowan::{TextRange, TextUnit}; +pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { + ByteStringComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct ByteStringComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for ByteStringComponentIterator<'a> { + type Item = StringComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == 'b', + "byte string literal should start with a `b`" + ); + + assert!( + self.parser.advance() == '"', + "byte string literal should start with a `b`, followed by double quotes" + ); + } + + if let Some(component) = self.parser.parse_string_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('"') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} + pub fn parse_string_literal(src: &str) -> StringComponentIterator { StringComponentIterator { parser: Parser::new(src), @@ -81,12 +130,12 @@ impl<'a> Iterator for ByteComponentIterator<'a> { if self.parser.pos == 0 { assert!( self.parser.advance() == 'b', - "Byte literal should start with a b" + "Byte literal should start with a `b`" ); assert!( self.parser.advance() == '\'', - "Byte literal should start with a b, followed by a quote" + "Byte literal should start with a `b`, followed by a quote" ); } diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs index 3d2806c4e..7baf3c1d7 100644 --- a/crates/ra_syntax/src/validation/byte.rs +++ b/crates/ra_syntax/src/validation/byte.rs @@ -20,26 +20,7 @@ pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec) len += 1; let text = &literal_text[component.range]; let range = component.range + literal_range.start(); - - use self::CharComponentKind::*; - match component.kind { - AsciiEscape => validate_byte_escape(text, range, errors), - AsciiCodeEscape => validate_byte_code_escape(text, range, errors), - UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), - CodePoint => { - let c = text.chars().next().expect("Code points should be one character long"); - - // These bytes must always be escaped - if c == '\t' || c == '\r' || c == '\n' { - errors.push(SyntaxError::new(UnescapedByte, range)); - } - - // Only ASCII bytes are allowed - if c > 0x7F as char { - errors.push(SyntaxError::new(ByteOutOfRange, range)); - } - } - } + validate_byte_component(text, component.kind, range, errors); } if !components.has_closing_quote { @@ -55,6 +36,33 @@ pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec) } } +pub(super) fn validate_byte_component( + text: &str, + kind: CharComponentKind, + range: TextRange, + errors: &mut Vec, +) { + use self::CharComponentKind::*; + match kind { + AsciiEscape => validate_byte_escape(text, range, errors), + AsciiCodeEscape => validate_byte_code_escape(text, range, errors), + UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), + CodePoint => { + let c = text.chars().next().expect("Code points should be one character long"); + + // These bytes must always be escaped + if c == '\t' || c == '\r' || c == '\n' { + errors.push(SyntaxError::new(UnescapedByte, range)); + } + + // Only ASCII bytes are allowed + if c > 0x7F as char { + errors.push(SyntaxError::new(ByteOutOfRange, range)); + } + } + } +} + fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec) { if text.len() == 1 { // Escape sequence consists only of leading `\` @@ -141,7 +149,7 @@ mod test { #[test] fn test_valid_byte_escape() { let valid = [ - r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", + r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", ]; for c in &valid { assert_valid_byte(c); diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs new file mode 100644 index 000000000..7b830e97c --- /dev/null +++ b/crates/ra_syntax/src/validation/byte_string.rs @@ -0,0 +1,178 @@ +use crate::{ + ast::{self, AstNode}, + string_lexing::{self, StringComponentKind}, + yellow::{ + SyntaxError, + SyntaxErrorKind::*, + }, +}; + +use super::byte; + +pub(crate) fn validate_byte_string_node(node: ast::ByteString, errors: &mut Vec) { + let literal_text = node.text(); + let literal_range = node.syntax().range(); + let mut components = string_lexing::parse_byte_string_literal(literal_text); + for component in &mut components { + let range = component.range + literal_range.start(); + + match component.kind { + StringComponentKind::Char(kind) => { + // Chars must escape \t, \n and \r codepoints, but strings don't + let text = &literal_text[component.range]; + match text { + "\t" | "\n" | "\r" => { /* always valid */ } + _ => byte::validate_byte_component(text, kind, range, errors), + } + } + StringComponentKind::IgnoreNewline => { /* always valid */ } + } + } + + if !components.has_closing_quote { + errors.push(SyntaxError::new(UnclosedString, literal_range)); + } +} + +#[cfg(test)] +mod test { + use crate::SourceFileNode; + + fn build_file(literal: &str) -> SourceFileNode { + let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal); + println!("Source: {}", src); + SourceFileNode::parse(&src) + } + + fn assert_valid_str(literal: &str) { + let file = build_file(literal); + assert!( + file.errors().len() == 0, + "Errors for literal '{}': {:?}", + literal, + file.errors() + ); + } + + fn assert_invalid_str(literal: &str) { + let file = build_file(literal); + assert!(file.errors().len() > 0); + } + + #[test] + fn test_ansi_codepoints() { + for byte in 0..128 { + match byte { + b'\"' | b'\\' => { /* Ignore string close and backslash */ } + _ => assert_valid_str(&(byte as char).to_string()), + } + } + + for byte in 128..=255u8 { + assert_invalid_str(&(byte as char).to_string()); + } + } + + #[test] + fn test_unicode_codepoints() { + let invalid = ["Ƒ", "バ", "メ", "﷽"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_unicode_multiple_codepoints() { + let invalid = ["नी", "👨‍👨‍"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_valid_ascii_escape() { + let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; + for c in &valid { + assert_valid_str(c); + } + } + + #[test] + fn test_invalid_ascii_escape() { + let invalid = [r"\a", r"\?", r"\"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_valid_ascii_code_escape() { + let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; + for c in &valid { + assert_valid_str(c); + } + } + + #[test] + fn test_invalid_ascii_code_escape() { + let invalid = [r"\x", r"\x7"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_invalid_unicode_escape() { + let well_formed = [ + r"\u{FF}", + r"\u{0}", + r"\u{F}", + r"\u{10FFFF}", + r"\u{1_0__FF___FF_____}", + ]; + for c in &well_formed { + assert_invalid_str(c); + } + + let invalid = [ + r"\u", + r"\u{}", + r"\u{", + r"\u{FF", + r"\u{FFFFFF}", + r"\u{_F}", + r"\u{00FFFFF}", + r"\u{110000}", + ]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_mixed_invalid() { + assert_invalid_str( + r"This is the tale of a string +with a newline in between, some emoji (👨‍👨‍) here and there, +unicode escapes like this: \u{1FFBB} and weird stuff like +this ﷽", + ); + } + + #[test] + fn test_mixed_valid() { + assert_valid_str( + r"This is the tale of a string +with a newline in between, no emoji at all, +nor unicode escapes or weird stuff", + ); + } + + #[test] + fn test_ignore_newline() { + assert_valid_str( + "Hello \ + World", + ); + } +} diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 793539b3a..622b2efdc 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs @@ -214,7 +214,7 @@ mod test { #[test] fn test_valid_ascii_escape() { let valid = [ - r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", + r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", ]; for c in &valid { assert_valid_char(c); diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs index acad7cb7f..bdee8120c 100644 --- a/crates/ra_syntax/src/validation/mod.rs +++ b/crates/ra_syntax/src/validation/mod.rs @@ -6,6 +6,7 @@ use crate::{ }; mod byte; +mod byte_string; mod char; mod string; @@ -14,6 +15,7 @@ pub(crate) fn validate(file: &SourceFileNode) -> Vec { for node in file.syntax().descendants() { let _ = visitor_ctx(&mut errors) .visit::(self::byte::validate_byte_node) + .visit::(self::byte_string::validate_byte_string_node) .visit::(self::char::validate_char_node) .visit::(self::string::validate_string_node) .accept(node); -- cgit v1.2.3