From c258b4fdb0e421813330c2428985c4537c787582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adolfo=20Ochagav=C3=ADa?= Date: Sun, 11 Nov 2018 20:27:00 +0100 Subject: Add validator for byte --- crates/ra_syntax/src/ast/generated.rs | 37 +++++ crates/ra_syntax/src/ast/mod.rs | 6 + crates/ra_syntax/src/grammar.ron | 1 + crates/ra_syntax/src/string_lexing.rs | 50 +++++++ crates/ra_syntax/src/validation/byte.rs | 202 ++++++++++++++++++++++++++++ crates/ra_syntax/src/validation/char.rs | 188 +++++++++++++------------- crates/ra_syntax/src/validation/mod.rs | 2 + crates/ra_syntax/src/yellow/syntax_error.rs | 20 +++ 8 files changed, 416 insertions(+), 90 deletions(-) create mode 100644 crates/ra_syntax/src/validation/byte.rs (limited to 'crates/ra_syntax') diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs index 2e9ae263a..75236153d 100644 --- a/crates/ra_syntax/src/ast/generated.rs +++ b/crates/ra_syntax/src/ast/generated.rs @@ -372,6 +372,43 @@ impl> BreakExprNode { impl<'a> BreakExpr<'a> {} +// Byte +#[derive(Debug, Clone, Copy,)] +pub struct ByteNode = OwnedRoot> { + pub(crate) syntax: SyntaxNode, +} +pub type Byte<'a> = ByteNode>; + +impl, R2: TreeRoot> PartialEq> for ByteNode { + fn eq(&self, other: &ByteNode) -> bool { self.syntax == other.syntax } +} +impl> Eq for ByteNode {} +impl> Hash for ByteNode { + fn hash(&self, state: &mut H) { self.syntax.hash(state) } +} + +impl<'a> AstNode<'a> for Byte<'a> { + fn cast(syntax: SyntaxNodeRef<'a>) -> Option { + match syntax.kind() { + BYTE => Some(Byte { syntax }), + _ => None, + } + } + fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } +} + +impl> ByteNode { + pub fn borrowed(&self) -> Byte { + ByteNode { syntax: self.syntax.borrowed() } + } + pub fn owned(&self) -> ByteNode { + ByteNode { syntax: self.syntax.owned() } + } +} + + +impl<'a> Byte<'a> {} + // CallExpr #[derive(Debug, Clone, Copy,)] pub struct CallExprNode = OwnedRoot> { diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs index f20714ede..686b5cf04 100644 --- a/crates/ra_syntax/src/ast/mod.rs +++ b/crates/ra_syntax/src/ast/mod.rs @@ -134,6 +134,12 @@ impl<'a> Char<'a> { } } +impl<'a> Byte<'a> { + pub fn text(&self) -> &SmolStr { + &self.syntax().leaf_text().unwrap() + } +} + impl<'a> String<'a> { pub fn text(&self) -> &SmolStr { &self.syntax().leaf_text().unwrap() diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron index c3184667e..2c2ed1aeb 100644 --- a/crates/ra_syntax/src/grammar.ron +++ b/crates/ra_syntax/src/grammar.ron @@ -412,6 +412,7 @@ Grammar( "RangeExpr": (), "BinExpr": (), "String": (), + "Byte": (), "Char": (), "Literal": (), diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs index d613bb042..4e8c3a91c 100644 --- a/crates/ra_syntax/src/string_lexing.rs +++ b/crates/ra_syntax/src/string_lexing.rs @@ -63,6 +63,56 @@ impl<'a> Iterator for StringComponentIterator<'a> { } } +pub fn parse_byte_literal(src: &str) -> ByteComponentIterator { + ByteComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct ByteComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for ByteComponentIterator<'a> { + type Item = CharComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == 'b', + "Byte literal should start with a b" + ); + + assert!( + self.parser.advance() == '\'', + "Byte literal should start with a b, followed by a quote" + ); + } + + + if let Some(component) = self.parser.parse_char_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('\'') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "byte literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} + pub fn parse_char_literal(src: &str) -> CharComponentIterator { CharComponentIterator { parser: Parser::new(src), diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs new file mode 100644 index 000000000..3d2806c4e --- /dev/null +++ b/crates/ra_syntax/src/validation/byte.rs @@ -0,0 +1,202 @@ +//! Validation of byte literals + +use crate::{ + ast::{self, AstNode}, + string_lexing::{self, CharComponentKind}, + TextRange, + validation::char, + yellow::{ + SyntaxError, + SyntaxErrorKind::*, + }, +}; + +pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec) { + let literal_text = node.text(); + let literal_range = node.syntax().range(); + let mut components = string_lexing::parse_byte_literal(literal_text); + let mut len = 0; + for component in &mut components { + len += 1; + let text = &literal_text[component.range]; + let range = component.range + literal_range.start(); + + use self::CharComponentKind::*; + match component.kind { + AsciiEscape => validate_byte_escape(text, range, errors), + AsciiCodeEscape => validate_byte_code_escape(text, range, errors), + UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), + CodePoint => { + let c = text.chars().next().expect("Code points should be one character long"); + + // These bytes must always be escaped + if c == '\t' || c == '\r' || c == '\n' { + errors.push(SyntaxError::new(UnescapedByte, range)); + } + + // Only ASCII bytes are allowed + if c > 0x7F as char { + errors.push(SyntaxError::new(ByteOutOfRange, range)); + } + } + } + } + + if !components.has_closing_quote { + errors.push(SyntaxError::new(UnclosedByte, literal_range)); + } + + if len == 0 { + errors.push(SyntaxError::new(EmptyByte, literal_range)); + } + + if len > 1 { + errors.push(SyntaxError::new(OverlongByte, literal_range)); + } +} + +fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec) { + if text.len() == 1 { + // Escape sequence consists only of leading `\` + errors.push(SyntaxError::new(EmptyByteEscape, range)); + } else { + let escape_code = text.chars().skip(1).next().unwrap(); + if !char::is_ascii_escape(escape_code) { + errors.push(SyntaxError::new(InvalidByteEscape, range)); + } + } +} + +fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec) { + // A ByteCodeEscape has 4 chars, example: `\xDD` + if text.len() < 4 { + errors.push(SyntaxError::new(TooShortByteCodeEscape, range)); + } else { + assert!( + text.chars().count() == 4, + "ByteCodeEscape cannot be longer than 4 chars" + ); + + if u8::from_str_radix(&text[2..], 16).is_err() { + errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); + } + } +} + +#[cfg(test)] +mod test { + use crate::SourceFileNode; + + fn build_file(literal: &str) -> SourceFileNode { + let src = format!("const C: u8 = b'{}';", literal); + SourceFileNode::parse(&src) + } + + fn assert_valid_byte(literal: &str) { + let file = build_file(literal); + assert!( + file.errors().len() == 0, + "Errors for literal '{}': {:?}", + literal, + file.errors() + ); + } + + fn assert_invalid_byte(literal: &str) { + let file = build_file(literal); + assert!(file.errors().len() > 0); + } + + #[test] + fn test_ansi_codepoints() { + for byte in 0..128 { + match byte { + b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()), + b'\'' | b'\\' => { /* Ignore character close and backslash */ } + _ => assert_valid_byte(&(byte as char).to_string()), + } + } + + for byte in 128..=255u8 { + assert_invalid_byte(&(byte as char).to_string()); + } + } + + #[test] + fn test_unicode_codepoints() { + let invalid = ["Ƒ", "バ", "メ", "﷽"]; + for c in &invalid { + assert_invalid_byte(c); + } + } + + #[test] + fn test_unicode_multiple_codepoints() { + let invalid = ["नी", "👨‍👨‍"]; + for c in &invalid { + assert_invalid_byte(c); + } + } + + #[test] + fn test_valid_byte_escape() { + let valid = [ + r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", + ]; + for c in &valid { + assert_valid_byte(c); + } + } + + #[test] + fn test_invalid_byte_escape() { + let invalid = [r"\a", r"\?", r"\"]; + for c in &invalid { + assert_invalid_byte(c); + } + } + + #[test] + fn test_valid_byte_code_escape() { + let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; + for c in &valid { + assert_valid_byte(c); + } + } + + #[test] + fn test_invalid_byte_code_escape() { + let invalid = [r"\x", r"\x7"]; + for c in &invalid { + assert_invalid_byte(c); + } + } + + #[test] + fn test_invalid_unicode_escape() { + let well_formed = [ + r"\u{FF}", + r"\u{0}", + r"\u{F}", + r"\u{10FFFF}", + r"\u{1_0__FF___FF_____}", + ]; + for c in &well_formed { + assert_invalid_byte(c); + } + + let invalid = [ + r"\u", + r"\u{}", + r"\u{", + r"\u{FF", + r"\u{FFFFFF}", + r"\u{_F}", + r"\u{00FFFFF}", + r"\u{110000}", + ]; + for c in &invalid { + assert_invalid_byte(c); + } + } +} diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 63f9bad24..793539b3a 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs @@ -1,3 +1,5 @@ +//! Validation of char literals + use std::u32; use arrayvec::ArrayString; @@ -12,7 +14,7 @@ use crate::{ }, }; -pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec) { +pub(super) fn validate_char_node(node: ast::Char, errors: &mut Vec) { let literal_text = node.text(); let literal_range = node.syntax().range(); let mut components = string_lexing::parse_char_literal(literal_text); @@ -37,7 +39,7 @@ pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec) } } -pub(crate) fn validate_char_component( +pub(super) fn validate_char_component( text: &str, kind: CharComponentKind, range: TextRange, @@ -46,109 +48,115 @@ pub(crate) fn validate_char_component( // Validate escapes use self::CharComponentKind::*; match kind { - AsciiEscape => { - if text.len() == 1 { - // Escape sequence consists only of leading `\` - errors.push(SyntaxError::new(EmptyAsciiEscape, range)); - } else { - let escape_code = text.chars().skip(1).next().unwrap(); - if !is_ascii_escape(escape_code) { - errors.push(SyntaxError::new(InvalidAsciiEscape, range)); - } + AsciiEscape => validate_ascii_escape(text, range, errors), + AsciiCodeEscape => validate_ascii_code_escape(text, range, errors), + UnicodeEscape => validate_unicode_escape(text, range, errors), + CodePoint => { + // These code points must always be escaped + if text == "\t" || text == "\r" || text == "\n" { + errors.push(SyntaxError::new(UnescapedCodepoint, range)); } } - AsciiCodeEscape => { - // An AsciiCodeEscape has 4 chars, example: `\xDD` - if text.len() < 4 { - errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); - } else { - assert!( - text.chars().count() == 4, - "AsciiCodeEscape cannot be longer than 4 chars" - ); - - match u8::from_str_radix(&text[2..], 16) { - Ok(code) if code < 128 => { /* Escape code is valid */ } - Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), - Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), - } - } + } +} + +fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec) { + if text.len() == 1 { + // Escape sequence consists only of leading `\` + errors.push(SyntaxError::new(EmptyAsciiEscape, range)); + } else { + let escape_code = text.chars().skip(1).next().unwrap(); + if !is_ascii_escape(escape_code) { + errors.push(SyntaxError::new(InvalidAsciiEscape, range)); } - UnicodeEscape => { - assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); + } +} - if text.len() == 2 { - // No starting `{` - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - return; - } +pub(super) fn is_ascii_escape(code: char) -> bool { + match code { + '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, + _ => false, + } +} - if text.len() == 3 { - // Only starting `{` - errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); - return; - } +fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec) { + // An AsciiCodeEscape has 4 chars, example: `\xDD` + if text.len() < 4 { + errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); + } else { + assert!( + text.chars().count() == 4, + "AsciiCodeEscape cannot be longer than 4 chars" + ); - let mut code = ArrayString::<[_; 6]>::new(); - let mut closed = false; - for c in text[3..].chars() { - assert!(!closed, "no characters after escape is closed"); - - if c.is_digit(16) { - if code.len() == 6 { - errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); - return; - } - - code.push(c); - } else if c == '_' { - // Reject leading _ - if code.len() == 0 { - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - return; - } - } else if c == '}' { - closed = true; - } else { - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - return; - } - } + match u8::from_str_radix(&text[2..], 16) { + Ok(code) if code < 128 => { /* Escape code is valid */ } + Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), + Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), + } + } +} - if !closed { - errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) - } +fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec) { + assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); - if code.len() == 0 { - errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); + if text.len() == 2 { + // No starting `{` + errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); + return; + } + + if text.len() == 3 { + // Only starting `{` + errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); + return; + } + + let mut code = ArrayString::<[_; 6]>::new(); + let mut closed = false; + for c in text[3..].chars() { + assert!(!closed, "no characters after escape is closed"); + + if c.is_digit(16) { + if code.len() == 6 { + errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); return; } - match u32::from_str_radix(&code, 16) { - Ok(code_u32) if code_u32 > 0x10FFFF => { - errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); - } - Ok(_) => { - // Valid escape code - } - Err(_) => { - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - } - } - } - CodePoint => { - // These code points must always be escaped - if text == "\t" || text == "\r" { - errors.push(SyntaxError::new(UnescapedCodepoint, range)); + code.push(c); + } else if c == '_' { + // Reject leading _ + if code.len() == 0 { + errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); + return; } + } else if c == '}' { + closed = true; + } else { + errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); + return; } } -} -fn is_ascii_escape(code: char) -> bool { - match code { - '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, - _ => false, + if !closed { + errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) + } + + if code.len() == 0 { + errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); + return; + } + + match u32::from_str_radix(&code, 16) { + Ok(code_u32) if code_u32 > 0x10FFFF => { + errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); + } + Ok(_) => { + // Valid escape code + } + Err(_) => { + errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); + } } } diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs index 2ff0bc26d..acad7cb7f 100644 --- a/crates/ra_syntax/src/validation/mod.rs +++ b/crates/ra_syntax/src/validation/mod.rs @@ -5,6 +5,7 @@ use crate::{ yellow::SyntaxError, }; +mod byte; mod char; mod string; @@ -12,6 +13,7 @@ pub(crate) fn validate(file: &SourceFileNode) -> Vec { let mut errors = Vec::new(); for node in file.syntax().descendants() { let _ = visitor_ctx(&mut errors) + .visit::(self::byte::validate_byte_node) .visit::(self::char::validate_char_node) .visit::(self::string::validate_string_node) .accept(node); diff --git a/crates/ra_syntax/src/yellow/syntax_error.rs b/crates/ra_syntax/src/yellow/syntax_error.rs index cf7b1d495..df230293b 100644 --- a/crates/ra_syntax/src/yellow/syntax_error.rs +++ b/crates/ra_syntax/src/yellow/syntax_error.rs @@ -72,6 +72,16 @@ pub enum SyntaxErrorKind { EmptyChar, UnclosedChar, OverlongChar, + EmptyByte, + UnclosedByte, + OverlongByte, + ByteOutOfRange, + UnescapedByte, + EmptyByteEscape, + InvalidByteEscape, + TooShortByteCodeEscape, + MalformedByteCodeEscape, + UnicodeEscapeForbidden, EmptyAsciiEscape, InvalidAsciiEscape, TooShortAsciiCodeEscape, @@ -98,6 +108,16 @@ impl fmt::Display for SyntaxErrorKind { EmptyChar => write!(f, "Empty char literal"), UnclosedChar => write!(f, "Unclosed char literal"), OverlongChar => write!(f, "Char literal should be one character long"), + EmptyByte => write!(f, "Empty byte literal"), + UnclosedByte => write!(f, "Unclosed byte literal"), + OverlongByte => write!(f, "Byte literal should be one character long"), + ByteOutOfRange => write!(f, "Byte should be a valid ASCII character"), + UnescapedByte => write!(f, "This byte should always be escaped"), + EmptyByteEscape => write!(f, "Empty escape sequence"), + InvalidByteEscape => write!(f, "Invalid escape sequence"), + TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"), + MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), + UnicodeEscapeForbidden => write!(f, "Unicode escapes are not allowed in byte literals or byte strings"), TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), AsciiCodeEscapeOutOfRange => { write!(f, "Escape sequence should be between \\x00 and \\x7F") -- cgit v1.2.3 From 30cd4d5acb7dfd40cea264a926d1c89f0c3522c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adolfo=20Ochagav=C3=ADa?= Date: Sun, 11 Nov 2018 20:41:43 +0100 Subject: Validate byte string literals --- crates/ra_syntax/src/ast/generated.rs | 37 +++++ crates/ra_syntax/src/ast/mod.rs | 6 + crates/ra_syntax/src/grammar.ron | 1 + crates/ra_syntax/src/string_lexing.rs | 53 +++++++- crates/ra_syntax/src/validation/byte.rs | 50 ++++--- crates/ra_syntax/src/validation/byte_string.rs | 178 +++++++++++++++++++++++++ crates/ra_syntax/src/validation/char.rs | 2 +- crates/ra_syntax/src/validation/mod.rs | 2 + 8 files changed, 305 insertions(+), 24 deletions(-) create mode 100644 crates/ra_syntax/src/validation/byte_string.rs (limited to 'crates/ra_syntax') diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs index 75236153d..bf056131e 100644 --- a/crates/ra_syntax/src/ast/generated.rs +++ b/crates/ra_syntax/src/ast/generated.rs @@ -409,6 +409,43 @@ impl> ByteNode { impl<'a> Byte<'a> {} +// ByteString +#[derive(Debug, Clone, Copy,)] +pub struct ByteStringNode = OwnedRoot> { + pub(crate) syntax: SyntaxNode, +} +pub type ByteString<'a> = ByteStringNode>; + +impl, R2: TreeRoot> PartialEq> for ByteStringNode { + fn eq(&self, other: &ByteStringNode) -> bool { self.syntax == other.syntax } +} +impl> Eq for ByteStringNode {} +impl> Hash for ByteStringNode { + fn hash(&self, state: &mut H) { self.syntax.hash(state) } +} + +impl<'a> AstNode<'a> for ByteString<'a> { + fn cast(syntax: SyntaxNodeRef<'a>) -> Option { + match syntax.kind() { + BYTE_STRING => Some(ByteString { syntax }), + _ => None, + } + } + fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } +} + +impl> ByteStringNode { + pub fn borrowed(&self) -> ByteString { + ByteStringNode { syntax: self.syntax.borrowed() } + } + pub fn owned(&self) -> ByteStringNode { + ByteStringNode { syntax: self.syntax.owned() } + } +} + + +impl<'a> ByteString<'a> {} + // CallExpr #[derive(Debug, Clone, Copy,)] pub struct CallExprNode = OwnedRoot> { diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs index 686b5cf04..7077e3492 100644 --- a/crates/ra_syntax/src/ast/mod.rs +++ b/crates/ra_syntax/src/ast/mod.rs @@ -140,6 +140,12 @@ impl<'a> Byte<'a> { } } +impl<'a> ByteString<'a> { + pub fn text(&self) -> &SmolStr { + &self.syntax().leaf_text().unwrap() + } +} + impl<'a> String<'a> { pub fn text(&self) -> &SmolStr { &self.syntax().leaf_text().unwrap() diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron index 2c2ed1aeb..53cd2118f 100644 --- a/crates/ra_syntax/src/grammar.ron +++ b/crates/ra_syntax/src/grammar.ron @@ -413,6 +413,7 @@ Grammar( "BinExpr": (), "String": (), "Byte": (), + "ByteString": (), "Char": (), "Literal": (), diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs index 4e8c3a91c..d253c97e7 100644 --- a/crates/ra_syntax/src/string_lexing.rs +++ b/crates/ra_syntax/src/string_lexing.rs @@ -1,6 +1,55 @@ use self::CharComponentKind::*; use rowan::{TextRange, TextUnit}; +pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { + ByteStringComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct ByteStringComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for ByteStringComponentIterator<'a> { + type Item = StringComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == 'b', + "byte string literal should start with a `b`" + ); + + assert!( + self.parser.advance() == '"', + "byte string literal should start with a `b`, followed by double quotes" + ); + } + + if let Some(component) = self.parser.parse_string_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('"') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} + pub fn parse_string_literal(src: &str) -> StringComponentIterator { StringComponentIterator { parser: Parser::new(src), @@ -81,12 +130,12 @@ impl<'a> Iterator for ByteComponentIterator<'a> { if self.parser.pos == 0 { assert!( self.parser.advance() == 'b', - "Byte literal should start with a b" + "Byte literal should start with a `b`" ); assert!( self.parser.advance() == '\'', - "Byte literal should start with a b, followed by a quote" + "Byte literal should start with a `b`, followed by a quote" ); } diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs index 3d2806c4e..7baf3c1d7 100644 --- a/crates/ra_syntax/src/validation/byte.rs +++ b/crates/ra_syntax/src/validation/byte.rs @@ -20,26 +20,7 @@ pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec) len += 1; let text = &literal_text[component.range]; let range = component.range + literal_range.start(); - - use self::CharComponentKind::*; - match component.kind { - AsciiEscape => validate_byte_escape(text, range, errors), - AsciiCodeEscape => validate_byte_code_escape(text, range, errors), - UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), - CodePoint => { - let c = text.chars().next().expect("Code points should be one character long"); - - // These bytes must always be escaped - if c == '\t' || c == '\r' || c == '\n' { - errors.push(SyntaxError::new(UnescapedByte, range)); - } - - // Only ASCII bytes are allowed - if c > 0x7F as char { - errors.push(SyntaxError::new(ByteOutOfRange, range)); - } - } - } + validate_byte_component(text, component.kind, range, errors); } if !components.has_closing_quote { @@ -55,6 +36,33 @@ pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec) } } +pub(super) fn validate_byte_component( + text: &str, + kind: CharComponentKind, + range: TextRange, + errors: &mut Vec, +) { + use self::CharComponentKind::*; + match kind { + AsciiEscape => validate_byte_escape(text, range, errors), + AsciiCodeEscape => validate_byte_code_escape(text, range, errors), + UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), + CodePoint => { + let c = text.chars().next().expect("Code points should be one character long"); + + // These bytes must always be escaped + if c == '\t' || c == '\r' || c == '\n' { + errors.push(SyntaxError::new(UnescapedByte, range)); + } + + // Only ASCII bytes are allowed + if c > 0x7F as char { + errors.push(SyntaxError::new(ByteOutOfRange, range)); + } + } + } +} + fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec) { if text.len() == 1 { // Escape sequence consists only of leading `\` @@ -141,7 +149,7 @@ mod test { #[test] fn test_valid_byte_escape() { let valid = [ - r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", + r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", ]; for c in &valid { assert_valid_byte(c); diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs new file mode 100644 index 000000000..7b830e97c --- /dev/null +++ b/crates/ra_syntax/src/validation/byte_string.rs @@ -0,0 +1,178 @@ +use crate::{ + ast::{self, AstNode}, + string_lexing::{self, StringComponentKind}, + yellow::{ + SyntaxError, + SyntaxErrorKind::*, + }, +}; + +use super::byte; + +pub(crate) fn validate_byte_string_node(node: ast::ByteString, errors: &mut Vec) { + let literal_text = node.text(); + let literal_range = node.syntax().range(); + let mut components = string_lexing::parse_byte_string_literal(literal_text); + for component in &mut components { + let range = component.range + literal_range.start(); + + match component.kind { + StringComponentKind::Char(kind) => { + // Chars must escape \t, \n and \r codepoints, but strings don't + let text = &literal_text[component.range]; + match text { + "\t" | "\n" | "\r" => { /* always valid */ } + _ => byte::validate_byte_component(text, kind, range, errors), + } + } + StringComponentKind::IgnoreNewline => { /* always valid */ } + } + } + + if !components.has_closing_quote { + errors.push(SyntaxError::new(UnclosedString, literal_range)); + } +} + +#[cfg(test)] +mod test { + use crate::SourceFileNode; + + fn build_file(literal: &str) -> SourceFileNode { + let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal); + println!("Source: {}", src); + SourceFileNode::parse(&src) + } + + fn assert_valid_str(literal: &str) { + let file = build_file(literal); + assert!( + file.errors().len() == 0, + "Errors for literal '{}': {:?}", + literal, + file.errors() + ); + } + + fn assert_invalid_str(literal: &str) { + let file = build_file(literal); + assert!(file.errors().len() > 0); + } + + #[test] + fn test_ansi_codepoints() { + for byte in 0..128 { + match byte { + b'\"' | b'\\' => { /* Ignore string close and backslash */ } + _ => assert_valid_str(&(byte as char).to_string()), + } + } + + for byte in 128..=255u8 { + assert_invalid_str(&(byte as char).to_string()); + } + } + + #[test] + fn test_unicode_codepoints() { + let invalid = ["Ƒ", "バ", "メ", "﷽"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_unicode_multiple_codepoints() { + let invalid = ["नी", "👨‍👨‍"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_valid_ascii_escape() { + let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; + for c in &valid { + assert_valid_str(c); + } + } + + #[test] + fn test_invalid_ascii_escape() { + let invalid = [r"\a", r"\?", r"\"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_valid_ascii_code_escape() { + let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; + for c in &valid { + assert_valid_str(c); + } + } + + #[test] + fn test_invalid_ascii_code_escape() { + let invalid = [r"\x", r"\x7"]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_invalid_unicode_escape() { + let well_formed = [ + r"\u{FF}", + r"\u{0}", + r"\u{F}", + r"\u{10FFFF}", + r"\u{1_0__FF___FF_____}", + ]; + for c in &well_formed { + assert_invalid_str(c); + } + + let invalid = [ + r"\u", + r"\u{}", + r"\u{", + r"\u{FF", + r"\u{FFFFFF}", + r"\u{_F}", + r"\u{00FFFFF}", + r"\u{110000}", + ]; + for c in &invalid { + assert_invalid_str(c); + } + } + + #[test] + fn test_mixed_invalid() { + assert_invalid_str( + r"This is the tale of a string +with a newline in between, some emoji (👨‍👨‍) here and there, +unicode escapes like this: \u{1FFBB} and weird stuff like +this ﷽", + ); + } + + #[test] + fn test_mixed_valid() { + assert_valid_str( + r"This is the tale of a string +with a newline in between, no emoji at all, +nor unicode escapes or weird stuff", + ); + } + + #[test] + fn test_ignore_newline() { + assert_valid_str( + "Hello \ + World", + ); + } +} diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 793539b3a..622b2efdc 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs @@ -214,7 +214,7 @@ mod test { #[test] fn test_valid_ascii_escape() { let valid = [ - r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", + r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", ]; for c in &valid { assert_valid_char(c); diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs index acad7cb7f..bdee8120c 100644 --- a/crates/ra_syntax/src/validation/mod.rs +++ b/crates/ra_syntax/src/validation/mod.rs @@ -6,6 +6,7 @@ use crate::{ }; mod byte; +mod byte_string; mod char; mod string; @@ -14,6 +15,7 @@ pub(crate) fn validate(file: &SourceFileNode) -> Vec { for node in file.syntax().descendants() { let _ = visitor_ctx(&mut errors) .visit::(self::byte::validate_byte_node) + .visit::(self::byte_string::validate_byte_string_node) .visit::(self::char::validate_char_node) .visit::(self::string::validate_string_node) .accept(node); -- cgit v1.2.3 From c96bfe7e2d4465653fe6b0eff053f0dfb48313fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adolfo=20Ochagav=C3=ADa?= Date: Sun, 11 Nov 2018 21:00:31 +0100 Subject: Split string lexing and run rustfmt --- crates/ra_syntax/src/string_lexing.rs | 513 ---------------------- crates/ra_syntax/src/string_lexing/byte.rs | 51 +++ crates/ra_syntax/src/string_lexing/byte_string.rs | 51 +++ crates/ra_syntax/src/string_lexing/char.rs | 176 ++++++++ crates/ra_syntax/src/string_lexing/mod.rs | 13 + crates/ra_syntax/src/string_lexing/parser.rs | 201 +++++++++ crates/ra_syntax/src/string_lexing/string.rs | 46 ++ crates/ra_syntax/src/validation/byte.rs | 9 +- crates/ra_syntax/src/validation/char.rs | 4 +- crates/ra_syntax/src/yellow/syntax_error.rs | 5 +- 10 files changed, 548 insertions(+), 521 deletions(-) delete mode 100644 crates/ra_syntax/src/string_lexing.rs create mode 100644 crates/ra_syntax/src/string_lexing/byte.rs create mode 100644 crates/ra_syntax/src/string_lexing/byte_string.rs create mode 100644 crates/ra_syntax/src/string_lexing/char.rs create mode 100644 crates/ra_syntax/src/string_lexing/mod.rs create mode 100644 crates/ra_syntax/src/string_lexing/parser.rs create mode 100644 crates/ra_syntax/src/string_lexing/string.rs (limited to 'crates/ra_syntax') diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs deleted file mode 100644 index d253c97e7..000000000 --- a/crates/ra_syntax/src/string_lexing.rs +++ /dev/null @@ -1,513 +0,0 @@ -use self::CharComponentKind::*; -use rowan::{TextRange, TextUnit}; - -pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { - ByteStringComponentIterator { - parser: Parser::new(src), - has_closing_quote: false, - } -} - -pub struct ByteStringComponentIterator<'a> { - parser: Parser<'a>, - pub has_closing_quote: bool, -} - -impl<'a> Iterator for ByteStringComponentIterator<'a> { - type Item = StringComponent; - fn next(&mut self) -> Option { - if self.parser.pos == 0 { - assert!( - self.parser.advance() == 'b', - "byte string literal should start with a `b`" - ); - - assert!( - self.parser.advance() == '"', - "byte string literal should start with a `b`, followed by double quotes" - ); - } - - if let Some(component) = self.parser.parse_string_component() { - return Some(component); - } - - // We get here when there are no char components left to parse - if self.parser.peek() == Some('"') { - self.parser.advance(); - self.has_closing_quote = true; - } - - assert!( - self.parser.peek() == None, - "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", - self.parser.src, - self.parser.pos, - self.parser.src.len() - ); - - None - } -} - -pub fn parse_string_literal(src: &str) -> StringComponentIterator { - StringComponentIterator { - parser: Parser::new(src), - has_closing_quote: false, - } -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub struct StringComponent { - pub range: TextRange, - pub kind: StringComponentKind, -} - -impl StringComponent { - fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { - StringComponent { range, kind } - } -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub enum StringComponentKind { - IgnoreNewline, - Char(CharComponentKind), -} - -pub struct StringComponentIterator<'a> { - parser: Parser<'a>, - pub has_closing_quote: bool, -} - -impl<'a> Iterator for StringComponentIterator<'a> { - type Item = StringComponent; - fn next(&mut self) -> Option { - if self.parser.pos == 0 { - assert!( - self.parser.advance() == '"', - "string literal should start with double quotes" - ); - } - - if let Some(component) = self.parser.parse_string_component() { - return Some(component); - } - - // We get here when there are no char components left to parse - if self.parser.peek() == Some('"') { - self.parser.advance(); - self.has_closing_quote = true; - } - - assert!( - self.parser.peek() == None, - "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", - self.parser.src, - self.parser.pos, - self.parser.src.len() - ); - - None - } -} - -pub fn parse_byte_literal(src: &str) -> ByteComponentIterator { - ByteComponentIterator { - parser: Parser::new(src), - has_closing_quote: false, - } -} - -pub struct ByteComponentIterator<'a> { - parser: Parser<'a>, - pub has_closing_quote: bool, -} - -impl<'a> Iterator for ByteComponentIterator<'a> { - type Item = CharComponent; - fn next(&mut self) -> Option { - if self.parser.pos == 0 { - assert!( - self.parser.advance() == 'b', - "Byte literal should start with a `b`" - ); - - assert!( - self.parser.advance() == '\'', - "Byte literal should start with a `b`, followed by a quote" - ); - } - - - if let Some(component) = self.parser.parse_char_component() { - return Some(component); - } - - // We get here when there are no char components left to parse - if self.parser.peek() == Some('\'') { - self.parser.advance(); - self.has_closing_quote = true; - } - - assert!( - self.parser.peek() == None, - "byte literal should leave no unparsed input: src = {}, pos = {}, length = {}", - self.parser.src, - self.parser.pos, - self.parser.src.len() - ); - - None - } -} - -pub fn parse_char_literal(src: &str) -> CharComponentIterator { - CharComponentIterator { - parser: Parser::new(src), - has_closing_quote: false, - } -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub struct CharComponent { - pub range: TextRange, - pub kind: CharComponentKind, -} - -impl CharComponent { - fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { - CharComponent { range, kind } - } -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub enum CharComponentKind { - CodePoint, - AsciiEscape, - AsciiCodeEscape, - UnicodeEscape, -} - -pub struct CharComponentIterator<'a> { - parser: Parser<'a>, - pub has_closing_quote: bool, -} - -impl<'a> Iterator for CharComponentIterator<'a> { - type Item = CharComponent; - fn next(&mut self) -> Option { - if self.parser.pos == 0 { - assert!( - self.parser.advance() == '\'', - "char literal should start with a quote" - ); - } - - if let Some(component) = self.parser.parse_char_component() { - return Some(component); - } - - // We get here when there are no char components left to parse - if self.parser.peek() == Some('\'') { - self.parser.advance(); - self.has_closing_quote = true; - } - - assert!( - self.parser.peek() == None, - "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", - self.parser.src, - self.parser.pos, - self.parser.src.len() - ); - - None - } -} - -pub struct Parser<'a> { - src: &'a str, - pos: usize, -} - -impl<'a> Parser<'a> { - pub fn new(src: &'a str) -> Parser<'a> { - Parser { src, pos: 0 } - } - - // Utility methods - - pub fn peek(&self) -> Option { - if self.pos == self.src.len() { - return None; - } - - self.src[self.pos..].chars().next() - } - - pub fn advance(&mut self) -> char { - let next = self - .peek() - .expect("cannot advance if end of input is reached"); - self.pos += next.len_utf8(); - next - } - - pub fn skip_whitespace(&mut self) { - while self.peek().map(|c| c.is_whitespace()) == Some(true) { - self.advance(); - } - } - - pub fn get_pos(&self) -> TextUnit { - (self.pos as u32).into() - } - - // Char parsing methods - - fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { - match self.peek() { - Some('{') => { - self.advance(); - - // Parse anything until we reach `}` - while let Some(next) = self.peek() { - self.advance(); - if next == '}' { - break; - } - } - - let end = self.get_pos(); - CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) - } - Some(_) | None => { - let end = self.get_pos(); - CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) - } - } - } - - fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { - let code_start = self.get_pos(); - while let Some(next) = self.peek() { - if next == '\'' || (self.get_pos() - code_start == 2.into()) { - break; - } - - self.advance(); - } - - let end = self.get_pos(); - CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) - } - - fn parse_escape(&mut self, start: TextUnit) -> CharComponent { - if self.peek().is_none() { - return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); - } - - let next = self.advance(); - let end = self.get_pos(); - let range = TextRange::from_to(start, end); - match next { - 'x' => self.parse_ascii_code_escape(start), - 'u' => self.parse_unicode_escape(start), - _ => CharComponent::new(range, AsciiEscape), - } - } - - pub fn parse_char_component(&mut self) -> Option { - let next = self.peek()?; - - // Ignore character close - if next == '\'' { - return None; - } - - let start = self.get_pos(); - self.advance(); - - if next == '\\' { - Some(self.parse_escape(start)) - } else { - let end = self.get_pos(); - Some(CharComponent::new( - TextRange::from_to(start, end), - CodePoint, - )) - } - } - - pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option { - // In string literals, when a `\` occurs immediately before the newline, the `\`, - // the newline, and all whitespace at the beginning of the next line are ignored - match self.peek() { - Some('\n') | Some('\r') => { - self.skip_whitespace(); - Some(StringComponent::new( - TextRange::from_to(start, self.get_pos()), - StringComponentKind::IgnoreNewline, - )) - } - _ => None, - } - } - - pub fn parse_string_component(&mut self) -> Option { - let next = self.peek()?; - - // Ignore string close - if next == '"' { - return None; - } - - let start = self.get_pos(); - self.advance(); - - if next == '\\' { - // Strings can use `\` to ignore newlines, so we first try to parse one of those - // before falling back to parsing char escapes - self.parse_ignore_newline(start).or_else(|| { - let char_component = self.parse_escape(start); - Some(StringComponent::new( - char_component.range, - StringComponentKind::Char(char_component.kind), - )) - }) - } else { - let end = self.get_pos(); - Some(StringComponent::new( - TextRange::from_to(start, end), - StringComponentKind::Char(CodePoint), - )) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn parse(src: &str) -> (bool, Vec) { - let component_iterator = &mut super::parse_char_literal(src); - let components: Vec<_> = component_iterator.collect(); - (component_iterator.has_closing_quote, components) - } - - fn unclosed_char_component(src: &str) -> CharComponent { - let (has_closing_quote, components) = parse(src); - assert!(!has_closing_quote, "char should not have closing quote"); - assert!(components.len() == 1); - components[0].clone() - } - - fn closed_char_component(src: &str) -> CharComponent { - let (has_closing_quote, components) = parse(src); - assert!(has_closing_quote, "char should have closing quote"); - assert!( - components.len() == 1, - "Literal: {}\nComponents: {:#?}", - src, - components - ); - components[0].clone() - } - - fn closed_char_components(src: &str) -> Vec { - let (has_closing_quote, components) = parse(src); - assert!(has_closing_quote, "char should have closing quote"); - components - } - - fn range_closed(src: &str) -> TextRange { - TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) - } - - fn range_unclosed(src: &str) -> TextRange { - TextRange::from_to(1.into(), (src.len() as u32).into()) - } - - #[test] - fn test_unicode_escapes() { - let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; - for escape in unicode_escapes { - let escape_sequence = format!(r"'\u{}'", escape); - let component = closed_char_component(&escape_sequence); - let expected_range = range_closed(&escape_sequence); - assert_eq!(component.kind, CharComponentKind::UnicodeEscape); - assert_eq!(component.range, expected_range); - } - } - - #[test] - fn test_unicode_escapes_unclosed() { - let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; - for escape in unicode_escapes { - let escape_sequence = format!(r"'\u{}'", escape); - let component = unclosed_char_component(&escape_sequence); - let expected_range = range_unclosed(&escape_sequence); - assert_eq!(component.kind, CharComponentKind::UnicodeEscape); - assert_eq!(component.range, expected_range); - } - } - - #[test] - fn test_empty_char() { - let (has_closing_quote, components) = parse("''"); - assert!(has_closing_quote, "char should have closing quote"); - assert!(components.len() == 0); - } - - #[test] - fn test_unclosed_char() { - let component = unclosed_char_component("'a"); - assert!(component.kind == CodePoint); - assert!(component.range == TextRange::from_to(1.into(), 2.into())); - } - - #[test] - fn test_digit_escapes() { - let literals = &[r"", r"5", r"55"]; - - for literal in literals { - let lit_text = format!(r"'\x{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == CharComponentKind::AsciiCodeEscape); - assert!(component.range == range_closed(&lit_text)); - } - - // More than 2 digits starts a new codepoint - let components = closed_char_components(r"'\x555'"); - assert!(components.len() == 2); - assert!(components[1].kind == CharComponentKind::CodePoint); - } - - #[test] - fn test_ascii_escapes() { - let literals = &[ - r"\'", "\\\"", // equivalent to \" - r"\n", r"\r", r"\t", r"\\", r"\0", - ]; - - for literal in literals { - let lit_text = format!("'{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == CharComponentKind::AsciiEscape); - assert!(component.range == range_closed(&lit_text)); - } - } - - #[test] - fn test_no_escapes() { - let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; - - for &literal in literals { - let lit_text = format!("'{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == CharComponentKind::CodePoint); - assert!(component.range == range_closed(&lit_text)); - } - } -} diff --git a/crates/ra_syntax/src/string_lexing/byte.rs b/crates/ra_syntax/src/string_lexing/byte.rs new file mode 100644 index 000000000..24424349c --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte.rs @@ -0,0 +1,51 @@ +use super::parser::Parser; +use super::CharComponent; + +pub fn parse_byte_literal(src: &str) -> ByteComponentIterator { + ByteComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct ByteComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for ByteComponentIterator<'a> { + type Item = CharComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == 'b', + "Byte literal should start with a `b`" + ); + + assert!( + self.parser.advance() == '\'', + "Byte literal should start with a `b`, followed by a quote" + ); + } + + if let Some(component) = self.parser.parse_char_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('\'') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "byte literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} diff --git a/crates/ra_syntax/src/string_lexing/byte_string.rs b/crates/ra_syntax/src/string_lexing/byte_string.rs new file mode 100644 index 000000000..5b6dda760 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte_string.rs @@ -0,0 +1,51 @@ +use super::parser::Parser; +use super::StringComponent; + +pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { + ByteStringComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct ByteStringComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for ByteStringComponentIterator<'a> { + type Item = StringComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == 'b', + "byte string literal should start with a `b`" + ); + + assert!( + self.parser.advance() == '"', + "byte string literal should start with a `b`, followed by double quotes" + ); + } + + if let Some(component) = self.parser.parse_string_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('"') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} diff --git a/crates/ra_syntax/src/string_lexing/char.rs b/crates/ra_syntax/src/string_lexing/char.rs new file mode 100644 index 000000000..885c03b14 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/char.rs @@ -0,0 +1,176 @@ +use super::parser::Parser; +use super::CharComponent; + +pub fn parse_char_literal(src: &str) -> CharComponentIterator { + CharComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct CharComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for CharComponentIterator<'a> { + type Item = CharComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == '\'', + "char literal should start with a quote" + ); + } + + if let Some(component) = self.parser.parse_char_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('\'') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} + +#[cfg(test)] +mod tests { + use rowan::TextRange; + use crate::string_lexing::{ + CharComponent, + CharComponentKind::*, +}; + + fn parse(src: &str) -> (bool, Vec) { + let component_iterator = &mut super::parse_char_literal(src); + let components: Vec<_> = component_iterator.collect(); + (component_iterator.has_closing_quote, components) + } + + fn unclosed_char_component(src: &str) -> CharComponent { + let (has_closing_quote, components) = parse(src); + assert!(!has_closing_quote, "char should not have closing quote"); + assert!(components.len() == 1); + components[0].clone() + } + + fn closed_char_component(src: &str) -> CharComponent { + let (has_closing_quote, components) = parse(src); + assert!(has_closing_quote, "char should have closing quote"); + assert!( + components.len() == 1, + "Literal: {}\nComponents: {:#?}", + src, + components + ); + components[0].clone() + } + + fn closed_char_components(src: &str) -> Vec { + let (has_closing_quote, components) = parse(src); + assert!(has_closing_quote, "char should have closing quote"); + components + } + + fn range_closed(src: &str) -> TextRange { + TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) + } + + fn range_unclosed(src: &str) -> TextRange { + TextRange::from_to(1.into(), (src.len() as u32).into()) + } + + #[test] + fn test_unicode_escapes() { + let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; + for escape in unicode_escapes { + let escape_sequence = format!(r"'\u{}'", escape); + let component = closed_char_component(&escape_sequence); + let expected_range = range_closed(&escape_sequence); + assert_eq!(component.kind, UnicodeEscape); + assert_eq!(component.range, expected_range); + } + } + + #[test] + fn test_unicode_escapes_unclosed() { + let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; + for escape in unicode_escapes { + let escape_sequence = format!(r"'\u{}'", escape); + let component = unclosed_char_component(&escape_sequence); + let expected_range = range_unclosed(&escape_sequence); + assert_eq!(component.kind, UnicodeEscape); + assert_eq!(component.range, expected_range); + } + } + + #[test] + fn test_empty_char() { + let (has_closing_quote, components) = parse("''"); + assert!(has_closing_quote, "char should have closing quote"); + assert!(components.len() == 0); + } + + #[test] + fn test_unclosed_char() { + let component = unclosed_char_component("'a"); + assert!(component.kind == CodePoint); + assert!(component.range == TextRange::from_to(1.into(), 2.into())); + } + + #[test] + fn test_digit_escapes() { + let literals = &[r"", r"5", r"55"]; + + for literal in literals { + let lit_text = format!(r"'\x{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == AsciiCodeEscape); + assert!(component.range == range_closed(&lit_text)); + } + + // More than 2 digits starts a new codepoint + let components = closed_char_components(r"'\x555'"); + assert!(components.len() == 2); + assert!(components[1].kind == CodePoint); + } + + #[test] + fn test_ascii_escapes() { + let literals = &[ + r"\'", "\\\"", // equivalent to \" + r"\n", r"\r", r"\t", r"\\", r"\0", + ]; + + for literal in literals { + let lit_text = format!("'{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == AsciiEscape); + assert!(component.range == range_closed(&lit_text)); + } + } + + #[test] + fn test_no_escapes() { + let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; + + for &literal in literals { + let lit_text = format!("'{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == CodePoint); + assert!(component.range == range_closed(&lit_text)); + } + } +} diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs new file mode 100644 index 000000000..94853331f --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/mod.rs @@ -0,0 +1,13 @@ +mod parser; +mod byte; +mod byte_string; +mod char; +mod string; + +pub use self::{ + byte::parse_byte_literal, + byte_string::parse_byte_string_literal, + char::parse_char_literal, + parser::{CharComponent, CharComponentKind, StringComponent, StringComponentKind}, + string::parse_string_literal, +}; diff --git a/crates/ra_syntax/src/string_lexing/parser.rs b/crates/ra_syntax/src/string_lexing/parser.rs new file mode 100644 index 000000000..4a6d5bc93 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/parser.rs @@ -0,0 +1,201 @@ +use rowan::{TextRange, TextUnit}; + +use self::CharComponentKind::*; + +pub struct Parser<'a> { + pub(super) src: &'a str, + pub(super) pos: usize, +} + +impl<'a> Parser<'a> { + pub fn new(src: &'a str) -> Parser<'a> { + Parser { src, pos: 0 } + } + + // Utility methods + + pub fn peek(&self) -> Option { + if self.pos == self.src.len() { + return None; + } + + self.src[self.pos..].chars().next() + } + + pub fn advance(&mut self) -> char { + let next = self + .peek() + .expect("cannot advance if end of input is reached"); + self.pos += next.len_utf8(); + next + } + + pub fn skip_whitespace(&mut self) { + while self.peek().map(|c| c.is_whitespace()) == Some(true) { + self.advance(); + } + } + + pub fn get_pos(&self) -> TextUnit { + (self.pos as u32).into() + } + + // Char parsing methods + + fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { + match self.peek() { + Some('{') => { + self.advance(); + + // Parse anything until we reach `}` + while let Some(next) = self.peek() { + self.advance(); + if next == '}' { + break; + } + } + + let end = self.get_pos(); + CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) + } + Some(_) | None => { + let end = self.get_pos(); + CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) + } + } + } + + fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { + let code_start = self.get_pos(); + while let Some(next) = self.peek() { + if next == '\'' || (self.get_pos() - code_start == 2.into()) { + break; + } + + self.advance(); + } + + let end = self.get_pos(); + CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) + } + + fn parse_escape(&mut self, start: TextUnit) -> CharComponent { + if self.peek().is_none() { + return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); + } + + let next = self.advance(); + let end = self.get_pos(); + let range = TextRange::from_to(start, end); + match next { + 'x' => self.parse_ascii_code_escape(start), + 'u' => self.parse_unicode_escape(start), + _ => CharComponent::new(range, AsciiEscape), + } + } + + pub fn parse_char_component(&mut self) -> Option { + let next = self.peek()?; + + // Ignore character close + if next == '\'' { + return None; + } + + let start = self.get_pos(); + self.advance(); + + if next == '\\' { + Some(self.parse_escape(start)) + } else { + let end = self.get_pos(); + Some(CharComponent::new( + TextRange::from_to(start, end), + CodePoint, + )) + } + } + + pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option { + // In string literals, when a `\` occurs immediately before the newline, the `\`, + // the newline, and all whitespace at the beginning of the next line are ignored + match self.peek() { + Some('\n') | Some('\r') => { + self.skip_whitespace(); + Some(StringComponent::new( + TextRange::from_to(start, self.get_pos()), + StringComponentKind::IgnoreNewline, + )) + } + _ => None, + } + } + + pub fn parse_string_component(&mut self) -> Option { + let next = self.peek()?; + + // Ignore string close + if next == '"' { + return None; + } + + let start = self.get_pos(); + self.advance(); + + if next == '\\' { + // Strings can use `\` to ignore newlines, so we first try to parse one of those + // before falling back to parsing char escapes + self.parse_ignore_newline(start).or_else(|| { + let char_component = self.parse_escape(start); + Some(StringComponent::new( + char_component.range, + StringComponentKind::Char(char_component.kind), + )) + }) + } else { + let end = self.get_pos(); + Some(StringComponent::new( + TextRange::from_to(start, end), + StringComponentKind::Char(CodePoint), + )) + } + } +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct StringComponent { + pub range: TextRange, + pub kind: StringComponentKind, +} + +impl StringComponent { + fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { + StringComponent { range, kind } + } +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub enum StringComponentKind { + IgnoreNewline, + Char(CharComponentKind), +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct CharComponent { + pub range: TextRange, + pub kind: CharComponentKind, +} + +impl CharComponent { + fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { + CharComponent { range, kind } + } +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub enum CharComponentKind { + CodePoint, + AsciiEscape, + AsciiCodeEscape, + UnicodeEscape, +} diff --git a/crates/ra_syntax/src/string_lexing/string.rs b/crates/ra_syntax/src/string_lexing/string.rs new file mode 100644 index 000000000..1b23029c6 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/string.rs @@ -0,0 +1,46 @@ +use super::parser::Parser; +use super::StringComponent; + +pub fn parse_string_literal(src: &str) -> StringComponentIterator { + StringComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct StringComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for StringComponentIterator<'a> { + type Item = StringComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == '"', + "string literal should start with double quotes" + ); + } + + if let Some(component) = self.parser.parse_string_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('"') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs index 7baf3c1d7..43c0d7edd 100644 --- a/crates/ra_syntax/src/validation/byte.rs +++ b/crates/ra_syntax/src/validation/byte.rs @@ -48,7 +48,10 @@ pub(super) fn validate_byte_component( AsciiCodeEscape => validate_byte_code_escape(text, range, errors), UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), CodePoint => { - let c = text.chars().next().expect("Code points should be one character long"); + let c = text + .chars() + .next() + .expect("Code points should be one character long"); // These bytes must always be escaped if c == '\t' || c == '\r' || c == '\n' { @@ -148,9 +151,7 @@ mod test { #[test] fn test_valid_byte_escape() { - let valid = [ - r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", - ]; + let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; for c in &valid { assert_valid_byte(c); } diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 622b2efdc..4728c85e6 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs @@ -213,9 +213,7 @@ mod test { #[test] fn test_valid_ascii_escape() { - let valid = [ - r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", - ]; + let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; for c in &valid { assert_valid_char(c); } diff --git a/crates/ra_syntax/src/yellow/syntax_error.rs b/crates/ra_syntax/src/yellow/syntax_error.rs index df230293b..c32ee650d 100644 --- a/crates/ra_syntax/src/yellow/syntax_error.rs +++ b/crates/ra_syntax/src/yellow/syntax_error.rs @@ -117,7 +117,10 @@ impl fmt::Display for SyntaxErrorKind { InvalidByteEscape => write!(f, "Invalid escape sequence"), TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"), MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), - UnicodeEscapeForbidden => write!(f, "Unicode escapes are not allowed in byte literals or byte strings"), + UnicodeEscapeForbidden => write!( + f, + "Unicode escapes are not allowed in byte literals or byte strings" + ), TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), AsciiCodeEscapeOutOfRange => { write!(f, "Escape sequence should be between \\x00 and \\x7F") -- cgit v1.2.3