From 313314e14b629ebf50389dbd2d440bda922f6ae7 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Tue, 7 May 2019 19:38:26 +0300 Subject: share literal validation logic with compiler --- crates/ra_syntax/src/lib.rs | 1 - crates/ra_syntax/src/string_lexing.rs | 333 ------------- crates/ra_syntax/src/syntax_error.rs | 104 ++-- crates/ra_syntax/src/validation.rs | 64 ++- crates/ra_syntax/src/validation/byte.rs | 199 -------- crates/ra_syntax/src/validation/byte_string.rs | 169 ------- crates/ra_syntax/src/validation/char.rs | 273 ----------- crates/ra_syntax/src/validation/string.rs | 154 ------ crates/ra_syntax/src/validation/unescape.rs | 521 +++++++++++++++++++++ .../tests/data/parser/err/0030_string_suffixes.txt | 3 - 10 files changed, 620 insertions(+), 1201 deletions(-) delete mode 100644 crates/ra_syntax/src/string_lexing.rs delete mode 100644 crates/ra_syntax/src/validation/byte.rs delete mode 100644 crates/ra_syntax/src/validation/byte_string.rs delete mode 100644 crates/ra_syntax/src/validation/char.rs delete mode 100644 crates/ra_syntax/src/validation/string.rs create mode 100644 crates/ra_syntax/src/validation/unescape.rs diff --git a/crates/ra_syntax/src/lib.rs b/crates/ra_syntax/src/lib.rs index 9cb66b76b..39c25dbdc 100644 --- a/crates/ra_syntax/src/lib.rs +++ b/crates/ra_syntax/src/lib.rs @@ -23,7 +23,6 @@ mod syntax_node; mod syntax_text; mod syntax_error; mod parsing; -mod string_lexing; mod validation; mod ptr; diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs deleted file mode 100644 index 4c3eea3d2..000000000 --- a/crates/ra_syntax/src/string_lexing.rs +++ /dev/null @@ -1,333 +0,0 @@ -use crate::{TextRange, TextUnit}; -use self::StringComponentKind::*; - -#[derive(Debug, Eq, PartialEq, Clone)] -pub(crate) struct StringComponent { - pub(crate) range: TextRange, - pub(crate) kind: StringComponentKind, -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub(crate) enum StringComponentKind { - IgnoreNewline, - 
CodePoint, - AsciiEscape, - AsciiCodeEscape, - UnicodeEscape, -} - -pub(crate) fn parse_quoted_literal( - prefix: Option, - quote: char, - src: &str, -) -> StringComponentIter { - let prefix = prefix.map(|p| match p { - 'b' => b'b', - _ => panic!("invalid prefix"), - }); - let quote = match quote { - '\'' => b'\'', - '"' => b'"', - _ => panic!("invalid quote"), - }; - StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None } -} - -pub(crate) struct StringComponentIter<'a> { - src: &'a str, - prefix: Option, - quote: u8, - pos: usize, - pub(crate) has_closing_quote: bool, - pub(crate) suffix: Option, -} - -impl<'a> Iterator for StringComponentIter<'a> { - type Item = StringComponent; - fn next(&mut self) -> Option { - if self.pos == 0 { - if let Some(prefix) = self.prefix { - assert!( - self.advance() == prefix as char, - "literal should start with a {:?}", - prefix as char, - ); - } - assert!( - self.advance() == self.quote as char, - "literal should start with a {:?}", - self.quote as char, - ); - } - - if let Some(component) = self.parse_component() { - return Some(component); - } - - // We get here when there are no char components left to parse - if self.peek() == Some(self.quote as char) { - self.advance(); - self.has_closing_quote = true; - if let Some(range) = self.parse_suffix() { - self.suffix = Some(range); - } - } - - assert!( - self.peek() == None, - "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}", - self.src, - self.pos, - self.src.len() - ); - - None - } -} - -impl<'a> StringComponentIter<'a> { - fn peek(&self) -> Option { - if self.pos == self.src.len() { - return None; - } - - self.src[self.pos..].chars().next() - } - - fn advance(&mut self) -> char { - let next = self.peek().expect("cannot advance if end of input is reached"); - self.pos += next.len_utf8(); - next - } - - fn parse_component(&mut self) -> Option { - let next = self.peek()?; - - // Ignore string close - if next == 
self.quote as char { - return None; - } - - let start = self.start_range(); - self.advance(); - - if next == '\\' { - // Strings can use `\` to ignore newlines, so we first try to parse one of those - // before falling back to parsing char escapes - if self.quote == b'"' { - if let Some(component) = self.parse_ignore_newline(start) { - return Some(component); - } - } - - Some(self.parse_escape(start)) - } else { - Some(self.finish_component(start, CodePoint)) - } - } - - fn parse_ignore_newline(&mut self, start: TextUnit) -> Option { - // In string literals, when a `\` occurs immediately before the newline, the `\`, - // the newline, and all whitespace at the beginning of the next line are ignored - match self.peek() { - Some('\n') | Some('\r') => { - self.skip_whitespace(); - Some(self.finish_component(start, IgnoreNewline)) - } - _ => None, - } - } - - fn skip_whitespace(&mut self) { - while self.peek().map(|c| c.is_whitespace()) == Some(true) { - self.advance(); - } - } - - fn parse_escape(&mut self, start: TextUnit) -> StringComponent { - if self.peek().is_none() { - return self.finish_component(start, AsciiEscape); - } - - let next = self.advance(); - match next { - 'x' => self.parse_ascii_code_escape(start), - 'u' => self.parse_unicode_escape(start), - _ => self.finish_component(start, AsciiEscape), - } - } - - fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent { - match self.peek() { - Some('{') => { - self.advance(); - - // Parse anything until we reach `}` - while let Some(next) = self.peek() { - self.advance(); - if next == '}' { - break; - } - } - - self.finish_component(start, UnicodeEscape) - } - Some(_) | None => self.finish_component(start, UnicodeEscape), - } - } - - fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent { - let code_start = self.pos; - while let Some(next) = self.peek() { - if next == '\'' || (self.pos - code_start == 2) { - break; - } - - self.advance(); - } - self.finish_component(start, 
AsciiCodeEscape) - } - - fn parse_suffix(&mut self) -> Option { - let start = self.start_range(); - let _ = self.peek()?; - while let Some(_) = self.peek() { - self.advance(); - } - Some(self.finish_range(start)) - } - - fn start_range(&self) -> TextUnit { - TextUnit::from_usize(self.pos) - } - - fn finish_range(&self, start: TextUnit) -> TextRange { - TextRange::from_to(start, TextUnit::from_usize(self.pos)) - } - - fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent { - let range = self.finish_range(start); - StringComponent { range, kind } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn parse(src: &str) -> (bool, Vec) { - let component_iterator = &mut parse_quoted_literal(None, '\'', src); - let components: Vec<_> = component_iterator.collect(); - (component_iterator.has_closing_quote, components) - } - - fn unclosed_char_component(src: &str) -> StringComponent { - let (has_closing_quote, components) = parse(src); - assert!(!has_closing_quote, "char should not have closing quote"); - assert!(components.len() == 1); - components[0].clone() - } - - fn closed_char_component(src: &str) -> StringComponent { - let (has_closing_quote, components) = parse(src); - assert!(has_closing_quote, "char should have closing quote"); - assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components); - components[0].clone() - } - - fn closed_char_components(src: &str) -> Vec { - let (has_closing_quote, components) = parse(src); - assert!(has_closing_quote, "char should have closing quote"); - components - } - - fn range_closed(src: &str) -> TextRange { - TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) - } - - fn range_unclosed(src: &str) -> TextRange { - TextRange::from_to(1.into(), (src.len() as u32).into()) - } - - #[test] - fn test_unicode_escapes() { - let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; - for escape in unicode_escapes { - let escape_sequence = format!(r"'\u{}'", escape); 
- let component = closed_char_component(&escape_sequence); - let expected_range = range_closed(&escape_sequence); - assert_eq!(component.kind, UnicodeEscape); - assert_eq!(component.range, expected_range); - } - } - - #[test] - fn test_unicode_escapes_unclosed() { - let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; - for escape in unicode_escapes { - let escape_sequence = format!(r"'\u{}'", escape); - let component = unclosed_char_component(&escape_sequence); - let expected_range = range_unclosed(&escape_sequence); - assert_eq!(component.kind, UnicodeEscape); - assert_eq!(component.range, expected_range); - } - } - - #[test] - fn test_empty_char() { - let (has_closing_quote, components) = parse("''"); - assert!(has_closing_quote, "char should have closing quote"); - assert!(components.len() == 0); - } - - #[test] - fn test_unclosed_char() { - let component = unclosed_char_component("'a"); - assert!(component.kind == CodePoint); - assert!(component.range == TextRange::from_to(1.into(), 2.into())); - } - - #[test] - fn test_digit_escapes() { - let literals = &[r"", r"5", r"55"]; - - for literal in literals { - let lit_text = format!(r"'\x{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == AsciiCodeEscape); - assert!(component.range == range_closed(&lit_text)); - } - - // More than 2 digits starts a new codepoint - let components = closed_char_components(r"'\x555'"); - assert!(components.len() == 2); - assert!(components[1].kind == CodePoint); - } - - #[test] - fn test_ascii_escapes() { - let literals = &[ - r"\'", "\\\"", // equivalent to \" - r"\n", r"\r", r"\t", r"\\", r"\0", - ]; - - for literal in literals { - let lit_text = format!("'{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == AsciiEscape); - assert!(component.range == range_closed(&lit_text)); - } - } - - #[test] - fn test_no_escapes() { - let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; - - for &literal in 
literals { - let lit_text = format!("'{}'", literal); - let component = closed_char_component(&lit_text); - assert!(component.kind == CodePoint); - assert!(component.range == range_closed(&lit_text)); - } - } -} diff --git a/crates/ra_syntax/src/syntax_error.rs b/crates/ra_syntax/src/syntax_error.rs index 4198eefdb..27e12293b 100644 --- a/crates/ra_syntax/src/syntax_error.rs +++ b/crates/ra_syntax/src/syntax_error.rs @@ -2,7 +2,10 @@ use std::fmt; use ra_parser::ParseError; -use crate::{TextRange, TextUnit}; +use crate::{ + TextRange, TextUnit, + validation::EscapeError, +}; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SyntaxError { @@ -67,32 +70,7 @@ impl fmt::Display for SyntaxError { #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum SyntaxErrorKind { ParseError(ParseError), - UnescapedCodepoint, - EmptyChar, - UnclosedChar, - OverlongChar, - EmptyByte, - UnclosedByte, - OverlongByte, - ByteOutOfRange, - UnescapedByte, - EmptyByteEscape, - InvalidByteEscape, - TooShortByteCodeEscape, - MalformedByteCodeEscape, - UnicodeEscapeForbidden, - EmptyAsciiEscape, - InvalidAsciiEscape, - TooShortAsciiCodeEscape, - AsciiCodeEscapeOutOfRange, - MalformedAsciiCodeEscape, - UnclosedUnicodeEscape, - MalformedUnicodeEscape, - EmptyUnicodeEcape, - OverlongUnicodeEscape, - UnicodeEscapeOutOfRange, - UnclosedString, - InvalidSuffix, + EscapeError(EscapeError), InvalidBlockAttr, InvalidMatchInnerAttr, InvalidTupleIndexFormat, @@ -102,38 +80,6 @@ impl fmt::Display for SyntaxErrorKind { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::SyntaxErrorKind::*; match self { - UnescapedCodepoint => write!(f, "This codepoint should always be escaped"), - EmptyAsciiEscape => write!(f, "Empty escape sequence"), - InvalidAsciiEscape => write!(f, "Invalid escape sequence"), - EmptyChar => write!(f, "Empty char literal"), - UnclosedChar => write!(f, "Unclosed char literal"), - OverlongChar => write!(f, "Char literal should be one character long"), - EmptyByte => 
write!(f, "Empty byte literal"), - UnclosedByte => write!(f, "Unclosed byte literal"), - OverlongByte => write!(f, "Byte literal should be one character long"), - ByteOutOfRange => write!(f, "Byte should be a valid ASCII character"), - UnescapedByte => write!(f, "This byte should always be escaped"), - EmptyByteEscape => write!(f, "Empty escape sequence"), - InvalidByteEscape => write!(f, "Invalid escape sequence"), - TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"), - MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), - UnicodeEscapeForbidden => { - write!(f, "Unicode escapes are not allowed in byte literals or byte strings") - } - TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), - AsciiCodeEscapeOutOfRange => { - write!(f, "Escape sequence should be between \\x00 and \\x7F") - } - MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), - UnclosedUnicodeEscape => write!(f, "Missing `}}`"), - MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"), - EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"), - OverlongUnicodeEscape => { - write!(f, "Unicode escape sequence should have at most 6 digits") - } - UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"), - UnclosedString => write!(f, "Unclosed string literal"), - InvalidSuffix => write!(f, "Invalid literal suffix"), InvalidBlockAttr => { write!(f, "A block in this position cannot accept inner attributes") } @@ -144,6 +90,46 @@ impl fmt::Display for SyntaxErrorKind { write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix") } ParseError(msg) => write!(f, "{}", msg.0), + EscapeError(err) => write!(f, "{}", err), } } } + +impl fmt::Display for EscapeError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let msg = match self { + EscapeError::ZeroChars => "Empty 
literal", + EscapeError::MoreThanOneChar => "Literal should be one character long", + EscapeError::LoneSlash => "Character must be escaped: '\\'", + EscapeError::InvalidEscape => "Invalid escape sequence", + EscapeError::BareCarriageReturn => "Character must be escaped: '\r'", + EscapeError::EscapeOnlyChar => "Character must be escaped", + EscapeError::TooShortHexEscape => "Escape sequence should have two digits", + EscapeError::InvalidCharInHexEscape => "Escape sequence should be a hexadecimal number", + EscapeError::OutOfRangeHexEscape => "Escape sequence should be ASCII", + EscapeError::NoBraceInUnicodeEscape => "Invalid escape sequence", + EscapeError::InvalidCharInUnicodeEscape => "Invalid escape sequence", + EscapeError::EmptyUnicodeEscape => "Invalid escape sequence", + EscapeError::UnclosedUnicodeEscape => "Missing '}'", + EscapeError::LeadingUnderscoreUnicodeEscape => "Invalid escape sequence", + EscapeError::OverlongUnicodeEscape => { + "Unicode escape sequence should have at most 6 digits" + } + EscapeError::LoneSurrogateUnicodeEscape => { + "Unicode escape code should not be a surrogate" + } + EscapeError::OutOfRangeUnicodeEscape => { + "Unicode escape code should be at most 0x10FFFF" + } + EscapeError::UnicodeEscapeInByte => "Unicode escapes are not allowed in bytes", + EscapeError::NonAsciiCharInByte => "Non ASCII characters are not allowed in bytes", + }; + write!(f, "{}", msg) + } +} + +impl From<EscapeError> for SyntaxErrorKind { + fn from(err: EscapeError) -> Self { + SyntaxErrorKind::EscapeError(err) + } +} diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs index c2f545173..11a1fb4a7 100644 --- a/crates/ra_syntax/src/validation.rs +++ b/crates/ra_syntax/src/validation.rs @@ -1,17 +1,17 @@ -mod byte; -mod byte_string; -mod char; -mod string; +mod unescape; + mod block; mod field_expr; use crate::{ - SourceFile, SyntaxError, AstNode, SyntaxNode, + SourceFile, SyntaxError, AstNode, SyntaxNode, TextUnit, SyntaxKind::{L_CURLY, 
R_CURLY, BYTE, BYTE_STRING, STRING, CHAR}, ast, algo::visit::{visitor_ctx, VisitorCtx}, }; +pub(crate) use unescape::EscapeError; + pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> { let mut errors = Vec::new(); for node in file.syntax().descendants() { @@ -26,11 +26,55 @@ pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> { // FIXME: kill duplication fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) { - match literal.token().kind() { - BYTE => byte::validate_byte_node(literal.token(), acc), - BYTE_STRING => byte_string::validate_byte_string_node(literal.token(), acc), - STRING => string::validate_string_node(literal.token(), acc), - CHAR => char::validate_char_node(literal.token(), acc), + let token = literal.token(); + let text = token.text().as_str(); + match token.kind() { + BYTE => { + if let Some(end) = text.rfind('\'') { + if let Some(without_quotes) = text.get(2..end) { + if let Err((off, err)) = unescape::unescape_byte(without_quotes) { + let off = token.range().start() + TextUnit::from_usize(off + 2); + acc.push(SyntaxError::new(err.into(), off)) + } + } + } + } + CHAR => { + if let Some(end) = text.rfind('\'') { + if let Some(without_quotes) = text.get(1..end) { + if let Err((off, err)) = unescape::unescape_char(without_quotes) { + let off = token.range().start() + TextUnit::from_usize(off + 1); + acc.push(SyntaxError::new(err.into(), off)) + } + } + } + } + BYTE_STRING => { + if let Some(end) = text.rfind('\"') { + if let Some(without_quotes) = text.get(2..end) { + unescape::unescape_byte_str(without_quotes, &mut |range, char| { + if let Err(err) = char { + let off = range.start; + let off = token.range().start() + TextUnit::from_usize(off + 2); + acc.push(SyntaxError::new(err.into(), off)) + } + }) + } + } + } + STRING => { + if let Some(end) = text.rfind('\"') { + if let Some(without_quotes) = text.get(1..end) { + unescape::unescape_str(without_quotes, &mut |range, char| { + if let Err(err) = char { + let off = range.start; + let off = 
token.range().start() + TextUnit::from_usize(off + 1); + acc.push(SyntaxError::new(err.into(), off)) + } + }) + } + } + } _ => (), } } diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs deleted file mode 100644 index f653e65d0..000000000 --- a/crates/ra_syntax/src/validation/byte.rs +++ /dev/null @@ -1,199 +0,0 @@ -//! Validation of byte literals - -use crate::{ - string_lexing::{self, StringComponentKind}, - TextRange, - validation::char, - SyntaxError, - SyntaxErrorKind::*, - SyntaxToken, -}; - -pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec) { - let literal_text = node.text(); - let literal_range = node.range(); - let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text); - let mut len = 0; - for component in &mut components { - len += 1; - let text = &literal_text[component.range]; - let range = component.range + literal_range.start(); - validate_byte_component(text, component.kind, range, errors); - } - - if !components.has_closing_quote { - errors.push(SyntaxError::new(UnclosedByte, literal_range)); - } - - if let Some(range) = components.suffix { - errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); - } - - if len == 0 { - errors.push(SyntaxError::new(EmptyByte, literal_range)); - } - - if len > 1 { - errors.push(SyntaxError::new(OverlongByte, literal_range)); - } -} - -pub(super) fn validate_byte_component( - text: &str, - kind: StringComponentKind, - range: TextRange, - errors: &mut Vec, -) { - use self::StringComponentKind::*; - match kind { - AsciiEscape => validate_byte_escape(text, range, errors), - AsciiCodeEscape => validate_byte_code_escape(text, range, errors), - UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), - CodePoint => { - let c = text.chars().next().expect("Code points should be one character long"); - - // These bytes must always be escaped - if c == '\t' || c == '\r' || c == '\n' { - 
errors.push(SyntaxError::new(UnescapedByte, range)); - } - - // Only ASCII bytes are allowed - if c > 0x7F as char { - errors.push(SyntaxError::new(ByteOutOfRange, range)); - } - } - IgnoreNewline => { /* always valid */ } - } -} - -fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec) { - if text.len() == 1 { - // Escape sequence consists only of leading `\` - errors.push(SyntaxError::new(EmptyByteEscape, range)); - } else { - let escape_code = text.chars().skip(1).next().unwrap(); - if !char::is_ascii_escape(escape_code) { - errors.push(SyntaxError::new(InvalidByteEscape, range)); - } - } -} - -fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec) { - // A ByteCodeEscape has 4 chars, example: `\xDD` - if !text.is_ascii() { - errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); - } else if text.chars().count() < 4 { - errors.push(SyntaxError::new(TooShortByteCodeEscape, range)); - } else { - assert!(text.chars().count() == 4, "ByteCodeEscape cannot be longer than 4 chars"); - - if u8::from_str_radix(&text[2..], 16).is_err() { - errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); - } - } -} - -#[cfg(test)] -mod test { - use crate::{SourceFile, TreeArc}; - - fn build_file(literal: &str) -> TreeArc { - let src = format!("const C: u8 = b'{}';", literal); - SourceFile::parse(&src) - } - - fn assert_valid_byte(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); - } - - fn assert_invalid_byte(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() > 0); - } - - #[test] - fn test_ansi_codepoints() { - for byte in 0..128 { - match byte { - b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()), - b'\'' | b'\\' => { /* Ignore character close and backslash */ } - _ => assert_valid_byte(&(byte as char).to_string()), - } - } - - for byte in 128..=255u8 { - 
assert_invalid_byte(&(byte as char).to_string()); - } - } - - #[test] - fn test_unicode_codepoints() { - let invalid = ["Ƒ", "バ", "メ", "﷽"]; - for c in &invalid { - assert_invalid_byte(c); - } - } - - #[test] - fn test_unicode_multiple_codepoints() { - let invalid = ["नी", "👨‍👨‍"]; - for c in &invalid { - assert_invalid_byte(c); - } - } - - #[test] - fn test_valid_byte_escape() { - let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; - for c in &valid { - assert_valid_byte(c); - } - } - - #[test] - fn test_invalid_byte_escape() { - let invalid = [r"\a", r"\?", r"\"]; - for c in &invalid { - assert_invalid_byte(c); - } - } - - #[test] - fn test_valid_byte_code_escape() { - let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; - for c in &valid { - assert_valid_byte(c); - } - } - - #[test] - fn test_invalid_byte_code_escape() { - let invalid = [r"\x", r"\x7"]; - for c in &invalid { - assert_invalid_byte(c); - } - } - - #[test] - fn test_invalid_unicode_escape() { - let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; - for c in &well_formed { - assert_invalid_byte(c); - } - - let invalid = [ - r"\u", - r"\u{}", - r"\u{", - r"\u{FF", - r"\u{FFFFFF}", - r"\u{_F}", - r"\u{00FFFFF}", - r"\u{110000}", - ]; - for c in &invalid { - assert_invalid_byte(c); - } - } -} diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs deleted file mode 100644 index 1d48c2d9b..000000000 --- a/crates/ra_syntax/src/validation/byte_string.rs +++ /dev/null @@ -1,169 +0,0 @@ -use crate::{ - string_lexing::{self, StringComponentKind}, - SyntaxError, - SyntaxErrorKind::*, - SyntaxToken, -}; - -use super::byte; - -pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec) { - let literal_text = node.text(); - let literal_range = node.range(); - let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text); - for component in &mut components { - let 
range = component.range + literal_range.start(); - - match component.kind { - StringComponentKind::IgnoreNewline => { /* always valid */ } - _ => { - // Chars must escape \t, \n and \r codepoints, but strings don't - let text = &literal_text[component.range]; - match text { - "\t" | "\n" | "\r" => { /* always valid */ } - _ => byte::validate_byte_component(text, component.kind, range, errors), - } - } - } - } - - if !components.has_closing_quote { - errors.push(SyntaxError::new(UnclosedString, literal_range)); - } - - if let Some(range) = components.suffix { - errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); - } -} - -#[cfg(test)] -mod test { - use crate::{SourceFile, TreeArc}; - - fn build_file(literal: &str) -> TreeArc { - let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal); - println!("Source: {}", src); - SourceFile::parse(&src) - } - - fn assert_valid_str(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); - } - - fn assert_invalid_str(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() > 0); - } - - #[test] - fn test_ansi_codepoints() { - for byte in 0..128 { - match byte { - b'\"' | b'\\' => { /* Ignore string close and backslash */ } - _ => assert_valid_str(&(byte as char).to_string()), - } - } - - for byte in 128..=255u8 { - assert_invalid_str(&(byte as char).to_string()); - } - } - - #[test] - fn test_unicode_codepoints() { - let invalid = ["Ƒ", "バ", "メ", "﷽"]; - for c in &invalid { - assert_invalid_str(c); - } - } - - #[test] - fn test_unicode_multiple_codepoints() { - let invalid = ["नी", "👨‍👨‍"]; - for c in &invalid { - assert_invalid_str(c); - } - } - - #[test] - fn test_valid_ascii_escape() { - let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; - for c in &valid { - assert_valid_str(c); - } - } - - #[test] - fn test_invalid_ascii_escape() { - let invalid = [r"\a", 
r"\?", r"\"]; - for c in &invalid { - assert_invalid_str(c); - } - } - - #[test] - fn test_valid_ascii_code_escape() { - let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; - for c in &valid { - assert_valid_str(c); - } - } - - #[test] - fn test_invalid_ascii_code_escape() { - let invalid = [r"\x", r"\x7"]; - for c in &invalid { - assert_invalid_str(c); - } - } - - #[test] - fn test_invalid_unicode_escape() { - let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; - for c in &well_formed { - assert_invalid_str(c); - } - - let invalid = [ - r"\u", - r"\u{}", - r"\u{", - r"\u{FF", - r"\u{FFFFFF}", - r"\u{_F}", - r"\u{00FFFFF}", - r"\u{110000}", - ]; - for c in &invalid { - assert_invalid_str(c); - } - } - - #[test] - fn test_mixed_invalid() { - assert_invalid_str( - r"This is the tale of a string -with a newline in between, some emoji (👨‍👨‍) here and there, -unicode escapes like this: \u{1FFBB} and weird stuff like -this ﷽", - ); - } - - #[test] - fn test_mixed_valid() { - assert_valid_str( - r"This is the tale of a string -with a newline in between, no emoji at all, -nor unicode escapes or weird stuff", - ); - } - - #[test] - fn test_ignore_newline() { - assert_valid_str( - "Hello \ - World", - ); - } -} diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs deleted file mode 100644 index 0f1885873..000000000 --- a/crates/ra_syntax/src/validation/char.rs +++ /dev/null @@ -1,273 +0,0 @@ -//! 
Validation of char literals - -use std::u32; - -use arrayvec::ArrayString; - -use crate::{ - string_lexing::{self, StringComponentKind}, - TextRange, - SyntaxError, - SyntaxErrorKind::*, - SyntaxToken, -}; - -pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec) { - let literal_text = node.text(); - let literal_range = node.range(); - let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text); - let mut len = 0; - for component in &mut components { - len += 1; - let text = &literal_text[component.range]; - let range = component.range + literal_range.start(); - validate_char_component(text, component.kind, range, errors); - } - - if !components.has_closing_quote { - errors.push(SyntaxError::new(UnclosedChar, literal_range)); - } - - if let Some(range) = components.suffix { - errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); - } - - if len == 0 { - errors.push(SyntaxError::new(EmptyChar, literal_range)); - } - - if len > 1 { - errors.push(SyntaxError::new(OverlongChar, literal_range)); - } -} - -pub(super) fn validate_char_component( - text: &str, - kind: StringComponentKind, - range: TextRange, - errors: &mut Vec, -) { - // Validate escapes - use self::StringComponentKind::*; - match kind { - AsciiEscape => validate_ascii_escape(text, range, errors), - AsciiCodeEscape => validate_ascii_code_escape(text, range, errors), - UnicodeEscape => validate_unicode_escape(text, range, errors), - CodePoint => { - // These code points must always be escaped - if text == "\t" || text == "\r" || text == "\n" { - errors.push(SyntaxError::new(UnescapedCodepoint, range)); - } - } - StringComponentKind::IgnoreNewline => { /* always valid */ } - } -} - -fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec) { - if text.len() == 1 { - // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. 
'\' is treated as an unclosed char containing a single quote `'`) - errors.push(SyntaxError::new(EmptyAsciiEscape, range)); - } else { - let escape_code = text.chars().skip(1).next().unwrap(); - if !is_ascii_escape(escape_code) { - errors.push(SyntaxError::new(InvalidAsciiEscape, range)); - } - } -} - -pub(super) fn is_ascii_escape(code: char) -> bool { - match code { - '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, - _ => false, - } -} - -fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec) { - // An AsciiCodeEscape has 4 chars, example: `\xDD` - if !text.is_ascii() { - // FIXME: Give a more precise error message (say what the invalid character was) - errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)); - } else if text.chars().count() < 4 { - errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); - } else { - assert_eq!( - text.chars().count(), - 4, - "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is", - text, - ); - - match u8::from_str_radix(&text[2..], 16) { - Ok(code) if code < 128 => { /* Escape code is valid */ } - Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), - Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), - } - } -} - -fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec) { - assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); - - if text.len() == 2 { - // No starting `{` - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - return; - } - - if text.len() == 3 { - // Only starting `{` - errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); - return; - } - - let mut code = ArrayString::<[_; 6]>::new(); - let mut closed = false; - for c in text[3..].chars() { - assert!(!closed, "no characters after escape is closed"); - - if c.is_digit(16) { - if code.len() == 6 { - errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); - return; - } - - code.push(c); - } else if c == '_' { - 
// Reject leading _ - if code.len() == 0 { - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - return; - } - } else if c == '}' { - closed = true; - } else { - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - return; - } - } - - if !closed { - errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) - } - - if code.len() == 0 { - errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); - return; - } - - match u32::from_str_radix(&code, 16) { - Ok(code_u32) if code_u32 > 0x10FFFF => { - errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); - } - Ok(_) => { - // Valid escape code - } - Err(_) => { - errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); - } - } -} - -#[cfg(test)] -mod test { - use crate::{SourceFile, TreeArc}; - - fn build_file(literal: &str) -> TreeArc { - let src = format!("const C: char = '{}';", literal); - SourceFile::parse(&src) - } - - fn assert_valid_char(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); - } - - fn assert_invalid_char(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() > 0); - } - - #[test] - fn test_ansi_codepoints() { - for byte in 0..=255u8 { - match byte { - b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()), - b'\'' | b'\\' => { /* Ignore character close and backslash */ } - _ => assert_valid_char(&(byte as char).to_string()), - } - } - } - - #[test] - fn test_unicode_codepoints() { - let valid = ["Ƒ", "バ", "メ", "﷽"]; - for c in &valid { - assert_valid_char(c); - } - } - - #[test] - fn test_unicode_multiple_codepoints() { - let invalid = ["नी", "👨‍👨‍"]; - for c in &invalid { - assert_invalid_char(c); - } - } - - #[test] - fn test_valid_ascii_escape() { - let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; - for c in &valid { - assert_valid_char(c); - } - } - - #[test] - fn test_invalid_ascii_escape() { - let 
invalid = [r"\a", r"\?", r"\"]; - for c in &invalid { - assert_invalid_char(c); - } - } - - #[test] - fn test_valid_ascii_code_escape() { - let valid = [r"\x00", r"\x7F", r"\x55"]; - for c in &valid { - assert_valid_char(c); - } - } - - #[test] - fn test_invalid_ascii_code_escape() { - let invalid = [r"\x", r"\x7", r"\xF0"]; - for c in &invalid { - assert_invalid_char(c); - } - } - - #[test] - fn test_valid_unicode_escape() { - let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; - for c in &valid { - assert_valid_char(c); - } - } - - #[test] - fn test_invalid_unicode_escape() { - let invalid = [ - r"\u", - r"\u{}", - r"\u{", - r"\u{FF", - r"\u{FFFFFF}", - r"\u{_F}", - r"\u{00FFFFF}", - r"\u{110000}", - ]; - for c in &invalid { - assert_invalid_char(c); - } - } -} diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs deleted file mode 100644 index fc2f1b992..000000000 --- a/crates/ra_syntax/src/validation/string.rs +++ /dev/null @@ -1,154 +0,0 @@ -use crate::{ - string_lexing, - SyntaxError, - SyntaxErrorKind::*, - SyntaxToken, -}; - -use super::char; - -pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec) { - let literal_text = node.text(); - let literal_range = node.range(); - let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text); - for component in &mut components { - let range = component.range + literal_range.start(); - - // Chars must escape \t, \n and \r codepoints, but strings don't - let text = &literal_text[component.range]; - match text { - "\t" | "\n" | "\r" => { /* always valid */ } - _ => char::validate_char_component(text, component.kind, range, errors), - } - } - - if !components.has_closing_quote { - errors.push(SyntaxError::new(UnclosedString, literal_range)); - } - - if let Some(range) = components.suffix { - errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start())); - } -} - -#[cfg(test)] -mod test { - 
use crate::{SourceFile, TreeArc}; - - fn build_file(literal: &str) -> TreeArc { - let src = format!(r#"const S: &'static str = "{}";"#, literal); - println!("Source: {}", src); - SourceFile::parse(&src) - } - - fn assert_valid_str(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors()); - } - - fn assert_invalid_str(literal: &str) { - let file = build_file(literal); - assert!(file.errors().len() > 0); - } - - #[test] - fn test_ansi_codepoints() { - for byte in 0..=255u8 { - match byte { - b'\"' | b'\\' => { /* Ignore string close and backslash */ } - _ => assert_valid_str(&(byte as char).to_string()), - } - } - } - - #[test] - fn test_unicode_codepoints() { - let valid = ["Ƒ", "バ", "メ", "﷽"]; - for c in &valid { - assert_valid_str(c); - } - } - - #[test] - fn test_unicode_multiple_codepoints() { - let valid = ["नी", "👨‍👨‍"]; - for c in &valid { - assert_valid_str(c); - } - } - - #[test] - fn test_valid_ascii_escape() { - let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; - for c in &valid { - assert_valid_str(c); - } - } - - #[test] - fn test_invalid_ascii_escape() { - let invalid = [r"\a", r"\?", r"\"]; - for c in &invalid { - assert_invalid_str(c); - } - } - - #[test] - fn test_valid_ascii_code_escape() { - let valid = [r"\x00", r"\x7F", r"\x55"]; - for c in &valid { - assert_valid_str(c); - } - } - - #[test] - fn test_invalid_ascii_code_escape() { - let invalid = [r"\x", r"\x7", r"\xF0"]; - for c in &invalid { - assert_invalid_str(c); - } - } - - #[test] - fn test_valid_unicode_escape() { - let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"]; - for c in &valid { - assert_valid_str(c); - } - } - - #[test] - fn test_invalid_unicode_escape() { - let invalid = [ - r"\u", - r"\u{}", - r"\u{", - r"\u{FF", - r"\u{FFFFFF}", - r"\u{_F}", - r"\u{00FFFFF}", - r"\u{110000}", - ]; - for c in &invalid { - assert_invalid_str(c); - } - 
// Utilities for validating string and char literals and turning them into
// values they represent.
//
// This file is copy-pasted from the compiler
//
// https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
//
// Hopefully, we'll share this code in a proper way some day

use std::str::Chars;
use std::ops::Range;

/// Reasons why the contents of a char, byte, string or byte string literal
/// could not be unescaped.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub enum EscapeError {
    /// A char/byte literal contained no characters at all.
    ZeroChars,
    /// A char/byte literal contained something after its first character.
    MoreThanOneChar,

    /// A `\` was the last character of the input.
    LoneSlash,
    /// The character after `\` does not start any known escape (e.g. `\v`).
    InvalidEscape,
    /// A raw `\r` that is not followed by `\n`.
    BareCarriageReturn,
    /// A raw character that has to be written as an escape instead
    /// (tab, newline, or the quote that delimits the literal).
    EscapeOnlyChar,

    /// Fewer than two characters followed `\x`.
    TooShortHexEscape,
    /// A character after `\x` is not a hexadecimal digit.
    InvalidCharInHexEscape,
    /// `\xNN` is above `0x7F` in a char or string literal.
    OutOfRangeHexEscape,

    /// `\u` was not followed by `{`.
    NoBraceInUnicodeEscape,
    /// A character inside `\u{...}` is neither a hex digit, `_` nor `}`.
    InvalidCharInUnicodeEscape,
    /// `\u{}` with nothing between the braces.
    EmptyUnicodeEscape,
    /// The input ended before the closing `}` of `\u{...}`.
    UnclosedUnicodeEscape,
    /// `\u{_...}`: the first character after `{` is an underscore.
    LeadingUnderscoreUnicodeEscape,
    /// More than six hex digits inside `\u{...}`.
    OverlongUnicodeEscape,
    /// The value is at most `0x10FFFF` but is not a valid `char`
    /// (i.e. it lies in the surrogate range).
    LoneSurrogateUnicodeEscape,
    /// The value is above `0x10FFFF`.
    OutOfRangeUnicodeEscape,

    /// A `\u{...}` escape was used in a byte or byte string literal.
    UnicodeEscapeInByte,
    /// A raw non-ASCII character was used in a byte or byte string literal.
    NonAsciiCharInByte,
}

/// Takes the contents of a char literal (without quotes), and returns an
/// unescaped char or an error.
///
/// On failure the error is paired with the number of bytes of `literal_text`
/// that had been consumed when the error was detected.
pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
    let mut chars = literal_text.chars();
    unescape_char_or_byte(&mut chars, Mode::Char)
        .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
}

/// Takes the contents of a string literal (without quotes) and produces a
/// sequence of unescaped characters or errors.
///
/// `callback` is invoked once per character or escape sequence with the byte
/// range it occupies in `literal_text` and the result of unescaping it.
/// Line continuations (`\` followed by a newline) produce no callback.
pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    unescape_str_or_byte_str(literal_text, Mode::Str, callback)
}

/// Takes the contents of a byte literal (without quotes), and returns an
/// unescaped byte or an error.
///
/// On failure the error is paired with the number of bytes of `literal_text`
/// that had been consumed when the error was detected.
pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
    let mut chars = literal_text.chars();
    unescape_char_or_byte(&mut chars, Mode::Byte)
        .map(byte_from_char)
        .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
}

/// Takes the contents of a byte string literal (without quotes) and produces
/// a sequence of unescaped bytes or errors.
///
/// `callback` is invoked once per character or escape sequence with the byte
/// range it occupies in `literal_text` and the result of unescaping it.
pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<u8, EscapeError>),
{
    unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
        callback(range, char.map(byte_from_char))
    })
}

/// The kind of literal whose contents are being unescaped.
#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// `true` for literals delimited by `'` (char and byte literals).
    fn in_single_quotes(self) -> bool {
        match self {
            Mode::Char | Mode::Byte => true,
            Mode::Str | Mode::ByteStr => false,
        }
    }

    /// `true` for literals delimited by `"` (string and byte string literals).
    pub(crate) fn in_double_quotes(self) -> bool {
        !self.in_single_quotes()
    }

    /// `true` for byte and byte string literals.
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Byte | Mode::ByteStr => true,
            Mode::Char | Mode::Str => false,
        }
    }
}

/// Unescapes a single character.
///
/// `first_char` has already been taken from `chars`; if it starts an escape
/// sequence, the remainder of that sequence is consumed from `chars`.
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
    if first_char != '\\' {
        // Not an escape: only check that the raw character is allowed here.
        return match first_char {
            '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
            // Peek (without consuming) to tell `\r\n` apart from a bare `\r`.
            '\r' => Err(if chars.clone().next() == Some('\n') {
                EscapeError::EscapeOnlyChar
            } else {
                EscapeError::BareCarriageReturn
            }),
            '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
            '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
            _ => {
                if mode.is_bytes() && !first_char.is_ascii() {
                    return Err(EscapeError::NonAsciiCharInByte);
                }
                Ok(first_char)
            }
        };
    }

    let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;

    let res = match second_char {
        '"' => '"',
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        '\\' => '\\',
        '\'' => '\'',
        '0' => '\0',

        'x' => {
            // Exactly two hex digits are required.
            let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
            let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;

            let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
            let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;

            let value = hi * 16 + lo;

            // Char and string literals only accept `\x00..=\x7F`; byte
            // literals accept the whole `\x00..=\xFF` range.
            if !mode.is_bytes() && !is_ascii(value) {
                return Err(EscapeError::OutOfRangeHexEscape);
            }
            // Two hex digits always fit in a byte.
            let value = value as u8;

            value as char
        }

        'u' => {
            if chars.next() != Some('{') {
                return Err(EscapeError::NoBraceInUnicodeEscape);
            }

            // The first character after `{` must be a digit.
            let mut n_digits = 1;
            let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
                '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
                '}' => return Err(EscapeError::EmptyUnicodeEscape),
                c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
            };

            loop {
                match chars.next() {
                    None => return Err(EscapeError::UnclosedUnicodeEscape),
                    Some('_') => continue,
                    Some('}') => {
                        // Length is reported first, then the literal kind,
                        // and only then the value itself.
                        if n_digits > 6 {
                            return Err(EscapeError::OverlongUnicodeEscape);
                        }
                        if mode.is_bytes() {
                            return Err(EscapeError::UnicodeEscapeInByte);
                        }

                        break std::char::from_u32(value).ok_or_else(|| {
                            if value > 0x10FFFF {
                                EscapeError::OutOfRangeUnicodeEscape
                            } else {
                                EscapeError::LoneSurrogateUnicodeEscape
                            }
                        })?;
                    }
                    Some(c) => {
                        let digit =
                            c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
                        n_digits += 1;
                        if n_digits > 6 {
                            // Keep validating and counting, but stop
                            // accumulating so `value` cannot overflow.
                            continue;
                        }
                        value = value * 16 + digit;
                    }
                };
            }
        }
        _ => return Err(EscapeError::InvalidEscape),
    };
    Ok(res)
}

/// Unescapes the contents of a char or byte literal: exactly one character
/// or escape sequence must be present in `chars`.
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
    let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
    let res = scan_escape(first_char, chars, mode)?;
    if chars.next().is_some() {
        return Err(EscapeError::MoreThanOneChar);
    }
    Ok(res)
}

/// Takes the contents of a string or byte string literal (without quotes)
/// and produces a sequence of unescaped characters or errors.
///
/// `mode` must be one of the double-quoted modes. For byte strings the
/// characters passed to `callback` are meant to be narrowed with
/// `byte_from_char` by the caller.
fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    assert!(mode.in_double_quotes());
    let initial_len = src.len();
    let mut chars = src.chars();
    while let Some(first_char) = chars.next() {
        // Byte offset of `first_char` within `src`.
        let start = initial_len - chars.as_str().len() - first_char.len_utf8();

        let unescaped_char = match first_char {
            '\\' => {
                // Look ahead on a copy so nothing is consumed yet.
                let (second_char, third_char) = {
                    let mut chars = chars.clone();
                    (chars.next(), chars.next())
                };
                match (second_char, third_char) {
                    // Line continuation: drop the newline and the leading
                    // whitespace of the next line; nothing is reported.
                    (Some('\n'), _) | (Some('\r'), Some('\n')) => {
                        skip_ascii_whitespace(&mut chars);
                        continue;
                    }
                    _ => scan_escape(first_char, &mut chars, mode),
                }
            }
            '\r' => {
                let second_char = chars.clone().next();
                if second_char == Some('\n') {
                    // `\r\n` is reported as a single `\n`.
                    chars.next();
                    Ok('\n')
                } else {
                    scan_escape(first_char, &mut chars, mode)
                }
            }
            // Unlike in char literals, raw newlines and tabs are allowed.
            '\n' => Ok('\n'),
            '\t' => Ok('\t'),
            _ => scan_escape(first_char, &mut chars, mode),
        };
        let end = initial_len - chars.as_str().len();
        callback(start..end, unescaped_char);
    }

    /// Advances `chars` past any leading spaces, tabs, `\n` and `\r`.
    fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
        let str = chars.as_str();
        let first_non_space = str
            .bytes()
            .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
            .unwrap_or(str.len());
        *chars = str[first_non_space..].chars()
    }
}

/// Narrows a character produced in one of the byte modes to a `u8`.
///
/// Panics if the character does not fit in a byte.
fn byte_from_char(c: char) -> u8 {
    let res = c as u32;
    assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte");
    res as u8
}

/// `true` if `x` is in the ASCII range (`0x00..=0x7F`).
fn is_ascii(x: u32) -> bool {
    x <= 0x7F
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_unescape_char_bad() {
        fn check(literal_text: &str, expected_error: EscapeError) {
            let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
            assert_eq!(actual_result, Err(expected_error));
        }

        check("", EscapeError::ZeroChars);
        check(r"\", EscapeError::LoneSlash);

        check("\n", EscapeError::EscapeOnlyChar);
        check("\r\n", EscapeError::EscapeOnlyChar);
        check("\t", EscapeError::EscapeOnlyChar);
        check("'", EscapeError::EscapeOnlyChar);
        check("\r", EscapeError::BareCarriageReturn);

        check("spam", EscapeError::MoreThanOneChar);
        check(r"\x0ff", EscapeError::MoreThanOneChar);
        check(r#"\"a"#, EscapeError::MoreThanOneChar);
        check(r"\na", EscapeError::MoreThanOneChar);
        check(r"\ra", EscapeError::MoreThanOneChar);
        check(r"\ta", EscapeError::MoreThanOneChar);
        check(r"\\a", EscapeError::MoreThanOneChar);
        check(r"\'a", EscapeError::MoreThanOneChar);
        check(r"\0a", EscapeError::MoreThanOneChar);
        check(r"\u{0}x", EscapeError::MoreThanOneChar);
        check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);

        check(r"\v", EscapeError::InvalidEscape);
        check(r"\💩", EscapeError::InvalidEscape);
        check(r"\●", EscapeError::InvalidEscape);

        check(r"\x", EscapeError::TooShortHexEscape);
        check(r"\x0", EscapeError::TooShortHexEscape);
        check(r"\xf", EscapeError::TooShortHexEscape);
        check(r"\xa", EscapeError::TooShortHexEscape);
        check(r"\xx", EscapeError::InvalidCharInHexEscape);
        check(r"\xы", EscapeError::InvalidCharInHexEscape);
        check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
        check(r"\xtt", EscapeError::InvalidCharInHexEscape);
        check(r"\xff", EscapeError::OutOfRangeHexEscape);
        check(r"\xFF", EscapeError::OutOfRangeHexEscape);
        check(r"\x80", EscapeError::OutOfRangeHexEscape);

        check(r"\u", EscapeError::NoBraceInUnicodeEscape);
        check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
        check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
        check(r"\u{", EscapeError::UnclosedUnicodeEscape);
        check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
        check(r"\u{}", EscapeError::EmptyUnicodeEscape);
        check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
        check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
        check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
        check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);

        check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
        check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
        check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);

        check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
        check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
        check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
    }

    #[test]
    fn test_unescape_char_good() {
        fn check(literal_text: &str, expected_char: char) {
            let actual_result = unescape_char(literal_text);
            assert_eq!(actual_result, Ok(expected_char));
        }

        check("a", 'a');
        check("ы", 'ы');
        check("🦀", '🦀');

        check(r#"\""#, '"');
        check(r"\n", '\n');
        check(r"\r", '\r');
        check(r"\t", '\t');
        check(r"\\", '\\');
        check(r"\'", '\'');
        check(r"\0", '\0');

        check(r"\x00", '\0');
        check(r"\x5a", 'Z');
        check(r"\x5A", 'Z');
        check(r"\x7f", 127 as char);

        check(r"\u{0}", '\0');
        check(r"\u{000000}", '\0');
        check(r"\u{41}", 'A');
        check(r"\u{0041}", 'A');
        check(r"\u{00_41}", 'A');
        check(r"\u{4__1__}", 'A');
        check(r"\u{1F63b}", '😻');
    }

    #[test]
    fn test_unescape_str_good() {
        fn check(literal_text: &str, expected: &str) {
            let mut buf = Ok(String::with_capacity(literal_text.len()));
            unescape_str(literal_text, &mut |range, c| {
                if let Ok(b) = &mut buf {
                    match c {
                        Ok(c) => b.push(c),
                        Err(e) => buf = Err((range, e)),
                    }
                }
            });
            let buf = buf.as_ref().map(|it| it.as_ref());
            assert_eq!(buf, Ok(expected))
        }

        check("foo", "foo");
        check("", "");
        check(" \t\n\r\n", " \t\n\n");

        check("hello \\\n     world", "hello world");
        check("hello \\\r\n     world", "hello world");
        check("thread's", "thread's")
    }

    #[test]
    fn test_unescape_byte_bad() {
        fn check(literal_text: &str, expected_error: EscapeError) {
            let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
            assert_eq!(actual_result, Err(expected_error));
        }

        check("", EscapeError::ZeroChars);
        check(r"\", EscapeError::LoneSlash);

        check("\n", EscapeError::EscapeOnlyChar);
        check("\r\n", EscapeError::EscapeOnlyChar);
        check("\t", EscapeError::EscapeOnlyChar);
        check("'", EscapeError::EscapeOnlyChar);
        check("\r", EscapeError::BareCarriageReturn);

        check("spam", EscapeError::MoreThanOneChar);
        check(r"\x0ff", EscapeError::MoreThanOneChar);
        check(r#"\"a"#, EscapeError::MoreThanOneChar);
        check(r"\na", EscapeError::MoreThanOneChar);
        check(r"\ra", EscapeError::MoreThanOneChar);
        check(r"\ta", EscapeError::MoreThanOneChar);
        check(r"\\a", EscapeError::MoreThanOneChar);
        check(r"\'a", EscapeError::MoreThanOneChar);
        check(r"\0a", EscapeError::MoreThanOneChar);

        check(r"\v", EscapeError::InvalidEscape);
        check(r"\💩", EscapeError::InvalidEscape);
        check(r"\●", EscapeError::InvalidEscape);

        check(r"\x", EscapeError::TooShortHexEscape);
        check(r"\x0", EscapeError::TooShortHexEscape);
        check(r"\xa", EscapeError::TooShortHexEscape);
        check(r"\xf", EscapeError::TooShortHexEscape);
        check(r"\xx", EscapeError::InvalidCharInHexEscape);
        check(r"\xы", EscapeError::InvalidCharInHexEscape);
        check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
        check(r"\xtt", EscapeError::InvalidCharInHexEscape);

        check(r"\u", EscapeError::NoBraceInUnicodeEscape);
        check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
        check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
        check(r"\u{", EscapeError::UnclosedUnicodeEscape);
        check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
        check(r"\u{}", EscapeError::EmptyUnicodeEscape);
        check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
        check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);

        check("ы", EscapeError::NonAsciiCharInByte);
        check("🦀", EscapeError::NonAsciiCharInByte);

        check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
        check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
        check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
    }

    #[test]
    fn test_unescape_byte_good() {
        fn check(literal_text: &str, expected_byte: u8) {
            let actual_result = unescape_byte(literal_text);
            assert_eq!(actual_result, Ok(expected_byte));
        }

        check("a", b'a');

        check(r#"\""#, b'"');
        check(r"\n", b'\n');
        check(r"\r", b'\r');
        check(r"\t", b'\t');
        check(r"\\", b'\\');
        check(r"\'", b'\'');
        check(r"\0", b'\0');

        check(r"\x00", b'\0');
        check(r"\x5a", b'Z');
        check(r"\x5A", b'Z');
        check(r"\x7f", 127);
        check(r"\x80", 128);
        check(r"\xff", 255);
        check(r"\xFF", 255);
    }

    #[test]
    fn test_unescape_byte_str_good() {
        fn check(literal_text: &str, expected: &[u8]) {
            let mut buf = Ok(Vec::with_capacity(literal_text.len()));
            unescape_byte_str(literal_text, &mut |range, c| {
                if let Ok(b) = &mut buf {
                    match c {
                        Ok(c) => b.push(c),
                        Err(e) => buf = Err((range, e)),
                    }
                }
            });
            let buf = buf.as_ref().map(|it| it.as_ref());
            assert_eq!(buf, Ok(expected))
        }

        check("foo", b"foo");
        check("", b"");
        check(" \t\n\r\n", b" \t\n\n");

        check("hello \\\n     world", b"hello world");
        check("hello \\\r\n     world", b"hello world");
        check("thread's", b"thread's")
    }
}