share literal validation logic with compiler

author: Aleksey Kladov <[email protected]> 2019-05-07 17:38:26 +0100
committer: Aleksey Kladov <[email protected]> 2019-05-07 17:41:59 +0100
commit: 313314e14b629ebf50389dbd2d440bda922f6ae7 (patch)
tree: ca6a4aee6ad4077a869a932a18c6c8d134406f8c /crates/ra_syntax/src/validation
parent: ef782adc293deb287128f005dbab2038ba3ccdc1 (diff)
5 files changed, 521 insertions, 795 deletions
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs
deleted file mode 100644
index f653e65d0..000000000
--- a/crates/ra_syntax/src/validation/byte.rs
+++ /dev/null
@@ -1,199 +0,0 @@
-//! Validation of byte literals
-use crate::{
-    string_lexing::{self, StringComponentKind},
-    TextRange,
-    validation::char,
-    SyntaxError,
-    SyntaxErrorKind::*,
-    SyntaxToken,
-};
-pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
-    let literal_text = node.text();
-    let literal_range = node.range();
-    let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text);
-    let mut len = 0;
-    for component in &mut components {
-        len += 1;
-        let text = &literal_text[component.range];
-        let range = component.range + literal_range.start();
-        validate_byte_component(text, component.kind, range, errors);
-    }
-    if !components.has_closing_quote {
-        errors.push(SyntaxError::new(UnclosedByte, literal_range));
-    }
-    if let Some(range) = components.suffix {
-        errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
-    }
-    if len == 0 {
-        errors.push(SyntaxError::new(EmptyByte, literal_range));
-    }
-    if len > 1 {
-        errors.push(SyntaxError::new(OverlongByte, literal_range));
-    }
-}
-pub(super) fn validate_byte_component(
-    text: &str,
-    kind: StringComponentKind,
-    range: TextRange,
-    errors: &mut Vec<SyntaxError>,
-) {
-    use self::StringComponentKind::*;
-    match kind {
-        AsciiEscape => validate_byte_escape(text, range, errors),
-        AsciiCodeEscape => validate_byte_code_escape(text, range, errors),
-        UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)),
-        CodePoint => {
-            let c = text.chars().next().expect("Code points should be one character long");
-            // These bytes must always be escaped
-            if c == '\t' || c == '\r' || c == '\n' {
-                errors.push(SyntaxError::new(UnescapedByte, range));
-            }
-            // Only ASCII bytes are allowed
-            if c > 0x7F as char {
-                errors.push(SyntaxError::new(ByteOutOfRange, range));
-            }
-        }
-        IgnoreNewline => { /* always valid */ }
-    }
-}
-fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    if text.len() == 1 {
-        // Escape sequence consists only of leading `\`
-        errors.push(SyntaxError::new(EmptyByteEscape, range));
-    } else {
-        let escape_code = text.chars().skip(1).next().unwrap();
-        if !char::is_ascii_escape(escape_code) {
-            errors.push(SyntaxError::new(InvalidByteEscape, range));
-        }
-    }
-}
-fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    // A ByteCodeEscape has 4 chars, example: `\xDD`
-    if !text.is_ascii() {
-        errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
-    } else if text.chars().count() < 4 {
-        errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
-    } else {
-        assert!(text.chars().count() == 4, "ByteCodeEscape cannot be longer than 4 chars");
-        if u8::from_str_radix(&text[2..], 16).is_err() {
-            errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
-        }
-    }
-}
-#[cfg(test)]
-mod test {
-    use crate::{SourceFile, TreeArc};
-    fn build_file(literal: &str) -> TreeArc<SourceFile> {
-        let src = format!("const C: u8 = b'{}';", literal);
-        SourceFile::parse(&src)
-    }
-    fn assert_valid_byte(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
-    }
-    fn assert_invalid_byte(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() > 0);
-    }
-    #[test]
-    fn test_ansi_codepoints() {
-        for byte in 0..128 {
-            match byte {
-                b'\n' | b'\r' | b'\t' => assert_invalid_byte(&(byte as char).to_string()),
-                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
-                _ => assert_valid_byte(&(byte as char).to_string()),
-            }
-        }
-        for byte in 128..=255u8 {
-            assert_invalid_byte(&(byte as char).to_string());
-        }
-    }
-    #[test]
-    fn test_unicode_codepoints() {
-        let invalid = ["Ƒ", "バ", "メ", "﷽"];
-        for c in &invalid {
-            assert_invalid_byte(c);
-        }
-    }
-    #[test]
-    fn test_unicode_multiple_codepoints() {
-        let invalid = ["नी", "👨‍👨‍"];
-        for c in &invalid {
-            assert_invalid_byte(c);
-        }
-    }
-    #[test]
-    fn test_valid_byte_escape() {
-        let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
-        for c in &valid {
-            assert_valid_byte(c);
-        }
-    }
-    #[test]
-    fn test_invalid_byte_escape() {
-        let invalid = [r"\a", r"\?", r"\"];
-        for c in &invalid {
-            assert_invalid_byte(c);
-        }
-    }
-    #[test]
-    fn test_valid_byte_code_escape() {
-        let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
-        for c in &valid {
-            assert_valid_byte(c);
-        }
-    }
-    #[test]
-    fn test_invalid_byte_code_escape() {
-        let invalid = [r"\x", r"\x7"];
-        for c in &invalid {
-            assert_invalid_byte(c);
-        }
-    }
-    #[test]
-    fn test_invalid_unicode_escape() {
-        let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
-        for c in &well_formed {
-            assert_invalid_byte(c);
-        }
-        let invalid = [
-            r"\u",
-            r"\u{}",
-            r"\u{",
-            r"\u{FF",
-            r"\u{FFFFFF}",
-            r"\u{_F}",
-            r"\u{00FFFFF}",
-            r"\u{110000}",
-        ];
-        for c in &invalid {
-            assert_invalid_byte(c);
-        }
-    }
-}
diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs
deleted file mode 100644
index 1d48c2d9b..000000000
--- a/crates/ra_syntax/src/validation/byte_string.rs
+++ /dev/null
@@ -1,169 +0,0 @@
-use crate::{
-    string_lexing::{self, StringComponentKind},
-    SyntaxError,
-    SyntaxErrorKind::*,
-    SyntaxToken,
-};
-use super::byte;
-pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
-    let literal_text = node.text();
-    let literal_range = node.range();
-    let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text);
-    for component in &mut components {
-        let range = component.range + literal_range.start();
-        match component.kind {
-            StringComponentKind::IgnoreNewline => { /* always valid */ }
-            _ => {
-                // Chars must escape \t, \n and \r codepoints, but strings don't
-                let text = &literal_text[component.range];
-                match text {
-                    "\t" | "\n" | "\r" => { /* always valid */ }
-                    _ => byte::validate_byte_component(text, component.kind, range, errors),
-                }
-            }
-        }
-    }
-    if !components.has_closing_quote {
-        errors.push(SyntaxError::new(UnclosedString, literal_range));
-    }
-    if let Some(range) = components.suffix {
-        errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
-    }
-}
-#[cfg(test)]
-mod test {
-    use crate::{SourceFile, TreeArc};
-    fn build_file(literal: &str) -> TreeArc<SourceFile> {
-        let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
-        println!("Source: {}", src);
-        SourceFile::parse(&src)
-    }
-    fn assert_valid_str(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
-    }
-    fn assert_invalid_str(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() > 0);
-    }
-    #[test]
-    fn test_ansi_codepoints() {
-        for byte in 0..128 {
-            match byte {
-                b'\"' | b'\\' => { /* Ignore string close and backslash */ }
-                _ => assert_valid_str(&(byte as char).to_string()),
-            }
-        }
-        for byte in 128..=255u8 {
-            assert_invalid_str(&(byte as char).to_string());
-        }
-    }
-    #[test]
-    fn test_unicode_codepoints() {
-        let invalid = ["Ƒ", "バ", "メ", "﷽"];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_unicode_multiple_codepoints() {
-        let invalid = ["नी", "👨‍👨‍"];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_escape() {
-        let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
-        for c in &valid {
-            assert_valid_str(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_escape() {
-        let invalid = [r"\a", r"\?", r"\"];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_code_escape() {
-        let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
-        for c in &valid {
-            assert_valid_str(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_code_escape() {
-        let invalid = [r"\x", r"\x7"];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_invalid_unicode_escape() {
-        let well_formed = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
-        for c in &well_formed {
-            assert_invalid_str(c);
-        }
-        let invalid = [
-            r"\u",
-            r"\u{}",
-            r"\u{",
-            r"\u{FF",
-            r"\u{FFFFFF}",
-            r"\u{_F}",
-            r"\u{00FFFFF}",
-            r"\u{110000}",
-        ];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_mixed_invalid() {
-        assert_invalid_str(
-            r"This is the tale of a string
-with a newline in between, some emoji (👨‍👨‍) here and there,
-unicode escapes like this: \u{1FFBB} and weird stuff like
-this ﷽",
-        );
-    }
-    #[test]
-    fn test_mixed_valid() {
-        assert_valid_str(
-            r"This is the tale of a string
-with a newline in between, no emoji at all,
-nor unicode escapes or weird stuff",
-        );
-    }
-    #[test]
-    fn test_ignore_newline() {
-        assert_valid_str(
-            "Hello \
-             World",
-        );
-    }
-}
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs
deleted file mode 100644
index 0f1885873..000000000
--- a/crates/ra_syntax/src/validation/char.rs
+++ /dev/null
@@ -1,273 +0,0 @@
-//! Validation of char literals
-use std::u32;
-use arrayvec::ArrayString;
-use crate::{
-    string_lexing::{self, StringComponentKind},
-    TextRange,
-    SyntaxError,
-    SyntaxErrorKind::*,
-    SyntaxToken,
-};
-pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
-    let literal_text = node.text();
-    let literal_range = node.range();
-    let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text);
-    let mut len = 0;
-    for component in &mut components {
-        len += 1;
-        let text = &literal_text[component.range];
-        let range = component.range + literal_range.start();
-        validate_char_component(text, component.kind, range, errors);
-    }
-    if !components.has_closing_quote {
-        errors.push(SyntaxError::new(UnclosedChar, literal_range));
-    }
-    if let Some(range) = components.suffix {
-        errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
-    }
-    if len == 0 {
-        errors.push(SyntaxError::new(EmptyChar, literal_range));
-    }
-    if len > 1 {
-        errors.push(SyntaxError::new(OverlongChar, literal_range));
-    }
-}
-pub(super) fn validate_char_component(
-    text: &str,
-    kind: StringComponentKind,
-    range: TextRange,
-    errors: &mut Vec<SyntaxError>,
-) {
-    // Validate escapes
-    use self::StringComponentKind::*;
-    match kind {
-        AsciiEscape => validate_ascii_escape(text, range, errors),
-        AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
-        UnicodeEscape => validate_unicode_escape(text, range, errors),
-        CodePoint => {
-            // These code points must always be escaped
-            if text == "\t" || text == "\r" || text == "\n" {
-                errors.push(SyntaxError::new(UnescapedCodepoint, range));
-            }
-        }
-        StringComponentKind::IgnoreNewline => { /* always valid */ }
-    }
-}
-fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    if text.len() == 1 {
-        // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is treated as an unclosed char containing a single quote `'`)
-        errors.push(SyntaxError::new(EmptyAsciiEscape, range));
-    } else {
-        let escape_code = text.chars().skip(1).next().unwrap();
-        if !is_ascii_escape(escape_code) {
-            errors.push(SyntaxError::new(InvalidAsciiEscape, range));
-        }
-    }
-}
-pub(super) fn is_ascii_escape(code: char) -> bool {
-    match code {
-        '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
-        _ => false,
-    }
-}
-fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    // An AsciiCodeEscape has 4 chars, example: `\xDD`
-    if !text.is_ascii() {
-        // FIXME: Give a more precise error message (say what the invalid character was)
-        errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
-    } else if text.chars().count() < 4 {
-        errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
-    } else {
-        assert_eq!(
-            text.chars().count(),
-            4,
-            "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
-            text,
-        );
-        match u8::from_str_radix(&text[2..], 16) {
-            Ok(code) if code < 128 => { /* Escape code is valid */ }
-            Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
-            Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
-        }
-    }
-}
-fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
-    assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
-    if text.len() == 2 {
-        // No starting `{`
-        errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-        return;
-    }
-    if text.len() == 3 {
-        // Only starting `{`
-        errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
-        return;
-    }
-    let mut code = ArrayString::<[_; 6]>::new();
-    let mut closed = false;
-    for c in text[3..].chars() {
-        assert!(!closed, "no characters after escape is closed");
-        if c.is_digit(16) {
-            if code.len() == 6 {
-                errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
-                return;
-            }
-            code.push(c);
-        } else if c == '_' {
-            // Reject leading _
-            if code.len() == 0 {
-                errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-                return;
-            }
-        } else if c == '}' {
-            closed = true;
-        } else {
-            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-            return;
-        }
-    }
-    if !closed {
-        errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
-    }
-    if code.len() == 0 {
-        errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
-        return;
-    }
-    match u32::from_str_radix(&code, 16) {
-        Ok(code_u32) if code_u32 > 0x10FFFF => {
-            errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
-        }
-        Ok(_) => {
-            // Valid escape code
-        }
-        Err(_) => {
-            errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
-        }
-    }
-}
-#[cfg(test)]
-mod test {
-    use crate::{SourceFile, TreeArc};
-    fn build_file(literal: &str) -> TreeArc<SourceFile> {
-        let src = format!("const C: char = '{}';", literal);
-        SourceFile::parse(&src)
-    }
-    fn assert_valid_char(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
-    }
-    fn assert_invalid_char(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() > 0);
-    }
-    #[test]
-    fn test_ansi_codepoints() {
-        for byte in 0..=255u8 {
-            match byte {
-                b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()),
-                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
-                _ => assert_valid_char(&(byte as char).to_string()),
-            }
-        }
-    }
-    #[test]
-    fn test_unicode_codepoints() {
-        let valid = ["Ƒ", "バ", "メ", "﷽"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_unicode_multiple_codepoints() {
-        let invalid = ["नी", "👨‍👨‍"];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_escape() {
-        let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_escape() {
-        let invalid = [r"\a", r"\?", r"\"];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_code_escape() {
-        let valid = [r"\x00", r"\x7F", r"\x55"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_code_escape() {
-        let invalid = [r"\x", r"\x7", r"\xF0"];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-    #[test]
-    fn test_valid_unicode_escape() {
-        let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
-        for c in &valid {
-            assert_valid_char(c);
-        }
-    }
-    #[test]
-    fn test_invalid_unicode_escape() {
-        let invalid = [
-            r"\u",
-            r"\u{}",
-            r"\u{",
-            r"\u{FF",
-            r"\u{FFFFFF}",
-            r"\u{_F}",
-            r"\u{00FFFFF}",
-            r"\u{110000}",
-        ];
-        for c in &invalid {
-            assert_invalid_char(c);
-        }
-    }
-}
diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs
deleted file mode 100644
index fc2f1b992..000000000
--- a/crates/ra_syntax/src/validation/string.rs
+++ /dev/null
@@ -1,154 +0,0 @@
-use crate::{
-    string_lexing,
-    SyntaxError,
-    SyntaxErrorKind::*,
-    SyntaxToken,
-};
-use super::char;
-pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
-    let literal_text = node.text();
-    let literal_range = node.range();
-    let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text);
-    for component in &mut components {
-        let range = component.range + literal_range.start();
-        // Chars must escape \t, \n and \r codepoints, but strings don't
-        let text = &literal_text[component.range];
-        match text {
-            "\t" | "\n" | "\r" => { /* always valid */ }
-            _ => char::validate_char_component(text, component.kind, range, errors),
-        }
-    }
-    if !components.has_closing_quote {
-        errors.push(SyntaxError::new(UnclosedString, literal_range));
-    }
-    if let Some(range) = components.suffix {
-        errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
-    }
-}
-#[cfg(test)]
-mod test {
-    use crate::{SourceFile, TreeArc};
-    fn build_file(literal: &str) -> TreeArc<SourceFile> {
-        let src = format!(r#"const S: &'static str = "{}";"#, literal);
-        println!("Source: {}", src);
-        SourceFile::parse(&src)
-    }
-    fn assert_valid_str(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
-    }
-    fn assert_invalid_str(literal: &str) {
-        let file = build_file(literal);
-        assert!(file.errors().len() > 0);
-    }
-    #[test]
-    fn test_ansi_codepoints() {
-        for byte in 0..=255u8 {
-            match byte {
-                b'\"' | b'\\' => { /* Ignore string close and backslash */ }
-                _ => assert_valid_str(&(byte as char).to_string()),
-            }
-        }
-    }
-    #[test]
-    fn test_unicode_codepoints() {
-        let valid = ["Ƒ", "バ", "メ", "﷽"];
-        for c in &valid {
-            assert_valid_str(c);
-        }
-    }
-    #[test]
-    fn test_unicode_multiple_codepoints() {
-        let valid = ["नी", "👨‍👨‍"];
-        for c in &valid {
-            assert_valid_str(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_escape() {
-        let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
-        for c in &valid {
-            assert_valid_str(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_escape() {
-        let invalid = [r"\a", r"\?", r"\"];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_valid_ascii_code_escape() {
-        let valid = [r"\x00", r"\x7F", r"\x55"];
-        for c in &valid {
-            assert_valid_str(c);
-        }
-    }
-    #[test]
-    fn test_invalid_ascii_code_escape() {
-        let invalid = [r"\x", r"\x7", r"\xF0"];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_valid_unicode_escape() {
-        let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
-        for c in &valid {
-            assert_valid_str(c);
-        }
-    }
-    #[test]
-    fn test_invalid_unicode_escape() {
-        let invalid = [
-            r"\u",
-            r"\u{}",
-            r"\u{",
-            r"\u{FF",
-            r"\u{FFFFFF}",
-            r"\u{_F}",
-            r"\u{00FFFFF}",
-            r"\u{110000}",
-        ];
-        for c in &invalid {
-            assert_invalid_str(c);
-        }
-    }
-    #[test]
-    fn test_mixed() {
-        assert_valid_str(
-            r"This is the tale of a string
-with a newline in between, some emoji (👨‍👨‍) here and there,
-unicode escapes like this: \u{1FFBB} and weird stuff like
-this ﷽",
-        );
-    }
-    #[test]
-    fn test_ignore_newline() {
-        assert_valid_str(
-            "Hello \
-             World",
-        );
-    }
-}
diff --git a/crates/ra_syntax/src/validation/unescape.rs b/crates/ra_syntax/src/validation/unescape.rs
new file mode 100644
index 000000000..2086046b6
--- /dev/null
+++ b/crates/ra_syntax/src/validation/unescape.rs
@@ -0,0 +1,521 @@
+//! Utilities for validating string and char literals and turning them into
+//! the values they represent.
+//!
+//! This file is copy-pasted from the compiler
+//!
+//! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
+//!
+//! Hopefully, we'll share this code in a proper way some day
+use std::str::Chars;
+use std::ops::Range;
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub enum EscapeError {
+    ZeroChars,
+    MoreThanOneChar,
+    LoneSlash,
+    InvalidEscape,
+    BareCarriageReturn,
+    EscapeOnlyChar,
+    TooShortHexEscape,
+    InvalidCharInHexEscape,
+    OutOfRangeHexEscape,
+    NoBraceInUnicodeEscape,
+    InvalidCharInUnicodeEscape,
+    EmptyUnicodeEscape,
+    UnclosedUnicodeEscape,
+    LeadingUnderscoreUnicodeEscape,
+    OverlongUnicodeEscape,
+    LoneSurrogateUnicodeEscape,
+    OutOfRangeUnicodeEscape,
+    UnicodeEscapeInByte,
+    NonAsciiCharInByte,
+}
+/// Takes the contents of a char literal (without quotes) and returns the
+/// unescaped char or an error.
+pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
+    let mut chars = literal_text.chars();
+    unescape_char_or_byte(&mut chars, Mode::Char)
+        .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
+}
+/// Takes the contents of a string literal (without quotes) and produces a
+/// sequence of unescaped characters or errors.
+pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<char, EscapeError>),
+{
+    unescape_str_or_byte_str(literal_text, Mode::Str, callback)
+}
+pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
+    let mut chars = literal_text.chars();
+    unescape_char_or_byte(&mut chars, Mode::Byte)
+        .map(byte_from_char)
+        .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
+}
+/// Takes the contents of a byte string literal (without quotes) and produces
+/// a sequence of unescaped bytes or errors.
+pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<u8, EscapeError>),
+{
+    unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
+        callback(range, char.map(byte_from_char))
+    })
+}
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum Mode {
+    Char,
+    Str,
+    Byte,
+    ByteStr,
+}
+impl Mode {
+    fn in_single_quotes(self) -> bool {
+        match self {
+            Mode::Char | Mode::Byte => true,
+            Mode::Str | Mode::ByteStr => false,
+        }
+    }
+    pub(crate) fn in_double_quotes(self) -> bool {
+        !self.in_single_quotes()
+    }
+    pub(crate) fn is_bytes(self) -> bool {
+        match self {
+            Mode::Byte | Mode::ByteStr => true,
+            Mode::Char | Mode::Str => false,
+        }
+    }
+}
+fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
+    if first_char != '\\' {
+        return match first_char {
+            '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
+            '\r' => Err(if chars.clone().next() == Some('\n') {
+                EscapeError::EscapeOnlyChar
+            } else {
+                EscapeError::BareCarriageReturn
+            }),
+            '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
+            '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
+            _ => {
+                if mode.is_bytes() && !first_char.is_ascii() {
+                    return Err(EscapeError::NonAsciiCharInByte);
+                }
+                Ok(first_char)
+            }
+        };
+    }
+    let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
+    let res = match second_char {
+        '"' => '"',
+        'n' => '\n',
+        'r' => '\r',
+        't' => '\t',
+        '\\' => '\\',
+        '\'' => '\'',
+        '0' => '\0',
+        'x' => {
+            let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+            let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
+            let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+            let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
+            let value = hi * 16 + lo;
+            if !mode.is_bytes() && !is_ascii(value) {
+                return Err(EscapeError::OutOfRangeHexEscape);
+            }
+            let value = value as u8;
+            value as char
+        }
+        'u' => {
+            if chars.next() != Some('{') {
+                return Err(EscapeError::NoBraceInUnicodeEscape);
+            }
+            let mut n_digits = 1;
+            let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
+                '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
+                '}' => return Err(EscapeError::EmptyUnicodeEscape),
+                c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
+            };
+            loop {
+                match chars.next() {
+                    None => return Err(EscapeError::UnclosedUnicodeEscape),
+                    Some('_') => continue,
+                    Some('}') => {
+                        if n_digits > 6 {
+                            return Err(EscapeError::OverlongUnicodeEscape);
+                        }
+                        if mode.is_bytes() {
+                            return Err(EscapeError::UnicodeEscapeInByte);
+                        }
+                        break std::char::from_u32(value).ok_or_else(|| {
+                            if value > 0x10FFFF {
+                                EscapeError::OutOfRangeUnicodeEscape
+                            } else {
+                                EscapeError::LoneSurrogateUnicodeEscape
+                            }
+                        })?;
+                    }
+                    Some(c) => {
+                        let digit =
+                            c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
+                        n_digits += 1;
+                        if n_digits > 6 {
+                            continue;
+                        }
+                        let digit = digit as u32;
+                        value = value * 16 + digit;
+                    }
+                };
+            }
+        }
+        _ => return Err(EscapeError::InvalidEscape),
+    };
+    Ok(res)
+}
+fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
+    let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
+    let res = scan_escape(first_char, chars, mode)?;
+    if chars.next().is_some() {
+        return Err(EscapeError::MoreThanOneChar);
+    }
+    Ok(res)
+}
+/// Takes the contents of a string or byte string literal (without quotes) and
+/// produces a sequence of unescaped characters or errors.
+fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<char, EscapeError>),
+{
+    assert!(mode.in_double_quotes());
+    let initial_len = src.len();
+    let mut chars = src.chars();
+    while let Some(first_char) = chars.next() {
+        let start = initial_len - chars.as_str().len() - first_char.len_utf8();
+        let unescaped_char = match first_char {
+            '\\' => {
+                let (second_char, third_char) = {
+                    let mut chars = chars.clone();
+                    (chars.next(), chars.next())
+                };
+                match (second_char, third_char) {
+                    (Some('\n'), _) | (Some('\r'), Some('\n')) => {
+                        skip_ascii_whitespace(&mut chars);
+                        continue;
+                    }
+                    _ => scan_escape(first_char, &mut chars, mode),
+                }
+            }
+            '\r' => {
+                let second_char = chars.clone().next();
+                if second_char == Some('\n') {
+                    chars.next();
+                    Ok('\n')
+                } else {
+                    scan_escape(first_char, &mut chars, mode)
+                }
+            }
+            '\n' => Ok('\n'),
+            '\t' => Ok('\t'),
+            _ => scan_escape(first_char, &mut chars, mode),
+        };
+        let end = initial_len - chars.as_str().len();
+        callback(start..end, unescaped_char);
+    }
+    fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
+        let str = chars.as_str();
+        let first_non_space = str
+            .bytes()
+            .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
+            .unwrap_or(str.len());
+        *chars = str[first_non_space..].chars()
+    }
+}
+/// Narrows `c` to a single byte.
+///
+/// Panics if the code point of `c` does not fit in a `u8`.
+fn byte_from_char(c: char) -> u8 {
+    let code_point = u32::from(c);
+    assert!(code_point <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
+    code_point as u8
+}
+/// Returns `true` when `x` lies in the 7-bit range `0x00..=0x7F`.
+fn is_ascii(x: u32) -> bool {
+    x < 0x80
+}
+// Unit tests for the unescaping entry points. Each test defines a local
+// `check` helper and then runs it over a table of literal contents (the text
+// between the quotes) paired with the expected outcome.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // Inputs that `unescape_char` must reject. Only the error kind is
+    // compared; the offset carried alongside the error is discarded.
+    #[test]
+    fn test_unescape_char_bad() {
+        fn check(literal_text: &str, expected_error: EscapeError) {
+            let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
+            assert_eq!(actual_result, Err(expected_error));
+        }
+        // Empty input, lone backslash, and characters that may only appear escaped.
+        check("", EscapeError::ZeroChars);
+        check(r"\", EscapeError::LoneSlash);
+        check("\n", EscapeError::EscapeOnlyChar);
+        check("\r\n", EscapeError::EscapeOnlyChar);
+        check("\t", EscapeError::EscapeOnlyChar);
+        check("'", EscapeError::EscapeOnlyChar);
+        check("\r", EscapeError::BareCarriageReturn);
+        // A valid first character/escape followed by extra input.
+        check("spam", EscapeError::MoreThanOneChar);
+        check(r"\x0ff", EscapeError::MoreThanOneChar);
+        check(r#"\"a"#, EscapeError::MoreThanOneChar);
+        check(r"\na", EscapeError::MoreThanOneChar);
+        check(r"\ra", EscapeError::MoreThanOneChar);
+        check(r"\ta", EscapeError::MoreThanOneChar);
+        check(r"\\a", EscapeError::MoreThanOneChar);
+        check(r"\'a", EscapeError::MoreThanOneChar);
+        check(r"\0a", EscapeError::MoreThanOneChar);
+        check(r"\u{0}x", EscapeError::MoreThanOneChar);
+        check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
+        // Backslash followed by a character that does not start an escape.
+        check(r"\v", EscapeError::InvalidEscape);
+        check(r"\💩", EscapeError::InvalidEscape);
+        check(r"\●", EscapeError::InvalidEscape);
+        // Malformed `\x` escapes.
+        check(r"\x", EscapeError::TooShortHexEscape);
+        check(r"\x0", EscapeError::TooShortHexEscape);
+        check(r"\xf", EscapeError::TooShortHexEscape);
+        check(r"\xa", EscapeError::TooShortHexEscape);
+        check(r"\xx", EscapeError::InvalidCharInHexEscape);
+        check(r"\xы", EscapeError::InvalidCharInHexEscape);
+        check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
+        check(r"\xtt", EscapeError::InvalidCharInHexEscape);
+        // For a char, `\x` values of 0x80 and above are out of range.
+        check(r"\xff", EscapeError::OutOfRangeHexEscape);
+        check(r"\xFF", EscapeError::OutOfRangeHexEscape);
+        check(r"\x80", EscapeError::OutOfRangeHexEscape);
+        // Malformed `\u{...}` escapes.
+        check(r"\u", EscapeError::NoBraceInUnicodeEscape);
+        check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
+        check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
+        check(r"\u{", EscapeError::UnclosedUnicodeEscape);
+        check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
+        check(r"\u{}", EscapeError::EmptyUnicodeEscape);
+        check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
+        check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
+        check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
+        check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
+        // NOTE(review): exact duplicate of the previous case; possibly a
+        // different input was intended -- confirm.
+        check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
+        // Values in the surrogate range D800..=DFFF.
+        check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
+        check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
+        check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
+        check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
+        check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
+        check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
+    }
+    // Inputs that `unescape_char` must accept, with the character each one
+    // stands for.
+    #[test]
+    fn test_unescape_char_good() {
+        fn check(literal_text: &str, expected_char: char) {
+            let actual_result = unescape_char(literal_text);
+            assert_eq!(actual_result, Ok(expected_char));
+        }
+        // Plain characters, including non-ASCII ones.
+        check("a", 'a');
+        check("ы", 'ы');
+        check("🦀", '🦀');
+        // Single-character escapes.
+        check(r#"\""#, '"');
+        check(r"\n", '\n');
+        check(r"\r", '\r');
+        check(r"\t", '\t');
+        check(r"\\", '\\');
+        check(r"\'", '\'');
+        check(r"\0", '\0');
+        // `\x` escapes, either hex-digit case, up to 0x7f.
+        check(r"\x00", '\0');
+        check(r"\x5a", 'Z');
+        check(r"\x5A", 'Z');
+        check(r"\x7f", 127 as char);
+        // `\u{...}` escapes; underscores are accepted after the first digit.
+        check(r"\u{0}", '\0');
+        check(r"\u{000000}", '\0');
+        check(r"\u{41}", 'A');
+        check(r"\u{0041}", 'A');
+        check(r"\u{00_41}", 'A');
+        check(r"\u{4__1__}", 'A');
+        check(r"\u{1F63b}", '😻');
+    }
+    // Drives `unescape_str` over whole string contents and compares the
+    // collected characters with the expected text.
+    #[test]
+    fn test_unescape_str_good() {
+        fn check(literal_text: &str, expected: &str) {
+            // `buf` accumulates characters until the first error; after that
+            // it keeps that error (with its range) and ignores later callbacks.
+            let mut buf = Ok(String::with_capacity(literal_text.len()));
+            unescape_str(literal_text, &mut |range, c| {
+                if let Ok(b) = &mut buf {
+                    match c {
+                        Ok(c) => b.push(c),
+                        Err(e) => buf = Err((range, e)),
+                    }
+                }
+            });
+            let buf = buf.as_ref().map(|it| it.as_ref());
+            assert_eq!(buf, Ok(expected))
+        }
+        check("foo", "foo");
+        check("", "");
+        // A literal "\r\n" pair comes out as a single '\n'.
+        check(" \t\n\r\n", " \t\n\n");
+        // Backslash + line break drops the break and the whitespace after it.
+        check("hello \\\n     world", "hello world");
+        check("hello \\\r\n     world", "hello world");
+        check("thread's", "thread's")
+    }
+    // Inputs that `unescape_byte` must reject. Only the error kind is
+    // compared; the offset carried alongside the error is discarded.
+    #[test]
+    fn test_unescape_byte_bad() {
+        fn check(literal_text: &str, expected_error: EscapeError) {
+            let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
+            assert_eq!(actual_result, Err(expected_error));
+        }
+        // Empty input, lone backslash, and characters that may only appear escaped.
+        check("", EscapeError::ZeroChars);
+        check(r"\", EscapeError::LoneSlash);
+        check("\n", EscapeError::EscapeOnlyChar);
+        check("\r\n", EscapeError::EscapeOnlyChar);
+        check("\t", EscapeError::EscapeOnlyChar);
+        check("'", EscapeError::EscapeOnlyChar);
+        check("\r", EscapeError::BareCarriageReturn);
+        // A valid first character/escape followed by extra input.
+        check("spam", EscapeError::MoreThanOneChar);
+        check(r"\x0ff", EscapeError::MoreThanOneChar);
+        check(r#"\"a"#, EscapeError::MoreThanOneChar);
+        check(r"\na", EscapeError::MoreThanOneChar);
+        check(r"\ra", EscapeError::MoreThanOneChar);
+        check(r"\ta", EscapeError::MoreThanOneChar);
+        check(r"\\a", EscapeError::MoreThanOneChar);
+        check(r"\'a", EscapeError::MoreThanOneChar);
+        check(r"\0a", EscapeError::MoreThanOneChar);
+        // Backslash followed by a character that does not start an escape.
+        check(r"\v", EscapeError::InvalidEscape);
+        check(r"\💩", EscapeError::InvalidEscape);
+        check(r"\●", EscapeError::InvalidEscape);
+        // Malformed `\x` escapes.
+        check(r"\x", EscapeError::TooShortHexEscape);
+        check(r"\x0", EscapeError::TooShortHexEscape);
+        check(r"\xa", EscapeError::TooShortHexEscape);
+        check(r"\xf", EscapeError::TooShortHexEscape);
+        check(r"\xx", EscapeError::InvalidCharInHexEscape);
+        check(r"\xы", EscapeError::InvalidCharInHexEscape);
+        check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
+        check(r"\xtt", EscapeError::InvalidCharInHexEscape);
+        // Structurally malformed `\u{...}` escapes keep their specific error.
+        check(r"\u", EscapeError::NoBraceInUnicodeEscape);
+        check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
+        check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
+        check(r"\u{", EscapeError::UnclosedUnicodeEscape);
+        check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
+        check(r"\u{}", EscapeError::EmptyUnicodeEscape);
+        check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
+        check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
+        // Non-ASCII characters written directly.
+        check("ы", EscapeError::NonAsciiCharInByte);
+        check("🦀", EscapeError::NonAsciiCharInByte);
+        // Well-formed `\u{...}` escapes are rejected as such, whatever their
+        // value: in-range, trailing input, out-of-range or surrogate.
+        check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
+        // NOTE(review): exact duplicate of the previous case; possibly a
+        // different input was intended -- confirm.
+        check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
+        check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
+    }
+    // Inputs that `unescape_byte` must accept, with the byte each one stands
+    // for.
+    #[test]
+    fn test_unescape_byte_good() {
+        fn check(literal_text: &str, expected_byte: u8) {
+            let actual_result = unescape_byte(literal_text);
+            assert_eq!(actual_result, Ok(expected_byte));
+        }
+        check("a", b'a');
+        // Single-character escapes.
+        check(r#"\""#, b'"');
+        check(r"\n", b'\n');
+        check(r"\r", b'\r');
+        check(r"\t", b'\t');
+        check(r"\\", b'\\');
+        check(r"\'", b'\'');
+        check(r"\0", b'\0');
+        // `\x` escapes; unlike the char tests above, 0x80..=0xff is accepted.
+        check(r"\x00", b'\0');
+        check(r"\x5a", b'Z');
+        check(r"\x5A", b'Z');
+        check(r"\x7f", 127);
+        check(r"\x80", 128);
+        check(r"\xff", 255);
+        check(r"\xFF", 255);
+    }
+    // Drives `unescape_byte_str` over whole byte-string contents and compares
+    // the collected bytes with the expected slice.
+    #[test]
+    fn test_unescape_byte_str_good() {
+        fn check(literal_text: &str, expected: &[u8]) {
+            // `buf` accumulates bytes until the first error; after that it
+            // keeps that error (with its range) and ignores later callbacks.
+            let mut buf = Ok(Vec::with_capacity(literal_text.len()));
+            unescape_byte_str(literal_text, &mut |range, c| {
+                if let Ok(b) = &mut buf {
+                    match c {
+                        Ok(c) => b.push(c),
+                        Err(e) => buf = Err((range, e)),
+                    }
+                }
+            });
+            let buf = buf.as_ref().map(|it| it.as_ref());
+            assert_eq!(buf, Ok(expected))
+        }
+        check("foo", b"foo");
+        check("", b"");
+        // A literal "\r\n" pair comes out as a single b'\n'.
+        check(" \t\n\r\n", b" \t\n\n");
+        // Backslash + line break drops the break and the whitespace after it.
+        check("hello \\\n     world", b"hello world");
+        check("hello \\\r\n     world", b"hello world");
+        check("thread's", b"thread's")
+    }
+}
author	Aleksey Kladov <[email protected]>	2019-05-07 17:38:26 +0100
committer	Aleksey Kladov <[email protected]>	2019-05-07 17:41:59 +0100
commit	313314e14b629ebf50389dbd2d440bda922f6ae7 (patch)
tree	ca6a4aee6ad4077a869a932a18c6c8d134406f8c /crates/ra_syntax/src/validation
parent	ef782adc293deb287128f005dbab2038ba3ccdc1 (diff)