From c96bfe7e2d4465653fe6b0eff053f0dfb48313fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adolfo=20Ochagav=C3=ADa?= Date: Sun, 11 Nov 2018 21:00:31 +0100 Subject: Split string lexing and run rustfmt --- crates/ra_syntax/src/string_lexing/byte.rs | 51 ++++++ crates/ra_syntax/src/string_lexing/byte_string.rs | 51 ++++++ crates/ra_syntax/src/string_lexing/char.rs | 176 +++++++++++++++++++ crates/ra_syntax/src/string_lexing/mod.rs | 13 ++ crates/ra_syntax/src/string_lexing/parser.rs | 201 ++++++++++++++++++++++ crates/ra_syntax/src/string_lexing/string.rs | 46 +++++ 6 files changed, 538 insertions(+) create mode 100644 crates/ra_syntax/src/string_lexing/byte.rs create mode 100644 crates/ra_syntax/src/string_lexing/byte_string.rs create mode 100644 crates/ra_syntax/src/string_lexing/char.rs create mode 100644 crates/ra_syntax/src/string_lexing/mod.rs create mode 100644 crates/ra_syntax/src/string_lexing/parser.rs create mode 100644 crates/ra_syntax/src/string_lexing/string.rs (limited to 'crates/ra_syntax/src/string_lexing') diff --git a/crates/ra_syntax/src/string_lexing/byte.rs b/crates/ra_syntax/src/string_lexing/byte.rs new file mode 100644 index 000000000..24424349c --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte.rs @@ -0,0 +1,51 @@ +use super::parser::Parser; +use super::CharComponent; + +pub fn parse_byte_literal(src: &str) -> ByteComponentIterator { + ByteComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct ByteComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for ByteComponentIterator<'a> { + type Item = CharComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == 'b', + "Byte literal should start with a `b`" + ); + + assert!( + self.parser.advance() == '\'', + "Byte literal should start with a `b`, followed by a quote" + ); + } + + if let Some(component) = self.parser.parse_char_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('\'') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "byte literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} diff --git a/crates/ra_syntax/src/string_lexing/byte_string.rs b/crates/ra_syntax/src/string_lexing/byte_string.rs new file mode 100644 index 000000000..5b6dda760 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte_string.rs @@ -0,0 +1,51 @@ +use super::parser::Parser; +use super::StringComponent; + +pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { + ByteStringComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct ByteStringComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for ByteStringComponentIterator<'a> { + type Item = StringComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == 'b', + "byte string literal should start with a `b`" + ); + + assert!( + self.parser.advance() == '"', + "byte string literal should start with a `b`, followed by double quotes" + ); + } + + if let Some(component) = self.parser.parse_string_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('"') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} diff --git a/crates/ra_syntax/src/string_lexing/char.rs b/crates/ra_syntax/src/string_lexing/char.rs new file mode 100644 index 000000000..885c03b14 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/char.rs @@ -0,0 +1,176 @@ +use super::parser::Parser; +use super::CharComponent; + +pub fn parse_char_literal(src: &str) -> CharComponentIterator { + CharComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct CharComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for CharComponentIterator<'a> { + type Item = CharComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == '\'', + "char literal should start with a quote" + ); + } + + if let Some(component) = self.parser.parse_char_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('\'') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} + +#[cfg(test)] +mod tests { + use rowan::TextRange; + use crate::string_lexing::{ + CharComponent, + CharComponentKind::*, +}; + + fn parse(src: &str) -> (bool, Vec) { + let component_iterator = &mut super::parse_char_literal(src); + let components: Vec<_> = component_iterator.collect(); + (component_iterator.has_closing_quote, components) + } + + fn unclosed_char_component(src: &str) -> CharComponent { + let (has_closing_quote, components) = parse(src); + assert!(!has_closing_quote, "char should not have closing quote"); + assert!(components.len() == 1); + components[0].clone() + } + + fn closed_char_component(src: &str) -> CharComponent { + let (has_closing_quote, components) = parse(src); + assert!(has_closing_quote, "char should have closing quote"); + assert!( + components.len() == 1, + "Literal: {}\nComponents: {:#?}", + src, + components + ); + components[0].clone() + } + + fn closed_char_components(src: &str) -> Vec { + let (has_closing_quote, components) = parse(src); + assert!(has_closing_quote, "char should have closing quote"); + components + } + + fn range_closed(src: &str) -> TextRange { + TextRange::from_to(1.into(), (src.len() as u32 - 1).into()) + } + + fn range_unclosed(src: &str) -> TextRange { + TextRange::from_to(1.into(), (src.len() as u32).into()) + } + + #[test] + fn test_unicode_escapes() { + let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""]; + for escape in unicode_escapes { + let escape_sequence = format!(r"'\u{}'", escape); + let component = closed_char_component(&escape_sequence); + let expected_range = range_closed(&escape_sequence); + assert_eq!(component.kind, UnicodeEscape); + assert_eq!(component.range, expected_range); + } + } + + #[test] + fn test_unicode_escapes_unclosed() { + let unicode_escapes = &["{DEAD", "{BEEF", "{FF"]; + for escape in unicode_escapes { + let escape_sequence = format!(r"'\u{}'", escape); + let component = unclosed_char_component(&escape_sequence); + let expected_range = range_unclosed(&escape_sequence); + assert_eq!(component.kind, UnicodeEscape); + assert_eq!(component.range, expected_range); + } + } + + #[test] + fn test_empty_char() { + let (has_closing_quote, components) = parse("''"); + assert!(has_closing_quote, "char should have closing quote"); + assert!(components.len() == 0); + } + + #[test] + fn test_unclosed_char() { + let component = unclosed_char_component("'a"); + assert!(component.kind == CodePoint); + assert!(component.range == TextRange::from_to(1.into(), 2.into())); + } + + #[test] + fn test_digit_escapes() { + let literals = &[r"", r"5", r"55"]; + + for literal in literals { + let lit_text = format!(r"'\x{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == AsciiCodeEscape); + assert!(component.range == range_closed(&lit_text)); + } + + // More than 2 digits starts a new codepoint + let components = closed_char_components(r"'\x555'"); + assert!(components.len() == 2); + assert!(components[1].kind == CodePoint); + } + + #[test] + fn test_ascii_escapes() { + let literals = &[ + r"\'", "\\\"", // equivalent to \" + r"\n", r"\r", r"\t", r"\\", r"\0", + ]; + + for literal in literals { + let lit_text = format!("'{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == AsciiEscape); + assert!(component.range == range_closed(&lit_text)); + } + } + + #[test] + fn test_no_escapes() { + let literals = &['"', 'n', 'r', 't', '0', 'x', 'u']; + + for &literal in literals { + let lit_text = format!("'{}'", literal); + let component = closed_char_component(&lit_text); + assert!(component.kind == CodePoint); + assert!(component.range == range_closed(&lit_text)); + } + } +} diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs new file mode 100644 index 000000000..94853331f --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/mod.rs @@ -0,0 +1,13 @@ +mod parser; +mod byte; +mod byte_string; +mod char; +mod string; + +pub use self::{ + byte::parse_byte_literal, + byte_string::parse_byte_string_literal, + char::parse_char_literal, + parser::{CharComponent, CharComponentKind, StringComponent, StringComponentKind}, + string::parse_string_literal, +}; diff --git a/crates/ra_syntax/src/string_lexing/parser.rs b/crates/ra_syntax/src/string_lexing/parser.rs new file mode 100644 index 000000000..4a6d5bc93 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/parser.rs @@ -0,0 +1,201 @@ +use rowan::{TextRange, TextUnit}; + +use self::CharComponentKind::*; + +pub struct Parser<'a> { + pub(super) src: &'a str, + pub(super) pos: usize, +} + +impl<'a> Parser<'a> { + pub fn new(src: &'a str) -> Parser<'a> { + Parser { src, pos: 0 } + } + + // Utility methods + + pub fn peek(&self) -> Option { + if self.pos == self.src.len() { + return None; + } + + self.src[self.pos..].chars().next() + } + + pub fn advance(&mut self) -> char { + let next = self + .peek() + .expect("cannot advance if end of input is reached"); + self.pos += next.len_utf8(); + next + } + + pub fn skip_whitespace(&mut self) { + while self.peek().map(|c| c.is_whitespace()) == Some(true) { + self.advance(); + } + } + + pub fn get_pos(&self) -> TextUnit { + (self.pos as u32).into() + } + + // Char parsing methods + + fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { + match self.peek() { + Some('{') => { + self.advance(); + + // Parse anything until we reach `}` + while let Some(next) = self.peek() { + self.advance(); + if next == '}' { + break; + } + } + + let end = self.get_pos(); + CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) + } + Some(_) | None => { + let end = self.get_pos(); + CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) + } + } + } + + fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { + let code_start = self.get_pos(); + while let Some(next) = self.peek() { + if next == '\'' || (self.get_pos() - code_start == 2.into()) { + break; + } + + self.advance(); + } + + let end = self.get_pos(); + CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) + } + + fn parse_escape(&mut self, start: TextUnit) -> CharComponent { + if self.peek().is_none() { + return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); + } + + let next = self.advance(); + let end = self.get_pos(); + let range = TextRange::from_to(start, end); + match next { + 'x' => self.parse_ascii_code_escape(start), + 'u' => self.parse_unicode_escape(start), + _ => CharComponent::new(range, AsciiEscape), + } + } + + pub fn parse_char_component(&mut self) -> Option { + let next = self.peek()?; + + // Ignore character close + if next == '\'' { + return None; + } + + let start = self.get_pos(); + self.advance(); + + if next == '\\' { + Some(self.parse_escape(start)) + } else { + let end = self.get_pos(); + Some(CharComponent::new( + TextRange::from_to(start, end), + CodePoint, + )) + } + } + + pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option { + // In string literals, when a `\` occurs immediately before the newline, the `\`, + // the newline, and all whitespace at the beginning of the next line are ignored + match self.peek() { + Some('\n') | Some('\r') => { + self.skip_whitespace(); + Some(StringComponent::new( + TextRange::from_to(start, self.get_pos()), + StringComponentKind::IgnoreNewline, + )) + } + _ => None, + } + } + + pub fn parse_string_component(&mut self) -> Option { + let next = self.peek()?; + + // Ignore string close + if next == '"' { + return None; + } + + let start = self.get_pos(); + self.advance(); + + if next == '\\' { + // Strings can use `\` to ignore newlines, so we first try to parse one of those + // before falling back to parsing char escapes + self.parse_ignore_newline(start).or_else(|| { + let char_component = self.parse_escape(start); + Some(StringComponent::new( + char_component.range, + StringComponentKind::Char(char_component.kind), + )) + }) + } else { + let end = self.get_pos(); + Some(StringComponent::new( + TextRange::from_to(start, end), + StringComponentKind::Char(CodePoint), + )) + } + } +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct StringComponent { + pub range: TextRange, + pub kind: StringComponentKind, +} + +impl StringComponent { + fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { + StringComponent { range, kind } + } +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub enum StringComponentKind { + IgnoreNewline, + Char(CharComponentKind), +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct CharComponent { + pub range: TextRange, + pub kind: CharComponentKind, +} + +impl CharComponent { + fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { + CharComponent { range, kind } + } +} + +#[derive(Debug, Eq, PartialEq, Clone)] +pub enum CharComponentKind { + CodePoint, + AsciiEscape, + AsciiCodeEscape, + UnicodeEscape, +} diff --git a/crates/ra_syntax/src/string_lexing/string.rs b/crates/ra_syntax/src/string_lexing/string.rs new file mode 100644 index 000000000..1b23029c6 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/string.rs @@ -0,0 +1,46 @@ +use super::parser::Parser; +use super::StringComponent; + +pub fn parse_string_literal(src: &str) -> StringComponentIterator { + StringComponentIterator { + parser: Parser::new(src), + has_closing_quote: false, + } +} + +pub struct StringComponentIterator<'a> { + parser: Parser<'a>, + pub has_closing_quote: bool, +} + +impl<'a> Iterator for StringComponentIterator<'a> { + type Item = StringComponent; + fn next(&mut self) -> Option { + if self.parser.pos == 0 { + assert!( + self.parser.advance() == '"', + "string literal should start with double quotes" + ); + } + + if let Some(component) = self.parser.parse_string_component() { + return Some(component); + } + + // We get here when there are no char components left to parse + if self.parser.peek() == Some('"') { + self.parser.advance(); + self.has_closing_quote = true; + } + + assert!( + self.parser.peek() == None, + "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", + self.parser.src, + self.parser.pos, + self.parser.src.len() + ); + + None + } +} -- cgit v1.2.3