From 5222b8aba3b1c2c68706aacf6869423a8e4fe6d5 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Wed, 20 Feb 2019 15:47:32 +0300 Subject: move all parsing related bits to a separate module --- crates/ra_syntax/src/parsing/lexer/classes.rs | 26 ++++ crates/ra_syntax/src/parsing/lexer/comments.rs | 57 +++++++++ crates/ra_syntax/src/parsing/lexer/numbers.rs | 69 +++++++++++ crates/ra_syntax/src/parsing/lexer/ptr.rs | 162 +++++++++++++++++++++++++ crates/ra_syntax/src/parsing/lexer/strings.rs | 112 +++++++++++++++++ 5 files changed, 426 insertions(+) create mode 100644 crates/ra_syntax/src/parsing/lexer/classes.rs create mode 100644 crates/ra_syntax/src/parsing/lexer/comments.rs create mode 100644 crates/ra_syntax/src/parsing/lexer/numbers.rs create mode 100644 crates/ra_syntax/src/parsing/lexer/ptr.rs create mode 100644 crates/ra_syntax/src/parsing/lexer/strings.rs (limited to 'crates/ra_syntax/src/parsing/lexer') diff --git a/crates/ra_syntax/src/parsing/lexer/classes.rs b/crates/ra_syntax/src/parsing/lexer/classes.rs new file mode 100644 index 000000000..4235d2648 --- /dev/null +++ b/crates/ra_syntax/src/parsing/lexer/classes.rs @@ -0,0 +1,26 @@ +use unicode_xid::UnicodeXID; + +pub fn is_ident_start(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_start(c)) +} + +pub fn is_ident_continue(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) +} + +pub fn is_whitespace(c: char) -> bool { + //FIXME: use is_pattern_whitespace + //https://github.com/behnam/rust-unic/issues/192 + c.is_whitespace() +} + +pub fn is_dec_digit(c: char) -> bool { + '0' <= c && c <= '9' +} diff --git a/crates/ra_syntax/src/parsing/lexer/comments.rs b/crates/ra_syntax/src/parsing/lexer/comments.rs new file mode 100644 index 000000000..8bbbe659b --- /dev/null +++ b/crates/ra_syntax/src/parsing/lexer/comments.rs @@ -0,0 +1,57 @@ +use crate::parsing::lexer::ptr::Ptr; + +use crate::SyntaxKind::{self, *}; + +pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { + if ptr.at_str("!/") { + ptr.bump(); + ptr.bump(); + bump_until_eol(ptr); + true + } else { + false + } +} + +fn scan_block_comment(ptr: &mut Ptr) -> Option { + if ptr.at('*') { + ptr.bump(); + let mut depth: u32 = 1; + while depth > 0 { + if ptr.at_str("*/") { + depth -= 1; + ptr.bump(); + ptr.bump(); + } else if ptr.at_str("/*") { + depth += 1; + ptr.bump(); + ptr.bump(); + } else if ptr.bump().is_none() { + break; + } + } + Some(COMMENT) + } else { + None + } +} + +pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option { + if ptr.at('/') { + bump_until_eol(ptr); + Some(COMMENT) + } else { + scan_block_comment(ptr) + } +} + +fn bump_until_eol(ptr: &mut Ptr) { + loop { + if ptr.at('\n') || ptr.at_str("\r\n") { + return; + } + if ptr.bump().is_none() { + break; + } + } +} diff --git a/crates/ra_syntax/src/parsing/lexer/numbers.rs b/crates/ra_syntax/src/parsing/lexer/numbers.rs new file mode 100644 index 000000000..7f6abe1d5 --- /dev/null +++ b/crates/ra_syntax/src/parsing/lexer/numbers.rs @@ -0,0 +1,69 @@ +use crate::parsing::lexer::{ + ptr::Ptr, + classes::*, +}; + +use crate::SyntaxKind::{self, *}; + +pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { + if c == '0' { + match ptr.current().unwrap_or('\0') { + 'b' | 'o' => { + ptr.bump(); + scan_digits(ptr, false); + } + 'x' => { + ptr.bump(); + scan_digits(ptr, true); + } + '0'...'9' | '_' | '.' | 'e' | 'E' => { + scan_digits(ptr, true); + } + _ => return INT_NUMBER, + } + } else { + scan_digits(ptr, false); + } + + // might be a float, but don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + if ptr.at('.') && !(ptr.at_str("..") || ptr.nth_is_p(1, is_ident_start)) { + // might have stuff after the ., and if it does, it needs to start + // with a number + ptr.bump(); + scan_digits(ptr, false); + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + // it might be a float if it has an exponent + if ptr.at('e') || ptr.at('E') { + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + INT_NUMBER +} + +fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { + while let Some(c) = ptr.current() { + match c { + '_' | '0'...'9' => { + ptr.bump(); + } + 'a'...'f' | 'A'...'F' if allow_hex => { + ptr.bump(); + } + _ => return, + } + } +} + +fn scan_float_exponent(ptr: &mut Ptr) { + if ptr.at('e') || ptr.at('E') { + ptr.bump(); + if ptr.at('-') || ptr.at('+') { + ptr.bump(); + } + scan_digits(ptr, false); + } +} diff --git a/crates/ra_syntax/src/parsing/lexer/ptr.rs b/crates/ra_syntax/src/parsing/lexer/ptr.rs new file mode 100644 index 000000000..c341c4176 --- /dev/null +++ b/crates/ra_syntax/src/parsing/lexer/ptr.rs @@ -0,0 +1,162 @@ +use crate::TextUnit; + +use std::str::Chars; + +/// A simple view into the characters of a string. +pub(crate) struct Ptr<'s> { + text: &'s str, + len: TextUnit, +} + +impl<'s> Ptr<'s> { + /// Creates a new `Ptr` from a string. + pub fn new(text: &'s str) -> Ptr<'s> { + Ptr { text, len: 0.into() } + } + + /// Gets the length of the remaining string. + pub fn into_len(self) -> TextUnit { + self.len + } + + /// Gets the current character, if one exists. + pub fn current(&self) -> Option { + self.chars().next() + } + + /// Gets the nth character from the current. + /// For example, 0 will return the current character, 1 will return the next, etc. + pub fn nth(&self, n: u32) -> Option { + self.chars().nth(n as usize) + } + + /// Checks whether the current character is `c`. + pub fn at(&self, c: char) -> bool { + self.current() == Some(c) + } + + /// Checks whether the next characters match `s`. + pub fn at_str(&self, s: &str) -> bool { + let chars = self.chars(); + chars.as_str().starts_with(s) + } + + /// Checks whether the current character satisfies the predicate `p`. + pub fn at_p bool>(&self, p: P) -> bool { + self.current().map(p) == Some(true) + } + + /// Checks whether the nth character satisfies the predicate `p`. + pub fn nth_is_p bool>(&self, n: u32, p: P) -> bool { + self.nth(n).map(p) == Some(true) + } + + /// Moves to the next character. + pub fn bump(&mut self) -> Option { + let ch = self.chars().next()?; + self.len += TextUnit::of_char(ch); + Some(ch) + } + + /// Moves to the next character as long as `pred` is satisfied. + pub fn bump_while bool>(&mut self, pred: F) { + loop { + match self.current() { + Some(c) if pred(c) => { + self.bump(); + } + _ => return, + } + } + } + + /// Returns the text up to the current point. + pub fn current_token_text(&self) -> &str { + let len: u32 = self.len.into(); + &self.text[..len as usize] + } + + /// Returns an iterator over the remaining characters. + fn chars(&self) -> Chars { + let len: u32 = self.len.into(); + self.text[len as usize..].chars() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_current() { + let ptr = Ptr::new("test"); + assert_eq!(ptr.current(), Some('t')); + } + + #[test] + fn test_nth() { + let ptr = Ptr::new("test"); + assert_eq!(ptr.nth(0), Some('t')); + assert_eq!(ptr.nth(1), Some('e')); + assert_eq!(ptr.nth(2), Some('s')); + assert_eq!(ptr.nth(3), Some('t')); + assert_eq!(ptr.nth(4), None); + } + + #[test] + fn test_at() { + let ptr = Ptr::new("test"); + assert!(ptr.at('t')); + assert!(!ptr.at('a')); + } + + #[test] + fn test_at_str() { + let ptr = Ptr::new("test"); + assert!(ptr.at_str("t")); + assert!(ptr.at_str("te")); + assert!(ptr.at_str("test")); + assert!(!ptr.at_str("tests")); + assert!(!ptr.at_str("rust")); + } + + #[test] + fn test_at_p() { + let ptr = Ptr::new("test"); + assert!(ptr.at_p(|c| c == 't')); + assert!(!ptr.at_p(|c| c == 'e')); + } + + #[test] + fn test_nth_is_p() { + let ptr = Ptr::new("test"); + assert!(ptr.nth_is_p(0, |c| c == 't')); + assert!(!ptr.nth_is_p(1, |c| c == 't')); + assert!(ptr.nth_is_p(3, |c| c == 't')); + assert!(!ptr.nth_is_p(150, |c| c == 't')); + } + + #[test] + fn test_bump() { + let mut ptr = Ptr::new("test"); + assert_eq!(ptr.current(), Some('t')); + ptr.bump(); + assert_eq!(ptr.current(), Some('e')); + ptr.bump(); + assert_eq!(ptr.current(), Some('s')); + ptr.bump(); + assert_eq!(ptr.current(), Some('t')); + ptr.bump(); + assert_eq!(ptr.current(), None); + ptr.bump(); + assert_eq!(ptr.current(), None); + } + + #[test] + fn test_bump_while() { + let mut ptr = Ptr::new("test"); + assert_eq!(ptr.current(), Some('t')); + ptr.bump_while(|c| c != 's'); + assert_eq!(ptr.current(), Some('s')); + } +} diff --git a/crates/ra_syntax/src/parsing/lexer/strings.rs b/crates/ra_syntax/src/parsing/lexer/strings.rs new file mode 100644 index 000000000..f74acff9e --- /dev/null +++ b/crates/ra_syntax/src/parsing/lexer/strings.rs @@ -0,0 +1,112 @@ +use crate::{ + parsing::lexer::ptr::Ptr, + SyntaxKind::{self, *}, +}; + +pub(crate) fn is_string_literal_start(c: char, c1: Option, c2: Option) -> bool { + match (c, c1, c2) { + ('r', Some('"'), _) + | ('r', Some('#'), Some('"')) + | ('r', Some('#'), Some('#')) + | ('b', Some('"'), _) + | ('b', Some('\''), _) + | ('b', Some('r'), Some('"')) + | ('b', Some('r'), Some('#')) => true, + _ => false, + } +} + +pub(crate) fn scan_char(ptr: &mut Ptr) { + while let Some(c) = ptr.current() { + match c { + '\\' => { + ptr.bump(); + if ptr.at('\\') || ptr.at('\'') { + ptr.bump(); + } + } + '\'' => { + ptr.bump(); + return; + } + '\n' => return, + _ => { + ptr.bump(); + } + } + } +} + +pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { + // unwrapping and not-exhaustive match are ok + // because of string_literal_start + let c = ptr.bump().unwrap(); + match c { + '\'' => { + scan_byte(ptr); + BYTE + } + '"' => { + scan_byte_string(ptr); + BYTE_STRING + } + 'r' => { + scan_raw_string(ptr); + RAW_BYTE_STRING + } + _ => unreachable!(), + } +} + +pub(crate) fn scan_string(ptr: &mut Ptr) { + while let Some(c) = ptr.current() { + match c { + '\\' => { + ptr.bump(); + if ptr.at('\\') || ptr.at('"') { + ptr.bump(); + } + } + '"' => { + ptr.bump(); + return; + } + _ => { + ptr.bump(); + } + } + } +} + +pub(crate) fn scan_raw_string(ptr: &mut Ptr) { + let mut hashes = 0; + while ptr.at('#') { + hashes += 1; + ptr.bump(); + } + if !ptr.at('"') { + return; + } + ptr.bump(); + + while let Some(c) = ptr.bump() { + if c == '"' { + let mut hashes_left = hashes; + while ptr.at('#') && hashes_left > 0 { + hashes_left -= 1; + ptr.bump(); + } + if hashes_left == 0 { + return; + } + } + } +} + +fn scan_byte(ptr: &mut Ptr) { + scan_char(ptr) +} + +fn scan_byte_string(ptr: &mut Ptr) { + scan_string(ptr) +} -- cgit v1.2.3