From b5021411a84822cb3f1e3aeffad9550dd15bdeb6 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Sun, 16 Sep 2018 12:54:24 +0300 Subject: rename all things --- crates/ra_syntax/src/lexer/classes.rs | 26 ++++ crates/ra_syntax/src/lexer/comments.rs | 57 +++++++++ crates/ra_syntax/src/lexer/mod.rs | 209 +++++++++++++++++++++++++++++++++ crates/ra_syntax/src/lexer/numbers.rs | 67 +++++++++++ crates/ra_syntax/src/lexer/ptr.rs | 166 ++++++++++++++++++++++++++ crates/ra_syntax/src/lexer/strings.rs | 123 +++++++++++++++++++ 6 files changed, 648 insertions(+) create mode 100644 crates/ra_syntax/src/lexer/classes.rs create mode 100644 crates/ra_syntax/src/lexer/comments.rs create mode 100644 crates/ra_syntax/src/lexer/mod.rs create mode 100644 crates/ra_syntax/src/lexer/numbers.rs create mode 100644 crates/ra_syntax/src/lexer/ptr.rs create mode 100644 crates/ra_syntax/src/lexer/strings.rs (limited to 'crates/ra_syntax/src/lexer') diff --git a/crates/ra_syntax/src/lexer/classes.rs b/crates/ra_syntax/src/lexer/classes.rs new file mode 100644 index 000000000..4235d2648 --- /dev/null +++ b/crates/ra_syntax/src/lexer/classes.rs @@ -0,0 +1,26 @@ +use unicode_xid::UnicodeXID; + +pub fn is_ident_start(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_start(c)) +} + +pub fn is_ident_continue(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) +} + +pub fn is_whitespace(c: char) -> bool { + //FIXME: use is_pattern_whitespace + //https://github.com/behnam/rust-unic/issues/192 + c.is_whitespace() +} + +pub fn is_dec_digit(c: char) -> bool { + '0' <= c && c <= '9' +} diff --git a/crates/ra_syntax/src/lexer/comments.rs b/crates/ra_syntax/src/lexer/comments.rs new file mode 100644 index 000000000..eb417c2dc --- /dev/null +++ b/crates/ra_syntax/src/lexer/comments.rs @@ -0,0 +1,57 @@ +use lexer::ptr::Ptr; + +use SyntaxKind::{self, *}; + +pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { + if ptr.at_str("!/") { + ptr.bump(); + ptr.bump(); + bump_until_eol(ptr); + true + } else { + false + } +} + +fn scan_block_comment(ptr: &mut Ptr) -> Option { + if ptr.at('*') { + ptr.bump(); + let mut depth: u32 = 1; + while depth > 0 { + if ptr.at_str("*/") { + depth -= 1; + ptr.bump(); + ptr.bump(); + } else if ptr.at_str("/*") { + depth += 1; + ptr.bump(); + ptr.bump(); + } else if ptr.bump().is_none() { + break; + } + } + Some(COMMENT) + } else { + None + } +} + +pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option { + if ptr.at('/') { + bump_until_eol(ptr); + Some(COMMENT) + } else { + scan_block_comment(ptr) + } +} + +fn bump_until_eol(ptr: &mut Ptr) { + loop { + if ptr.at('\n') || ptr.at_str("\r\n") { + return; + } + if ptr.bump().is_none() { + break; + } + } +} diff --git a/crates/ra_syntax/src/lexer/mod.rs b/crates/ra_syntax/src/lexer/mod.rs new file mode 100644 index 000000000..3e11db88b --- /dev/null +++ b/crates/ra_syntax/src/lexer/mod.rs @@ -0,0 +1,209 @@ +mod classes; +mod comments; +mod numbers; +mod ptr; +mod strings; + +use { + SyntaxKind::{self, *}, + TextUnit, +}; + +use self::{ + classes::*, + comments::{scan_comment, scan_shebang}, + numbers::scan_number, + ptr::Ptr, + strings::{ + is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, + }, +}; + +/// A token of Rust source. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Token { + /// The kind of token. + pub kind: SyntaxKind, + /// The length of the token. + pub len: TextUnit, +} + +/// Break a string up into its component tokens +pub fn tokenize(text: &str) -> Vec { + let mut text = text; + let mut acc = Vec::new(); + while !text.is_empty() { + let token = next_token(text); + acc.push(token); + let len: u32 = token.len.into(); + text = &text[len as usize..]; + } + acc +} + +/// Get the next token from a string +pub fn next_token(text: &str) -> Token { + assert!(!text.is_empty()); + let mut ptr = Ptr::new(text); + let c = ptr.bump().unwrap(); + let kind = next_token_inner(c, &mut ptr); + let len = ptr.into_len(); + Token { kind, len } +} + +fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { + if is_whitespace(c) { + ptr.bump_while(is_whitespace); + return WHITESPACE; + } + + match c { + '#' => if scan_shebang(ptr) { + return SHEBANG; + }, + '/' => if let Some(kind) = scan_comment(ptr) { + return kind; + }, + _ => (), + } + + let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1)); + if ident_start { + return scan_ident(c, ptr); + } + + if is_dec_digit(c) { + let kind = scan_number(c, ptr); + scan_literal_suffix(ptr); + return kind; + } + + // One-byte tokens. + if let Some(kind) = SyntaxKind::from_char(c) { + return kind; + } + + match c { + // Multi-byte tokens. + '.' => { + return match (ptr.current(), ptr.nth(1)) { + (Some('.'), Some('.')) => { + ptr.bump(); + ptr.bump(); + DOTDOTDOT + } + (Some('.'), Some('=')) => { + ptr.bump(); + ptr.bump(); + DOTDOTEQ + } + (Some('.'), _) => { + ptr.bump(); + DOTDOT + } + _ => DOT, + }; + } + ':' => { + return match ptr.current() { + Some(':') => { + ptr.bump(); + COLONCOLON + } + _ => COLON, + }; + } + '=' => { + return match ptr.current() { + Some('=') => { + ptr.bump(); + EQEQ + } + Some('>') => { + ptr.bump(); + FAT_ARROW + } + _ => EQ, + }; + } + '!' => { + return match ptr.current() { + Some('=') => { + ptr.bump(); + NEQ + } + _ => EXCL, + }; + } + '-' => { + return if ptr.at('>') { + ptr.bump(); + THIN_ARROW + } else { + MINUS + }; + } + + // If the character is an ident start not followed by another single + // quote, then this is a lifetime name: + '\'' => { + return if ptr.at_p(is_ident_start) && !ptr.at_str("''") { + ptr.bump(); + while ptr.at_p(is_ident_continue) { + ptr.bump(); + } + // lifetimes shouldn't end with a single quote + // if we find one, then this is an invalid character literal + if ptr.at('\'') { + ptr.bump(); + return CHAR; // TODO: error reporting + } + LIFETIME + } else { + scan_char(ptr); + scan_literal_suffix(ptr); + CHAR + }; + } + 'b' => { + let kind = scan_byte_char_or_string(ptr); + scan_literal_suffix(ptr); + return kind; + } + '"' => { + scan_string(ptr); + scan_literal_suffix(ptr); + return STRING; + } + 'r' => { + scan_raw_string(ptr); + scan_literal_suffix(ptr); + return RAW_STRING; + } + _ => (), + } + ERROR +} + +fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { + let is_single_letter = match ptr.current() { + None => true, + Some(c) if !is_ident_continue(c) => true, + _ => false, + }; + if is_single_letter { + return if c == '_' { UNDERSCORE } else { IDENT }; + } + ptr.bump_while(is_ident_continue); + if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { + return kind; + } + IDENT +} + +fn scan_literal_suffix(ptr: &mut Ptr) { + if ptr.at_p(is_ident_start) { + ptr.bump(); + } + ptr.bump_while(is_ident_continue); +} diff --git a/crates/ra_syntax/src/lexer/numbers.rs b/crates/ra_syntax/src/lexer/numbers.rs new file mode 100644 index 000000000..22e7d4e99 --- /dev/null +++ b/crates/ra_syntax/src/lexer/numbers.rs @@ -0,0 +1,67 @@ +use lexer::classes::*; +use lexer::ptr::Ptr; + +use SyntaxKind::{self, *}; + +pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { + if c == '0' { + match ptr.current().unwrap_or('\0') { + 'b' | 'o' => { + ptr.bump(); + scan_digits(ptr, false); + } + 'x' => { + ptr.bump(); + scan_digits(ptr, true); + } + '0'...'9' | '_' | '.' | 'e' | 'E' => { + scan_digits(ptr, true); + } + _ => return INT_NUMBER, + } + } else { + scan_digits(ptr, false); + } + + // might be a float, but don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + if ptr.at('.') && !(ptr.at_str("..") || ptr.nth_is_p(1, is_ident_start)) { + // might have stuff after the ., and if it does, it needs to start + // with a number + ptr.bump(); + scan_digits(ptr, false); + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + // it might be a float if it has an exponent + if ptr.at('e') || ptr.at('E') { + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + INT_NUMBER +} + +fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { + while let Some(c) = ptr.current() { + match c { + '_' | '0'...'9' => { + ptr.bump(); + } + 'a'...'f' | 'A'...'F' if allow_hex => { + ptr.bump(); + } + _ => return, + } + } +} + +fn scan_float_exponent(ptr: &mut Ptr) { + if ptr.at('e') || ptr.at('E') { + ptr.bump(); + if ptr.at('-') || ptr.at('+') { + ptr.bump(); + } + scan_digits(ptr, false); + } +} diff --git a/crates/ra_syntax/src/lexer/ptr.rs b/crates/ra_syntax/src/lexer/ptr.rs new file mode 100644 index 000000000..c9a5354ea --- /dev/null +++ b/crates/ra_syntax/src/lexer/ptr.rs @@ -0,0 +1,166 @@ +use TextUnit; + +use std::str::Chars; + +/// A simple view into the characters of a string. +pub(crate) struct Ptr<'s> { + text: &'s str, + len: TextUnit, +} + +impl<'s> Ptr<'s> { + /// Creates a new `Ptr` from a string. + pub fn new(text: &'s str) -> Ptr<'s> { + Ptr { + text, + len: 0.into(), + } + } + + /// Gets the length of the remaining string. + pub fn into_len(self) -> TextUnit { + self.len + } + + /// Gets the current character, if one exists. + pub fn current(&self) -> Option { + self.chars().next() + } + + /// Gets the nth character from the current. + /// For example, 0 will return the current token, 1 will return the next, etc. + pub fn nth(&self, n: u32) -> Option { + let mut chars = self.chars().peekable(); + chars.by_ref().skip(n as usize).next() + } + + /// Checks whether the current character is `c`. + pub fn at(&self, c: char) -> bool { + self.current() == Some(c) + } + + /// Checks whether the next characters match `s`. + pub fn at_str(&self, s: &str) -> bool { + let chars = self.chars(); + chars.as_str().starts_with(s) + } + + /// Checks whether the current character satisfies the predicate `p`. + pub fn at_p bool>(&self, p: P) -> bool { + self.current().map(p) == Some(true) + } + + /// Checks whether the nth character satisfies the predicate `p`. + pub fn nth_is_p bool>(&self, n: u32, p: P) -> bool { + self.nth(n).map(p) == Some(true) + } + + /// Moves to the next character. + pub fn bump(&mut self) -> Option { + let ch = self.chars().next()?; + self.len += TextUnit::of_char(ch); + Some(ch) + } + + /// Moves to the next character as long as `pred` is satisfied. + pub fn bump_while bool>(&mut self, pred: F) { + loop { + match self.current() { + Some(c) if pred(c) => { + self.bump(); + } + _ => return, + } + } + } + + /// Returns the text up to the current point. + pub fn current_token_text(&self) -> &str { + let len: u32 = self.len.into(); + &self.text[..len as usize] + } + + /// Returns an iterator over the remaining characters. + fn chars(&self) -> Chars { + let len: u32 = self.len.into(); + self.text[len as usize..].chars() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_current() { + let ptr = Ptr::new("test"); + assert_eq!(ptr.current(), Some('t')); + } + + #[test] + fn test_nth() { + let ptr = Ptr::new("test"); + assert_eq!(ptr.nth(0), Some('t')); + assert_eq!(ptr.nth(1), Some('e')); + assert_eq!(ptr.nth(2), Some('s')); + assert_eq!(ptr.nth(3), Some('t')); + assert_eq!(ptr.nth(4), None); + } + + #[test] + fn test_at() { + let ptr = Ptr::new("test"); + assert!(ptr.at('t')); + assert!(!ptr.at('a')); + } + + #[test] + fn test_at_str() { + let ptr = Ptr::new("test"); + assert!(ptr.at_str("t")); + assert!(ptr.at_str("te")); + assert!(ptr.at_str("test")); + assert!(!ptr.at_str("tests")); + assert!(!ptr.at_str("rust")); + } + + #[test] + fn test_at_p() { + let ptr = Ptr::new("test"); + assert!(ptr.at_p(|c| c == 't')); + assert!(!ptr.at_p(|c| c == 'e')); + } + + #[test] + fn test_nth_is_p() { + let ptr = Ptr::new("test"); + assert!(ptr.nth_is_p(0,|c| c == 't')); + assert!(!ptr.nth_is_p(1,|c| c == 't')); + assert!(ptr.nth_is_p(3,|c| c == 't')); + assert!(!ptr.nth_is_p(150,|c| c == 't')); + } + + #[test] + fn test_bump() { + let mut ptr = Ptr::new("test"); + assert_eq!(ptr.current(), Some('t')); + ptr.bump(); + assert_eq!(ptr.current(), Some('e')); + ptr.bump(); + assert_eq!(ptr.current(), Some('s')); + ptr.bump(); + assert_eq!(ptr.current(), Some('t')); + ptr.bump(); + assert_eq!(ptr.current(), None); + ptr.bump(); + assert_eq!(ptr.current(), None); + } + + #[test] + fn test_bump_while() { + let mut ptr = Ptr::new("test"); + assert_eq!(ptr.current(), Some('t')); + ptr.bump_while(|c| c != 's'); + assert_eq!(ptr.current(), Some('s')); + } +} diff --git a/crates/ra_syntax/src/lexer/strings.rs b/crates/ra_syntax/src/lexer/strings.rs new file mode 100644 index 000000000..5ff483d14 --- /dev/null +++ b/crates/ra_syntax/src/lexer/strings.rs @@ -0,0 +1,123 @@ +use SyntaxKind::{self, *}; + +use lexer::ptr::Ptr; + +pub(crate) fn is_string_literal_start(c: char, c1: Option, c2: Option) -> bool { + match (c, c1, c2) { + ('r', Some('"'), _) + | ('r', Some('#'), _) + | ('b', Some('"'), _) + | ('b', Some('\''), _) + | ('b', Some('r'), Some('"')) + | ('b', Some('r'), Some('#')) => true, + _ => false, + } +} + +pub(crate) fn scan_char(ptr: &mut Ptr) { + while let Some(c) = ptr.current() { + match c { + '\\' => { + ptr.bump(); + if ptr.at('\\') || ptr.at('\'') { + ptr.bump(); + } + } + '\'' => { + ptr.bump(); + return; + } + '\n' => return, + _ => { + ptr.bump(); + } + } + } +} + +pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { + // unwrapping and not-exhaustive match are ok + // because of string_literal_start + let c = ptr.bump().unwrap(); + match c { + '\'' => { + scan_byte(ptr); + BYTE + } + '"' => { + scan_byte_string(ptr); + BYTE_STRING + } + 'r' => { + scan_raw_byte_string(ptr); + RAW_BYTE_STRING + } + _ => unreachable!(), + } +} + +pub(crate) fn scan_string(ptr: &mut Ptr) { + while let Some(c) = ptr.current() { + match c { + '\\' => { + ptr.bump(); + if ptr.at('\\') || ptr.at('"') { + ptr.bump(); + } + } + '"' => { + ptr.bump(); + return; + } + _ => { + ptr.bump(); + }, + } + } +} + +pub(crate) fn scan_raw_string(ptr: &mut Ptr) { + let mut hashes = 0; + while ptr.at('#') { + hashes += 1; + ptr.bump(); + } + if !ptr.at('"') { + return; + } + ptr.bump(); + + while let Some(c) = ptr.bump() { + if c == '"' { + let mut hashes_left = hashes; + while ptr.at('#') && hashes_left > 0{ + hashes_left -= 1; + ptr.bump(); + } + if hashes_left == 0 { + return; + } + } + } +} + +fn scan_byte(ptr: &mut Ptr) { + scan_char(ptr) +} + +fn scan_byte_string(ptr: &mut Ptr) { + scan_string(ptr) +} + +fn scan_raw_byte_string(ptr: &mut Ptr) { + if !ptr.at('"') { + return; + } + ptr.bump(); + + while let Some(c) = ptr.bump() { + if c == '"' { + return; + } + } +} -- cgit v1.2.3