From 7c67612b8a894187fa3b64725531a5459f9211bf Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Fri, 10 Aug 2018 22:33:29 +0300 Subject: organizize --- crates/libsyntax2/src/lexer/classes.rs | 26 ++++ crates/libsyntax2/src/lexer/comments.rs | 57 +++++++++ crates/libsyntax2/src/lexer/mod.rs | 209 ++++++++++++++++++++++++++++++++ crates/libsyntax2/src/lexer/numbers.rs | 67 ++++++++++ crates/libsyntax2/src/lexer/ptr.rs | 74 +++++++++++ crates/libsyntax2/src/lexer/strings.rs | 106 ++++++++++++++++ 6 files changed, 539 insertions(+) create mode 100644 crates/libsyntax2/src/lexer/classes.rs create mode 100644 crates/libsyntax2/src/lexer/comments.rs create mode 100644 crates/libsyntax2/src/lexer/mod.rs create mode 100644 crates/libsyntax2/src/lexer/numbers.rs create mode 100644 crates/libsyntax2/src/lexer/ptr.rs create mode 100644 crates/libsyntax2/src/lexer/strings.rs (limited to 'crates/libsyntax2/src/lexer') diff --git a/crates/libsyntax2/src/lexer/classes.rs b/crates/libsyntax2/src/lexer/classes.rs new file mode 100644 index 000000000..4235d2648 --- /dev/null +++ b/crates/libsyntax2/src/lexer/classes.rs @@ -0,0 +1,26 @@ +use unicode_xid::UnicodeXID; + +pub fn is_ident_start(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_start(c)) +} + +pub fn is_ident_continue(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) +} + +pub fn is_whitespace(c: char) -> bool { + //FIXME: use is_pattern_whitespace + //https://github.com/behnam/rust-unic/issues/192 + c.is_whitespace() +} + +pub fn is_dec_digit(c: char) -> bool { + '0' <= c && c <= '9' +} diff --git a/crates/libsyntax2/src/lexer/comments.rs b/crates/libsyntax2/src/lexer/comments.rs new file mode 100644 index 000000000..01acb6515 --- /dev/null +++ b/crates/libsyntax2/src/lexer/comments.rs @@ -0,0 +1,57 @@ +use lexer::ptr::Ptr; + +use SyntaxKind::{self, *}; + +pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { + if ptr.next_is('!') && ptr.nnext_is('/') { + ptr.bump(); + ptr.bump(); + bump_until_eol(ptr); + true + } else { + false + } +} + +fn scan_block_comment(ptr: &mut Ptr) -> Option { + if ptr.next_is('*') { + ptr.bump(); + let mut depth: u32 = 1; + while depth > 0 { + if ptr.next_is('*') && ptr.nnext_is('/') { + depth -= 1; + ptr.bump(); + ptr.bump(); + } else if ptr.next_is('/') && ptr.nnext_is('*') { + depth += 1; + ptr.bump(); + ptr.bump(); + } else if ptr.bump().is_none() { + break; + } + } + Some(COMMENT) + } else { + None + } +} + +pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option { + if ptr.next_is('/') { + bump_until_eol(ptr); + Some(COMMENT) + } else { + scan_block_comment(ptr) + } +} + +fn bump_until_eol(ptr: &mut Ptr) { + loop { + if ptr.next_is('\n') || ptr.next_is('\r') && ptr.nnext_is('\n') { + return; + } + if ptr.bump().is_none() { + break; + } + } +} diff --git a/crates/libsyntax2/src/lexer/mod.rs b/crates/libsyntax2/src/lexer/mod.rs new file mode 100644 index 000000000..f8fdc41ac --- /dev/null +++ b/crates/libsyntax2/src/lexer/mod.rs @@ -0,0 +1,209 @@ +mod classes; +mod comments; +mod numbers; +mod ptr; +mod strings; + +use { + SyntaxKind::{self, *}, + TextUnit, +}; + +use self::{ + classes::*, + comments::{scan_comment, scan_shebang}, + numbers::scan_number, + ptr::Ptr, + strings::{ + is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, + }, +}; + +/// A token of Rust source. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Token { + /// The kind of token. + pub kind: SyntaxKind, + /// The length of the token. + pub len: TextUnit, +} + +/// Break a string up into its component tokens +pub fn tokenize(text: &str) -> Vec { + let mut text = text; + let mut acc = Vec::new(); + while !text.is_empty() { + let token = next_token(text); + acc.push(token); + let len: u32 = token.len.into(); + text = &text[len as usize..]; + } + acc +} + +/// Get the next token from a string +pub fn next_token(text: &str) -> Token { + assert!(!text.is_empty()); + let mut ptr = Ptr::new(text); + let c = ptr.bump().unwrap(); + let kind = next_token_inner(c, &mut ptr); + let len = ptr.into_len(); + Token { kind, len } +} + +fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { + if is_whitespace(c) { + ptr.bump_while(is_whitespace); + return WHITESPACE; + } + + match c { + '#' => if scan_shebang(ptr) { + return SHEBANG; + }, + '/' => if let Some(kind) = scan_comment(ptr) { + return kind; + }, + _ => (), + } + + let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.next(), ptr.nnext()); + if ident_start { + return scan_ident(c, ptr); + } + + if is_dec_digit(c) { + let kind = scan_number(c, ptr); + scan_literal_suffix(ptr); + return kind; + } + + // One-byte tokens. + if let Some(kind) = SyntaxKind::from_char(c) { + return kind; + } + + match c { + // Multi-byte tokens. + '.' => { + return match (ptr.next(), ptr.nnext()) { + (Some('.'), Some('.')) => { + ptr.bump(); + ptr.bump(); + DOTDOTDOT + } + (Some('.'), Some('=')) => { + ptr.bump(); + ptr.bump(); + DOTDOTEQ + } + (Some('.'), _) => { + ptr.bump(); + DOTDOT + } + _ => DOT, + }; + } + ':' => { + return match ptr.next() { + Some(':') => { + ptr.bump(); + COLONCOLON + } + _ => COLON, + }; + } + '=' => { + return match ptr.next() { + Some('=') => { + ptr.bump(); + EQEQ + } + Some('>') => { + ptr.bump(); + FAT_ARROW + } + _ => EQ, + }; + } + '!' => { + return match ptr.next() { + Some('=') => { + ptr.bump(); + NEQ + } + _ => EXCL, + }; + } + '-' => { + return if ptr.next_is('>') { + ptr.bump(); + THIN_ARROW + } else { + MINUS + }; + } + + // If the character is an ident start not followed by another single + // quote, then this is a lifetime name: + '\'' => { + return if ptr.next_is_p(is_ident_start) && !ptr.nnext_is('\'') { + ptr.bump(); + while ptr.next_is_p(is_ident_continue) { + ptr.bump(); + } + // lifetimes shouldn't end with a single quote + // if we find one, then this is an invalid character literal + if ptr.next_is('\'') { + ptr.bump(); + return CHAR; // TODO: error reporting + } + LIFETIME + } else { + scan_char(ptr); + scan_literal_suffix(ptr); + CHAR + }; + } + 'b' => { + let kind = scan_byte_char_or_string(ptr); + scan_literal_suffix(ptr); + return kind; + } + '"' => { + scan_string(ptr); + scan_literal_suffix(ptr); + return STRING; + } + 'r' => { + scan_raw_string(ptr); + scan_literal_suffix(ptr); + return RAW_STRING; + } + _ => (), + } + ERROR +} + +fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { + let is_single_letter = match ptr.next() { + None => true, + Some(c) if !is_ident_continue(c) => true, + _ => false, + }; + if is_single_letter { + return if c == '_' { UNDERSCORE } else { IDENT }; + } + ptr.bump_while(is_ident_continue); + if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { + return kind; + } + IDENT +} + +fn scan_literal_suffix(ptr: &mut Ptr) { + if ptr.next_is_p(is_ident_start) { + ptr.bump(); + } + ptr.bump_while(is_ident_continue); +} diff --git a/crates/libsyntax2/src/lexer/numbers.rs b/crates/libsyntax2/src/lexer/numbers.rs new file mode 100644 index 000000000..5c4641a2d --- /dev/null +++ b/crates/libsyntax2/src/lexer/numbers.rs @@ -0,0 +1,67 @@ +use lexer::classes::*; +use lexer::ptr::Ptr; + +use SyntaxKind::{self, *}; + +pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { + if c == '0' { + match ptr.next().unwrap_or('\0') { + 'b' | 'o' => { + ptr.bump(); + scan_digits(ptr, false); + } + 'x' => { + ptr.bump(); + scan_digits(ptr, true); + } + '0'...'9' | '_' | '.' | 'e' | 'E' => { + scan_digits(ptr, true); + } + _ => return INT_NUMBER, + } + } else { + scan_digits(ptr, false); + } + + // might be a float, but don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) { + // might have stuff after the ., and if it does, it needs to start + // with a number + ptr.bump(); + scan_digits(ptr, false); + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + // it might be a float if it has an exponent + if ptr.next_is('e') || ptr.next_is('E') { + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + INT_NUMBER +} + +fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { + while let Some(c) = ptr.next() { + match c { + '_' | '0'...'9' => { + ptr.bump(); + } + 'a'...'f' | 'A'...'F' if allow_hex => { + ptr.bump(); + } + _ => return, + } + } +} + +fn scan_float_exponent(ptr: &mut Ptr) { + if ptr.next_is('e') || ptr.next_is('E') { + ptr.bump(); + if ptr.next_is('-') || ptr.next_is('+') { + ptr.bump(); + } + scan_digits(ptr, false); + } +} diff --git a/crates/libsyntax2/src/lexer/ptr.rs b/crates/libsyntax2/src/lexer/ptr.rs new file mode 100644 index 000000000..d1391fd5f --- /dev/null +++ b/crates/libsyntax2/src/lexer/ptr.rs @@ -0,0 +1,74 @@ +use TextUnit; + +use std::str::Chars; + +pub(crate) struct Ptr<'s> { + text: &'s str, + len: TextUnit, +} + +impl<'s> Ptr<'s> { + pub fn new(text: &'s str) -> Ptr<'s> { + Ptr { + text, + len: 0.into(), + } + } + + pub fn into_len(self) -> TextUnit { + self.len + } + + pub fn next(&self) -> Option { + self.chars().next() + } + + pub fn nnext(&self) -> Option { + let mut chars = self.chars(); + chars.next()?; + chars.next() + } + + pub fn next_is(&self, c: char) -> bool { + self.next() == Some(c) + } + + pub fn nnext_is(&self, c: char) -> bool { + self.nnext() == Some(c) + } + + pub fn next_is_p bool>(&self, p: P) -> bool { + self.next().map(p) == Some(true) + } + + pub fn nnext_is_p bool>(&self, p: P) -> bool { + self.nnext().map(p) == Some(true) + } + + pub fn bump(&mut self) -> Option { + let ch = self.chars().next()?; + self.len += TextUnit::of_char(ch); + Some(ch) + } + + pub fn bump_while bool>(&mut self, pred: F) { + loop { + match self.next() { + Some(c) if pred(c) => { + self.bump(); + } + _ => return, + } + } + } + + pub fn current_token_text(&self) -> &str { + let len: u32 = self.len.into(); + &self.text[..len as usize] + } + + fn chars(&self) -> Chars { + let len: u32 = self.len.into(); + self.text[len as usize..].chars() + } +} diff --git a/crates/libsyntax2/src/lexer/strings.rs b/crates/libsyntax2/src/lexer/strings.rs new file mode 100644 index 000000000..e3704fbb3 --- /dev/null +++ b/crates/libsyntax2/src/lexer/strings.rs @@ -0,0 +1,106 @@ +use SyntaxKind::{self, *}; + +use lexer::ptr::Ptr; + +pub(crate) fn is_string_literal_start(c: char, c1: Option, c2: Option) -> bool { + match (c, c1, c2) { + ('r', Some('"'), _) + | ('r', Some('#'), _) + | ('b', Some('"'), _) + | ('b', Some('\''), _) + | ('b', Some('r'), Some('"')) + | ('b', Some('r'), Some('#')) => true, + _ => false, + } +} + +pub(crate) fn scan_char(ptr: &mut Ptr) { + if ptr.bump().is_none() { + return; // TODO: error reporting is upper in the stack + } + scan_char_or_byte(ptr); + if !ptr.next_is('\'') { + return; // TODO: error reporting + } + ptr.bump(); +} + +pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { + // unwrapping and not-exhaustive match are ok + // because of string_literal_start + let c = ptr.bump().unwrap(); + match c { + '\'' => { + scan_byte(ptr); + BYTE + } + '"' => { + scan_byte_string(ptr); + BYTE_STRING + } + 'r' => { + scan_raw_byte_string(ptr); + RAW_BYTE_STRING + } + _ => unreachable!(), + } +} + +pub(crate) fn scan_string(ptr: &mut Ptr) { + while let Some(c) = ptr.bump() { + if c == '"' { + return; + } + } +} + +pub(crate) fn scan_raw_string(ptr: &mut Ptr) { + if !ptr.next_is('"') { + return; + } + ptr.bump(); + + while let Some(c) = ptr.bump() { + if c == '"' { + return; + } + } +} + +fn scan_byte(ptr: &mut Ptr) { + if ptr.next_is('\'') { + ptr.bump(); + return; + } + ptr.bump(); + if ptr.next_is('\'') { + ptr.bump(); + return; + } +} + +fn scan_byte_string(ptr: &mut Ptr) { + while let Some(c) = ptr.bump() { + if c == '"' { + return; + } + } +} + +fn scan_raw_byte_string(ptr: &mut Ptr) { + if !ptr.next_is('"') { + return; + } + ptr.bump(); + + while let Some(c) = ptr.bump() { + if c == '"' { + return; + } + } +} + +fn scan_char_or_byte(ptr: &mut Ptr) { + //FIXME: deal with escape sequencies + ptr.bump(); +} -- cgit v1.2.3