From 7c67612b8a894187fa3b64725531a5459f9211bf Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Fri, 10 Aug 2018 22:33:29 +0300 Subject: organizize --- src/lexer/classes.rs | 26 ------- src/lexer/comments.rs | 57 -------------- src/lexer/mod.rs | 209 -------------------------------------------------- src/lexer/numbers.rs | 67 ---------------- src/lexer/ptr.rs | 74 ------------------ src/lexer/strings.rs | 106 ------------------------- 6 files changed, 539 deletions(-) delete mode 100644 src/lexer/classes.rs delete mode 100644 src/lexer/comments.rs delete mode 100644 src/lexer/mod.rs delete mode 100644 src/lexer/numbers.rs delete mode 100644 src/lexer/ptr.rs delete mode 100644 src/lexer/strings.rs (limited to 'src/lexer') diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs deleted file mode 100644 index 4235d2648..000000000 --- a/src/lexer/classes.rs +++ /dev/null @@ -1,26 +0,0 @@ -use unicode_xid::UnicodeXID; - -pub fn is_ident_start(c: char) -> bool { - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || c == '_' - || (c > '\x7f' && UnicodeXID::is_xid_start(c)) -} - -pub fn is_ident_continue(c: char) -> bool { - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || (c >= '0' && c <= '9') - || c == '_' - || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) -} - -pub fn is_whitespace(c: char) -> bool { - //FIXME: use is_pattern_whitespace - //https://github.com/behnam/rust-unic/issues/192 - c.is_whitespace() -} - -pub fn is_dec_digit(c: char) -> bool { - '0' <= c && c <= '9' -} diff --git a/src/lexer/comments.rs b/src/lexer/comments.rs deleted file mode 100644 index 01acb6515..000000000 --- a/src/lexer/comments.rs +++ /dev/null @@ -1,57 +0,0 @@ -use lexer::ptr::Ptr; - -use SyntaxKind::{self, *}; - -pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { - if ptr.next_is('!') && ptr.nnext_is('/') { - ptr.bump(); - ptr.bump(); - bump_until_eol(ptr); - true - } else { - false - } -} - -fn scan_block_comment(ptr: &mut Ptr) -> Option<SyntaxKind> { - if ptr.next_is('*') { - 
ptr.bump(); - let mut depth: u32 = 1; - while depth > 0 { - if ptr.next_is('*') && ptr.nnext_is('/') { - depth -= 1; - ptr.bump(); - ptr.bump(); - } else if ptr.next_is('/') && ptr.nnext_is('*') { - depth += 1; - ptr.bump(); - ptr.bump(); - } else if ptr.bump().is_none() { - break; - } - } - Some(COMMENT) - } else { - None - } -} - -pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option<SyntaxKind> { - if ptr.next_is('/') { - bump_until_eol(ptr); - Some(COMMENT) - } else { - scan_block_comment(ptr) - } -} - -fn bump_until_eol(ptr: &mut Ptr) { - loop { - if ptr.next_is('\n') || ptr.next_is('\r') && ptr.nnext_is('\n') { - return; - } - if ptr.bump().is_none() { - break; - } - } -} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs deleted file mode 100644 index f8fdc41ac..000000000 --- a/src/lexer/mod.rs +++ /dev/null @@ -1,209 +0,0 @@ -mod classes; -mod comments; -mod numbers; -mod ptr; -mod strings; - -use { - SyntaxKind::{self, *}, - TextUnit, -}; - -use self::{ - classes::*, - comments::{scan_comment, scan_shebang}, - numbers::scan_number, - ptr::Ptr, - strings::{ - is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, - }, -}; - -/// A token of Rust source. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Token { - /// The kind of token. - pub kind: SyntaxKind, - /// The length of the token. 
- pub len: TextUnit, -} - -/// Break a string up into its component tokens -pub fn tokenize(text: &str) -> Vec<Token> { - let mut text = text; - let mut acc = Vec::new(); - while !text.is_empty() { - let token = next_token(text); - acc.push(token); - let len: u32 = token.len.into(); - text = &text[len as usize..]; - } - acc -} - -/// Get the next token from a string -pub fn next_token(text: &str) -> Token { - assert!(!text.is_empty()); - let mut ptr = Ptr::new(text); - let c = ptr.bump().unwrap(); - let kind = next_token_inner(c, &mut ptr); - let len = ptr.into_len(); - Token { kind, len } -} - -fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { - if is_whitespace(c) { - ptr.bump_while(is_whitespace); - return WHITESPACE; - } - - match c { - '#' => if scan_shebang(ptr) { - return SHEBANG; - }, - '/' => if let Some(kind) = scan_comment(ptr) { - return kind; - }, - _ => (), - } - - let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.next(), ptr.nnext()); - if ident_start { - return scan_ident(c, ptr); - } - - if is_dec_digit(c) { - let kind = scan_number(c, ptr); - scan_literal_suffix(ptr); - return kind; - } - - // One-byte tokens. - if let Some(kind) = SyntaxKind::from_char(c) { - return kind; - } - - match c { - // Multi-byte tokens. - '.' => { - return match (ptr.next(), ptr.nnext()) { - (Some('.'), Some('.')) => { - ptr.bump(); - ptr.bump(); - DOTDOTDOT - } - (Some('.'), Some('=')) => { - ptr.bump(); - ptr.bump(); - DOTDOTEQ - } - (Some('.'), _) => { - ptr.bump(); - DOTDOT - } - _ => DOT, - }; - } - ':' => { - return match ptr.next() { - Some(':') => { - ptr.bump(); - COLONCOLON - } - _ => COLON, - }; - } - '=' => { - return match ptr.next() { - Some('=') => { - ptr.bump(); - EQEQ - } - Some('>') => { - ptr.bump(); - FAT_ARROW - } - _ => EQ, - }; - } - '!' 
=> { - return match ptr.next() { - Some('=') => { - ptr.bump(); - NEQ - } - _ => EXCL, - }; - } - '-' => { - return if ptr.next_is('>') { - ptr.bump(); - THIN_ARROW - } else { - MINUS - }; - } - - // If the character is an ident start not followed by another single - // quote, then this is a lifetime name: - '\'' => { - return if ptr.next_is_p(is_ident_start) && !ptr.nnext_is('\'') { - ptr.bump(); - while ptr.next_is_p(is_ident_continue) { - ptr.bump(); - } - // lifetimes shouldn't end with a single quote - // if we find one, then this is an invalid character literal - if ptr.next_is('\'') { - ptr.bump(); - return CHAR; // TODO: error reporting - } - LIFETIME - } else { - scan_char(ptr); - scan_literal_suffix(ptr); - CHAR - }; - } - 'b' => { - let kind = scan_byte_char_or_string(ptr); - scan_literal_suffix(ptr); - return kind; - } - '"' => { - scan_string(ptr); - scan_literal_suffix(ptr); - return STRING; - } - 'r' => { - scan_raw_string(ptr); - scan_literal_suffix(ptr); - return RAW_STRING; - } - _ => (), - } - ERROR -} - -fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { - let is_single_letter = match ptr.next() { - None => true, - Some(c) if !is_ident_continue(c) => true, - _ => false, - }; - if is_single_letter { - return if c == '_' { UNDERSCORE } else { IDENT }; - } - ptr.bump_while(is_ident_continue); - if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { - return kind; - } - IDENT -} - -fn scan_literal_suffix(ptr: &mut Ptr) { - if ptr.next_is_p(is_ident_start) { - ptr.bump(); - } - ptr.bump_while(is_ident_continue); -} diff --git a/src/lexer/numbers.rs b/src/lexer/numbers.rs deleted file mode 100644 index 5c4641a2d..000000000 --- a/src/lexer/numbers.rs +++ /dev/null @@ -1,67 +0,0 @@ -use lexer::classes::*; -use lexer::ptr::Ptr; - -use SyntaxKind::{self, *}; - -pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { - if c == '0' { - match ptr.next().unwrap_or('\0') { - 'b' | 'o' => { - ptr.bump(); - scan_digits(ptr, false); 
- } - 'x' => { - ptr.bump(); - scan_digits(ptr, true); - } - '0'...'9' | '_' | '.' | 'e' | 'E' => { - scan_digits(ptr, true); - } - _ => return INT_NUMBER, - } - } else { - scan_digits(ptr, false); - } - - // might be a float, but don't be greedy if this is actually an - // integer literal followed by field/method access or a range pattern - // (`0..2` and `12.foo()`) - if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) { - // might have stuff after the ., and if it does, it needs to start - // with a number - ptr.bump(); - scan_digits(ptr, false); - scan_float_exponent(ptr); - return FLOAT_NUMBER; - } - // it might be a float if it has an exponent - if ptr.next_is('e') || ptr.next_is('E') { - scan_float_exponent(ptr); - return FLOAT_NUMBER; - } - INT_NUMBER -} - -fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { - while let Some(c) = ptr.next() { - match c { - '_' | '0'...'9' => { - ptr.bump(); - } - 'a'...'f' | 'A'...'F' if allow_hex => { - ptr.bump(); - } - _ => return, - } - } -} - -fn scan_float_exponent(ptr: &mut Ptr) { - if ptr.next_is('e') || ptr.next_is('E') { - ptr.bump(); - if ptr.next_is('-') || ptr.next_is('+') { - ptr.bump(); - } - scan_digits(ptr, false); - } -} diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs deleted file mode 100644 index d1391fd5f..000000000 --- a/src/lexer/ptr.rs +++ /dev/null @@ -1,74 +0,0 @@ -use TextUnit; - -use std::str::Chars; - -pub(crate) struct Ptr<'s> { - text: &'s str, - len: TextUnit, -} - -impl<'s> Ptr<'s> { - pub fn new(text: &'s str) -> Ptr<'s> { - Ptr { - text, - len: 0.into(), - } - } - - pub fn into_len(self) -> TextUnit { - self.len - } - - pub fn next(&self) -> Option<char> { - self.chars().next() - } - - pub fn nnext(&self) -> Option<char> { - let mut chars = self.chars(); - chars.next()?; - chars.next() - } - - pub fn next_is(&self, c: char) -> bool { - self.next() == Some(c) - } - - pub fn nnext_is(&self, c: char) -> bool { - self.nnext() == Some(c) - } - - pub fn next_is_p<P: Fn(char) -> bool>(&self, p: P) 
-> bool { - self.next().map(p) == Some(true) - } - - pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { - self.nnext().map(p) == Some(true) - } - - pub fn bump(&mut self) -> Option<char> { - let ch = self.chars().next()?; - self.len += TextUnit::of_char(ch); - Some(ch) - } - - pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) { - loop { - match self.next() { - Some(c) if pred(c) => { - self.bump(); - } - _ => return, - } - } - } - - pub fn current_token_text(&self) -> &str { - let len: u32 = self.len.into(); - &self.text[..len as usize] - } - - fn chars(&self) -> Chars { - let len: u32 = self.len.into(); - self.text[len as usize..].chars() - } -} diff --git a/src/lexer/strings.rs b/src/lexer/strings.rs deleted file mode 100644 index e3704fbb3..000000000 --- a/src/lexer/strings.rs +++ /dev/null @@ -1,106 +0,0 @@ -use SyntaxKind::{self, *}; - -use lexer::ptr::Ptr; - -pub(crate) fn is_string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { - match (c, c1, c2) { - ('r', Some('"'), _) - | ('r', Some('#'), _) - | ('b', Some('"'), _) - | ('b', Some('\''), _) - | ('b', Some('r'), Some('"')) - | ('b', Some('r'), Some('#')) => true, - _ => false, - } -} - -pub(crate) fn scan_char(ptr: &mut Ptr) { - if ptr.bump().is_none() { - return; // TODO: error reporting is upper in the stack - } - scan_char_or_byte(ptr); - if !ptr.next_is('\'') { - return; // TODO: error reporting - } - ptr.bump(); -} - -pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { - // unwrapping and not-exhaustive match are ok - // because of string_literal_start - let c = ptr.bump().unwrap(); - match c { - '\'' => { - scan_byte(ptr); - BYTE - } - '"' => { - scan_byte_string(ptr); - BYTE_STRING - } - 'r' => { - scan_raw_byte_string(ptr); - RAW_BYTE_STRING - } - _ => unreachable!(), - } -} - -pub(crate) fn scan_string(ptr: &mut Ptr) { - while let Some(c) = ptr.bump() { - if c == '"' { - return; - } - } -} - -pub(crate) fn scan_raw_string(ptr: &mut Ptr) { - if !ptr.next_is('"') { - return; - } - 
ptr.bump(); - - while let Some(c) = ptr.bump() { - if c == '"' { - return; - } - } -} - -fn scan_byte(ptr: &mut Ptr) { - if ptr.next_is('\'') { - ptr.bump(); - return; - } - ptr.bump(); - if ptr.next_is('\'') { - ptr.bump(); - return; - } -} - -fn scan_byte_string(ptr: &mut Ptr) { - while let Some(c) = ptr.bump() { - if c == '"' { - return; - } - } -} - -fn scan_raw_byte_string(ptr: &mut Ptr) { - if !ptr.next_is('"') { - return; - } - ptr.bump(); - - while let Some(c) = ptr.bump() { - if c == '"' { - return; - } - } -} - -fn scan_char_or_byte(ptr: &mut Ptr) { - //FIXME: deal with escape sequencies - ptr.bump(); -} -- cgit v1.2.3