From 171baf4c4863f035384c6c63a5f0ce531b01cf9d Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Fri, 29 Dec 2017 23:33:04 +0300
Subject: Simple identifier lexer

---
 src/lexer/mod.rs | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lexer/ptr.rs | 38 +++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 src/lexer/mod.rs
 create mode 100644 src/lexer/ptr.rs

(limited to 'src/lexer')

diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 000000000..136afb7b8
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,64 @@
+use unicode_xid::UnicodeXID;
+
+use {Token, SyntaxKind};
+use syntax_kinds::*;
+
+mod ptr;
+use self::ptr::Ptr;
+
+pub fn next_token(text: &str) -> Token {
+    assert!(!text.is_empty());
+    let mut ptr = Ptr::new(text);
+    let c = ptr.bump().unwrap();
+    let kind = next_token_inner(c, &mut ptr);
+    let len = ptr.into_len();
+    Token { kind, len }
+}
+
+fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    // Note: r as in r" or r#" is part of a raw string literal,
+    // b as in b' is part of a byte literal.
+    // They are not identifiers, and are handled further down.
+    let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
+    if ident_start {
+        loop {
+            match ptr.next() {
+                Some(c) if ident_continue(c) => {
+                    ptr.bump();
+                },
+                _ => break,
+            }
+        }
+        IDENT
+    } else {
+        WHITESPACE
+    }
+}
+
+fn ident_start(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || c == '_'
+        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
+}
+
+fn ident_continue(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || (c >= '0' && c <= '9')
+        || c == '_'
+        || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
+}
+
+
+fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
+    match (c, c1, c2) {
+        ('r', Some('"'), _) |
+        ('r', Some('#'), _) |
+        ('b', Some('"'), _) |
+        ('b', Some('\''), _) |
+        ('b', Some('r'), Some('"')) |
+        ('b', Some('r'), Some('#')) => true,
+        _ => false
+    }
+}
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
new file mode 100644
index 000000000..4638dac21
--- /dev/null
+++ b/src/lexer/ptr.rs
@@ -0,0 +1,38 @@
+use {TextUnit};
+
+use std::str::Chars;
+
+pub(crate) struct Ptr<'s> {
+    text: &'s str,
+    len: TextUnit,
+}
+
+impl<'s> Ptr<'s> {
+    pub fn new(text: &'s str) -> Ptr<'s> {
+        Ptr { text, len: TextUnit::new(0) }
+    }
+
+    pub fn into_len(self) -> TextUnit {
+        self.len
+    }
+
+    pub fn next(&self) -> Option<char> {
+        self.chars().next()
+    }
+
+    pub fn nnext(&self) -> Option<char> {
+        let mut chars = self.chars();
+        chars.next()?;
+        chars.next()
+    }
+
+    pub fn bump(&mut self) -> Option<char> {
+        let ch = self.chars().next()?;
+        self.len += TextUnit::len_of_char(ch);
+        Some(ch)
+    }
+
+    fn chars(&self) -> Chars {
+        self.text[self.len.0 as usize ..].chars()
+    }
+}
--
cgit v1.2.3