From 171baf4c4863f035384c6c63a5f0ce531b01cf9d Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Fri, 29 Dec 2017 23:33:04 +0300 Subject: Simple identifier lexer --- src/lexer.rs | 10 --------- src/lexer/mod.rs | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lexer/ptr.rs | 38 +++++++++++++++++++++++++++++++++ src/lib.rs | 2 ++ src/text.rs | 31 ++++++++++++++++++++++++++- 5 files changed, 134 insertions(+), 11 deletions(-) delete mode 100644 src/lexer.rs create mode 100644 src/lexer/mod.rs create mode 100644 src/lexer/ptr.rs (limited to 'src') diff --git a/src/lexer.rs b/src/lexer.rs deleted file mode 100644 index cda9fe2b2..000000000 --- a/src/lexer.rs +++ /dev/null @@ -1,10 +0,0 @@ -use {Token, TextUnit}; -use syntax_kinds::*; - -pub fn next_token(text: &str) -> Token { - let c = text.chars().next().unwrap(); - Token { - kind: IDENT, - len: TextUnit::len_of_char(c), - } -} \ No newline at end of file diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs new file mode 100644 index 000000000..136afb7b8 --- /dev/null +++ b/src/lexer/mod.rs @@ -0,0 +1,64 @@ +use unicode_xid::UnicodeXID; + +use {Token, SyntaxKind}; +use syntax_kinds::*; + +mod ptr; +use self::ptr::Ptr; + +pub fn next_token(text: &str) -> Token { + assert!(!text.is_empty()); + let mut ptr = Ptr::new(text); + let c = ptr.bump().unwrap(); + let kind = next_token_inner(c, &mut ptr); + let len = ptr.into_len(); + Token { kind, len } +} + +fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { + // Note: r as in r" or r#" is part of a raw string literal, + // b as in b' is part of a byte literal. + // They are not identifiers, and are handled further down. 
+ let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); + if ident_start { + loop { + match ptr.next() { + Some(c) if ident_continue(c) => { + ptr.bump(); + }, + _ => break, + } + } + IDENT + } else { + WHITESPACE + } +} + +fn ident_start(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_start(c)) +} + +fn ident_continue(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) +} + + +fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { + match (c, c1, c2) { + ('r', Some('"'), _) | + ('r', Some('#'), _) | + ('b', Some('"'), _) | + ('b', Some('\''), _) | + ('b', Some('r'), Some('"')) | + ('b', Some('r'), Some('#')) => true, + _ => false + } +} diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs new file mode 100644 index 000000000..4638dac21 --- /dev/null +++ b/src/lexer/ptr.rs @@ -0,0 +1,38 @@ +use {TextUnit}; + +use std::str::Chars; + +pub(crate) struct Ptr<'s> { + text: &'s str, + len: TextUnit, +} + +impl<'s> Ptr<'s> { + pub fn new(text: &'s str) -> Ptr<'s> { + Ptr { text, len: TextUnit::new(0) } + } + + pub fn into_len(self) -> TextUnit { + self.len + } + + pub fn next(&self) -> Option<char> { + self.chars().next() + } + + pub fn nnext(&self) -> Option<char> { + let mut chars = self.chars(); + chars.next()?; + chars.next() + } + + pub fn bump(&mut self) -> Option<char> { + let ch = self.chars().next()?; + self.len += TextUnit::len_of_char(ch); + Some(ch) + } + + fn chars(&self) -> Chars { + self.text[self.len.0 as usize ..].chars() + } +} diff --git a/src/lib.rs b/src/lib.rs index 4385c0325..3b9dbc8f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +extern crate unicode_xid; + mod text; mod tree; mod lexer; diff --git a/src/text.rs b/src/text.rs index 5297275ed..31e67b456 100644 --- a/src/text.rs +++ b/src/text.rs @@ -1,7 +1,10 @@ use std::fmt; 
+use std::ops; #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct TextUnit(u32); +pub struct TextUnit( + pub(crate) u32 +); impl TextUnit { pub fn len_of_char(c: char) -> TextUnit { @@ -30,3 +33,29 @@ impl From<TextUnit> for u32 { tu.0 } } + +impl ops::Add for TextUnit { + type Output = TextUnit; + fn add(self, rhs: TextUnit) -> TextUnit { + TextUnit(self.0 + rhs.0) + } +} + +impl ops::AddAssign for TextUnit { + fn add_assign(&mut self, rhs: TextUnit) { + self.0 += rhs.0 + } +} + +impl ops::Sub for TextUnit { + type Output = TextUnit; + fn sub(self, rhs: TextUnit) -> TextUnit { + TextUnit(self.0 - rhs.0) + } +} + +impl ops::SubAssign for TextUnit { + fn sub_assign(&mut self, rhs: TextUnit) { + self.0 -= rhs.0 + } +} \ No newline at end of file -- cgit v1.2.3