From ddc637c16120fb352183698f635fc93a68580f7b Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Sat, 30 Dec 2017 15:22:40 +0300 Subject: Lexer: start numbers --- src/lexer/classes.rs | 4 +++ src/lexer/mod.rs | 91 ++++++++++++++++++++++++++++++++++++++++++++++------ src/lexer/ptr.rs | 12 +++++++ src/syntax_kinds.rs | 6 +++- 4 files changed, 102 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs index 7cc050bde..4235d2648 100644 --- a/src/lexer/classes.rs +++ b/src/lexer/classes.rs @@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool { //https://github.com/behnam/rust-unic/issues/192 c.is_whitespace() } + +pub fn is_dec_digit(c: char) -> bool { + '0' <= c && c <= '9' +} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 83a411cdd..afbbee4d0 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { // They are not identifiers, and are handled further down. 
let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); if ident_start { - let is_single_letter = match ptr.next() { - None => true, - Some(c) if !is_ident_continue(c) => true, - _ => false, - }; - if is_single_letter { - return if c == '_' { UNDERSCORE } else { IDENT }; - } - ptr.bump_while(is_ident_continue); - return IDENT; + return scan_ident(c, ptr); } if is_whitespace(c) { @@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { return WHITESPACE; } + if is_dec_digit(c) { + return scan_number(c, ptr); + } + ERROR } +fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { + let is_single_letter = match ptr.next() { + None => true, + Some(c) if !is_ident_continue(c) => true, + _ => false, + }; + if is_single_letter { + return if c == '_' { UNDERSCORE } else { IDENT }; + } + ptr.bump_while(is_ident_continue); + IDENT +} + +fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { + if c == '0' { + match ptr.next().unwrap_or('\0') { + 'b' | 'o' => { + ptr.bump(); + scan_digits(ptr, false); + } + 'x' => { + ptr.bump(); + scan_digits(ptr, true); + } + '0'...'9' | '_' | '.' 
| 'e' | 'E' => { + scan_digits(ptr, true); + } + _ => return INT_NUMBER, + } + } else { + scan_digits(ptr, false); + } + + // might be a float, but don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) { + // might have stuff after the ., and if it does, it needs to start + // with a number + ptr.bump(); + scan_digits(ptr, false); + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + // it might be a float if it has an exponent + if ptr.next_is('e') || ptr.next_is('E') { + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + INT_NUMBER +} + +fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { + while let Some(c) = ptr.next() { + match c { + '_' | '0'...'9' => { + ptr.bump(); + } + 'a'...'f' | 'A' ... 'F' if allow_hex => { + ptr.bump(); + } + _ => return + } + } +} + +fn scan_float_exponent(ptr: &mut Ptr) { + if ptr.next_is('e') || ptr.next_is('E') { + ptr.bump(); + if ptr.next_is('-') || ptr.next_is('+') { + ptr.bump(); + } + scan_digits(ptr, false); + } +} + fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { match (c, c1, c2) { ('r', Some('"'), _) | diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index e8aa6f37b..d441b826b 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs @@ -26,6 +26,18 @@ impl<'s> Ptr<'s> { chars.next() } + pub fn next_is(&self, c: char) -> bool { + self.next() == Some(c) + } + + pub fn nnext_is(&self, c: char) -> bool { + self.nnext() == Some(c) + } + + pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { + self.nnext().map(p) == Some(true) + } + pub fn bump(&mut self) -> Option<char> { let ch = self.chars().next()?; self.len += TextUnit::len_of_char(ch); diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index b9b47a2ed..bd1265bde 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs @@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0); pub const IDENT: 
SyntaxKind = SyntaxKind(1); pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); pub const WHITESPACE: SyntaxKind = SyntaxKind(3); +pub const INT_NUMBER: SyntaxKind = SyntaxKind(4); +pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5); -static INFOS: [SyntaxInfo; 4] = [ +static INFOS: [SyntaxInfo; 6] = [ SyntaxInfo { name: "ERROR" }, SyntaxInfo { name: "IDENT" }, SyntaxInfo { name: "UNDERSCORE" }, SyntaxInfo { name: "WHITESPACE" }, + SyntaxInfo { name: "INT_NUMBER" }, + SyntaxInfo { name: "FLOAT_NUMBER" }, ]; pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { -- cgit v1.2.3