aboutsummaryrefslogtreecommitdiff
path: root/src/lexer
diff options
context:
space:
mode:
authorAleksey Kladov <[email protected]>2017-12-29 20:33:04 +0000
committerAleksey Kladov <[email protected]>2017-12-29 20:33:04 +0000
commit171baf4c4863f035384c6c63a5f0ce531b01cf9d (patch)
tree8dd3885e6d02f64e41275c07bf15491477272182 /src/lexer
parent15af7ad36c507b17093ba86c393272819ff4b3cd (diff)
Simple identifier lexer
Diffstat (limited to 'src/lexer')
-rw-r--r--src/lexer/mod.rs64
-rw-r--r--src/lexer/ptr.rs38
2 files changed, 102 insertions, 0 deletions
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 000000000..136afb7b8
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,64 @@
1use unicode_xid::UnicodeXID;
2
3use {Token, SyntaxKind};
4use syntax_kinds::*;
5
6mod ptr;
7use self::ptr::Ptr;
8
9pub fn next_token(text: &str) -> Token {
10 assert!(!text.is_empty());
11 let mut ptr = Ptr::new(text);
12 let c = ptr.bump().unwrap();
13 let kind = next_token_inner(c, &mut ptr);
14 let len = ptr.into_len();
15 Token { kind, len }
16}
17
18fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
19 // Note: r as in r" or r#" is part of a raw string literal,
20 // b as in b' is part of a byte literal.
21 // They are not identifiers, and are handled further down.
22 let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
23 if ident_start {
24 loop {
25 match ptr.next() {
26 Some(c) if ident_continue(c) => {
27 ptr.bump();
28 },
29 _ => break,
30 }
31 }
32 IDENT
33 } else {
34 WHITESPACE
35 }
36}
37
38fn ident_start(c: char) -> bool {
39 (c >= 'a' && c <= 'z')
40 || (c >= 'A' && c <= 'Z')
41 || c == '_'
42 || (c > '\x7f' && UnicodeXID::is_xid_start(c))
43}
44
45fn ident_continue(c: char) -> bool {
46 (c >= 'a' && c <= 'z')
47 || (c >= 'A' && c <= 'Z')
48 || (c >= '0' && c <= '9')
49 || c == '_'
50 || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
51}
52
53
/// Returns `true` if `c`, followed by `c1` and then `c2`, opens a prefixed
/// string-like literal rather than an identifier.
///
/// Recognised openings are `r"`, `r#`, `b"`, `b'`, `br"` and `br#`.
/// `c2` is only inspected for the `br` forms.
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match c {
        // Raw string: r"..." or r#"..."#
        'r' => c1 == Some('"') || c1 == Some('#'),
        'b' => match c1 {
            // Byte string b"..." or byte literal b'.'
            Some('"') | Some('\'') => true,
            // Raw byte string: br"..." or br#"..."#
            Some('r') => c2 == Some('"') || c2 == Some('#'),
            _ => false,
        },
        _ => false,
    }
}
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
new file mode 100644
index 000000000..4638dac21
--- /dev/null
+++ b/src/lexer/ptr.rs
@@ -0,0 +1,38 @@
1use {TextUnit};
2
3use std::str::Chars;
4
5pub(crate) struct Ptr<'s> {
6 text: &'s str,
7 len: TextUnit,
8}
9
10impl<'s> Ptr<'s> {
11 pub fn new(text: &'s str) -> Ptr<'s> {
12 Ptr { text, len: TextUnit::new(0) }
13 }
14
15 pub fn into_len(self) -> TextUnit {
16 self.len
17 }
18
19 pub fn next(&self) -> Option<char> {
20 self.chars().next()
21 }
22
23 pub fn nnext(&self) -> Option<char> {
24 let mut chars = self.chars();
25 chars.next()?;
26 chars.next()
27 }
28
29 pub fn bump(&mut self) -> Option<char> {
30 let ch = self.chars().next()?;
31 self.len += TextUnit::len_of_char(ch);
32 Some(ch)
33 }
34
35 fn chars(&self) -> Chars {
36 self.text[self.len.0 as usize ..].chars()
37 }
38}