diff options
author | Aleksey Kladov <[email protected]> | 2017-12-29 20:33:04 +0000 |
---|---|---|
committer | Aleksey Kladov <[email protected]> | 2017-12-29 20:33:04 +0000 |
commit | 171baf4c4863f035384c6c63a5f0ce531b01cf9d (patch) | |
tree | 8dd3885e6d02f64e41275c07bf15491477272182 /src/lexer | |
parent | 15af7ad36c507b17093ba86c393272819ff4b3cd (diff) |
Simple identifier lexer
Diffstat (limited to 'src/lexer')
-rw-r--r-- | src/lexer/mod.rs | 64 | ||||
-rw-r--r-- | src/lexer/ptr.rs | 38 |
2 files changed, 102 insertions, 0 deletions
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs new file mode 100644 index 000000000..136afb7b8 --- /dev/null +++ b/src/lexer/mod.rs | |||
@@ -0,0 +1,64 @@ | |||
1 | use unicode_xid::UnicodeXID; | ||
2 | |||
3 | use {Token, SyntaxKind}; | ||
4 | use syntax_kinds::*; | ||
5 | |||
6 | mod ptr; | ||
7 | use self::ptr::Ptr; | ||
8 | |||
9 | pub fn next_token(text: &str) -> Token { | ||
10 | assert!(!text.is_empty()); | ||
11 | let mut ptr = Ptr::new(text); | ||
12 | let c = ptr.bump().unwrap(); | ||
13 | let kind = next_token_inner(c, &mut ptr); | ||
14 | let len = ptr.into_len(); | ||
15 | Token { kind, len } | ||
16 | } | ||
17 | |||
18 | fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
19 | // Note: r as in r" or r#" is part of a raw string literal, | ||
20 | // b as in b' is part of a byte literal. | ||
21 | // They are not identifiers, and are handled further down. | ||
22 | let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); | ||
23 | if ident_start { | ||
24 | loop { | ||
25 | match ptr.next() { | ||
26 | Some(c) if ident_continue(c) => { | ||
27 | ptr.bump(); | ||
28 | }, | ||
29 | _ => break, | ||
30 | } | ||
31 | } | ||
32 | IDENT | ||
33 | } else { | ||
34 | WHITESPACE | ||
35 | } | ||
36 | } | ||
37 | |||
38 | fn ident_start(c: char) -> bool { | ||
39 | (c >= 'a' && c <= 'z') | ||
40 | || (c >= 'A' && c <= 'Z') | ||
41 | || c == '_' | ||
42 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) | ||
43 | } | ||
44 | |||
45 | fn ident_continue(c: char) -> bool { | ||
46 | (c >= 'a' && c <= 'z') | ||
47 | || (c >= 'A' && c <= 'Z') | ||
48 | || (c >= '0' && c <= '9') | ||
49 | || c == '_' | ||
50 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) | ||
51 | } | ||
52 | |||
53 | |||
/// Returns `true` if `c`, followed by the lookahead characters `c1` and `c2`,
/// opens a prefixed string-like literal rather than an identifier.
///
/// Recognised prefixes: `r"`, `r#`, `b"`, `b'`, `br"` and `br#`.
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    // A raw string body starts with either the opening quote or a `#` guard.
    let opens_raw = |ch: Option<char>| ch == Some('"') || ch == Some('#');

    match c {
        'r' => opens_raw(c1),
        'b' => match c1 {
            Some('"') | Some('\'') => true,
            // `br` needs one more character of lookahead to be a raw byte string.
            Some('r') => opens_raw(c2),
            _ => false,
        },
        _ => false,
    }
}
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs new file mode 100644 index 000000000..4638dac21 --- /dev/null +++ b/src/lexer/ptr.rs | |||
@@ -0,0 +1,38 @@ | |||
1 | use {TextUnit}; | ||
2 | |||
3 | use std::str::Chars; | ||
4 | |||
5 | pub(crate) struct Ptr<'s> { | ||
6 | text: &'s str, | ||
7 | len: TextUnit, | ||
8 | } | ||
9 | |||
10 | impl<'s> Ptr<'s> { | ||
11 | pub fn new(text: &'s str) -> Ptr<'s> { | ||
12 | Ptr { text, len: TextUnit::new(0) } | ||
13 | } | ||
14 | |||
15 | pub fn into_len(self) -> TextUnit { | ||
16 | self.len | ||
17 | } | ||
18 | |||
19 | pub fn next(&self) -> Option<char> { | ||
20 | self.chars().next() | ||
21 | } | ||
22 | |||
23 | pub fn nnext(&self) -> Option<char> { | ||
24 | let mut chars = self.chars(); | ||
25 | chars.next()?; | ||
26 | chars.next() | ||
27 | } | ||
28 | |||
29 | pub fn bump(&mut self) -> Option<char> { | ||
30 | let ch = self.chars().next()?; | ||
31 | self.len += TextUnit::len_of_char(ch); | ||
32 | Some(ch) | ||
33 | } | ||
34 | |||
35 | fn chars(&self) -> Chars { | ||
36 | self.text[self.len.0 as usize ..].chars() | ||
37 | } | ||
38 | } | ||