aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- Cargo.toml 2
-rw-r--r-- src/lexer.rs 10
-rw-r--r-- src/lexer/mod.rs 64
-rw-r--r-- src/lexer/ptr.rs 38
-rw-r--r-- src/lib.rs 2
-rw-r--r-- src/text.rs 31
-rw-r--r-- tests/data/lexer/0001_hello.txt 2
-rw-r--r-- tests/lexer.rs 16
8 files changed, 146 insertions, 19 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 0afd4d327..063d52211 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,8 @@ version = "0.1.0"
4authors = ["Aleksey Kladov <[email protected]>"] 4authors = ["Aleksey Kladov <[email protected]>"]
5 5
6[dependencies] 6[dependencies]
7unicode-xid = "0.1.0"
8
7serde = "1.0.26" 9serde = "1.0.26"
8serde_derive = "1.0.26" 10serde_derive = "1.0.26"
9file = "1.1.1" 11file = "1.1.1"
diff --git a/src/lexer.rs b/src/lexer.rs
deleted file mode 100644
index cda9fe2b2..000000000
--- a/src/lexer.rs
+++ /dev/null
@@ -1,10 +0,0 @@
1use {Token, TextUnit};
2use syntax_kinds::*;
3
4pub fn next_token(text: &str) -> Token {
5 let c = text.chars().next().unwrap();
6 Token {
7 kind: IDENT,
8 len: TextUnit::len_of_char(c),
9 }
10} \ No newline at end of file
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 000000000..136afb7b8
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,64 @@
1use unicode_xid::UnicodeXID;
2
3use {Token, SyntaxKind};
4use syntax_kinds::*;
5
6mod ptr;
7use self::ptr::Ptr;
8
9pub fn next_token(text: &str) -> Token {
10 assert!(!text.is_empty());
11 let mut ptr = Ptr::new(text);
12 let c = ptr.bump().unwrap();
13 let kind = next_token_inner(c, &mut ptr);
14 let len = ptr.into_len();
15 Token { kind, len }
16}
17
18fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
19 // Note: r as in r" or r#" is part of a raw string literal,
20 // b as in b' is part of a byte literal.
21 // They are not identifiers, and are handled further down.
22 let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
23 if ident_start {
24 loop {
25 match ptr.next() {
26 Some(c) if ident_continue(c) => {
27 ptr.bump();
28 },
29 _ => break,
30 }
31 }
32 IDENT
33 } else {
34 WHITESPACE
35 }
36}
37
38fn ident_start(c: char) -> bool {
39 (c >= 'a' && c <= 'z')
40 || (c >= 'A' && c <= 'Z')
41 || c == '_'
42 || (c > '\x7f' && UnicodeXID::is_xid_start(c))
43}
44
45fn ident_continue(c: char) -> bool {
46 (c >= 'a' && c <= 'z')
47 || (c >= 'A' && c <= 'Z')
48 || (c >= '0' && c <= '9')
49 || c == '_'
50 || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
51}
52
53
/// Returns `true` if `c`, followed by `c1` and then `c2`, opens a prefixed
/// string or byte literal rather than an identifier.
///
/// Recognised openings:
/// - `r"` and `r#`  (raw string)
/// - `b"` and `b'`  (byte string / byte literal)
/// - `br"` and `br#` (raw byte string)
///
/// `c1` / `c2` are `None` when the input ends before that position.
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match c {
        'r' => c1 == Some('"') || c1 == Some('#'),
        'b' => match c1 {
            Some('"') | Some('\'') => true,
            // `br` is only a literal prefix when a quote or hash follows.
            Some('r') => c2 == Some('"') || c2 == Some('#'),
            _ => false,
        },
        _ => false,
    }
}
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
new file mode 100644
index 000000000..4638dac21
--- /dev/null
+++ b/src/lexer/ptr.rs
@@ -0,0 +1,38 @@
1use {TextUnit};
2
3use std::str::Chars;
4
5pub(crate) struct Ptr<'s> {
6 text: &'s str,
7 len: TextUnit,
8}
9
10impl<'s> Ptr<'s> {
11 pub fn new(text: &'s str) -> Ptr<'s> {
12 Ptr { text, len: TextUnit::new(0) }
13 }
14
15 pub fn into_len(self) -> TextUnit {
16 self.len
17 }
18
19 pub fn next(&self) -> Option<char> {
20 self.chars().next()
21 }
22
23 pub fn nnext(&self) -> Option<char> {
24 let mut chars = self.chars();
25 chars.next()?;
26 chars.next()
27 }
28
29 pub fn bump(&mut self) -> Option<char> {
30 let ch = self.chars().next()?;
31 self.len += TextUnit::len_of_char(ch);
32 Some(ch)
33 }
34
35 fn chars(&self) -> Chars {
36 self.text[self.len.0 as usize ..].chars()
37 }
38}
diff --git a/src/lib.rs b/src/lib.rs
index 4385c0325..3b9dbc8f7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,5 @@
1extern crate unicode_xid;
2
1mod text; 3mod text;
2mod tree; 4mod tree;
3mod lexer; 5mod lexer;
diff --git a/src/text.rs b/src/text.rs
index 5297275ed..31e67b456 100644
--- a/src/text.rs
+++ b/src/text.rs
@@ -1,7 +1,10 @@
1use std::fmt; 1use std::fmt;
2use std::ops;
2 3
3#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 4#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
4pub struct TextUnit(u32); 5pub struct TextUnit(
6 pub(crate) u32
7);
5 8
6impl TextUnit { 9impl TextUnit {
7 pub fn len_of_char(c: char) -> TextUnit { 10 pub fn len_of_char(c: char) -> TextUnit {
@@ -30,3 +33,29 @@ impl From<TextUnit> for u32 {
30 tu.0 33 tu.0
31 } 34 }
32} 35}
36
37impl ops::Add<TextUnit> for TextUnit {
38 type Output = TextUnit;
39 fn add(self, rhs: TextUnit) -> TextUnit {
40 TextUnit(self.0 + rhs.0)
41 }
42}
43
44impl ops::AddAssign<TextUnit> for TextUnit {
45 fn add_assign(&mut self, rhs: TextUnit) {
46 self.0 += rhs.0
47 }
48}
49
50impl ops::Sub<TextUnit> for TextUnit {
51 type Output = TextUnit;
52 fn sub(self, rhs: TextUnit) -> TextUnit {
53 TextUnit(self.0 - rhs.0)
54 }
55}
56
57impl ops::SubAssign<TextUnit> for TextUnit {
58 fn sub_assign(&mut self, rhs: TextUnit) {
59 self.0 -= rhs.0
60 }
61} \ No newline at end of file
diff --git a/tests/data/lexer/0001_hello.txt b/tests/data/lexer/0001_hello.txt
index 5bec9be80..e0b6a1f10 100644
--- a/tests/data/lexer/0001_hello.txt
+++ b/tests/data/lexer/0001_hello.txt
@@ -1,3 +1,3 @@
1IDENT 5 1IDENT 5
2WHITESPACE 1 2WHITESPACE 1
3IDENT 5 \ No newline at end of file 3IDENT 5
diff --git a/tests/lexer.rs b/tests/lexer.rs
index a27e7c395..a3c8916b1 100644
--- a/tests/lexer.rs
+++ b/tests/lexer.rs
@@ -41,13 +41,15 @@ fn lexer_test_case(path: &Path) {
41 dump_tokens(&tokens) 41 dump_tokens(&tokens)
42 }; 42 };
43 let expected = file::get_text(&path.with_extension("txt")).unwrap(); 43 let expected = file::get_text(&path.with_extension("txt")).unwrap();
44 44 let expected = expected.as_str();
45 assert_diff!( 45 let actual = actual.as_str();
46 expected.as_str(), 46 if expected == actual {
47 actual.as_str(), 47 return
48 "\n", 48 }
49 0 49 if expected.trim() == actual.trim() {
50 ) 50 panic!("Whitespace difference!")
51 }
52 assert_diff!(expected, actual, "\n", 0)
51} 53}
52 54
53fn tokenize(text: &str) -> Vec<Token> { 55fn tokenize(text: &str) -> Vec<Token> {