diff options
-rw-r--r-- | Cargo.toml | 2 | ||||
-rw-r--r-- | src/lexer.rs | 10 | ||||
-rw-r--r-- | src/lexer/mod.rs | 64 | ||||
-rw-r--r-- | src/lexer/ptr.rs | 38 | ||||
-rw-r--r-- | src/lib.rs | 2 | ||||
-rw-r--r-- | src/text.rs | 31 | ||||
-rw-r--r-- | tests/data/lexer/0001_hello.txt | 2 | ||||
-rw-r--r-- | tests/lexer.rs | 16 |
8 files changed, 146 insertions, 19 deletions
diff --git a/Cargo.toml b/Cargo.toml index 0afd4d327..063d52211 100644 --- a/Cargo.toml +++ b/Cargo.toml | |||
@@ -4,6 +4,8 @@ version = "0.1.0" | |||
4 | authors = ["Aleksey Kladov <[email protected]>"] | 4 | authors = ["Aleksey Kladov <[email protected]>"] |
5 | 5 | ||
6 | [dependencies] | 6 | [dependencies] |
7 | unicode-xid = "0.1.0" | ||
8 | |||
7 | serde = "1.0.26" | 9 | serde = "1.0.26" |
8 | serde_derive = "1.0.26" | 10 | serde_derive = "1.0.26" |
9 | file = "1.1.1" | 11 | file = "1.1.1" |
diff --git a/src/lexer.rs b/src/lexer.rs deleted file mode 100644 index cda9fe2b2..000000000 --- a/src/lexer.rs +++ /dev/null | |||
@@ -1,10 +0,0 @@ | |||
1 | use {Token, TextUnit}; | ||
2 | use syntax_kinds::*; | ||
3 | |||
4 | pub fn next_token(text: &str) -> Token { | ||
5 | let c = text.chars().next().unwrap(); | ||
6 | Token { | ||
7 | kind: IDENT, | ||
8 | len: TextUnit::len_of_char(c), | ||
9 | } | ||
10 | } \ No newline at end of file | ||
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs new file mode 100644 index 000000000..136afb7b8 --- /dev/null +++ b/src/lexer/mod.rs | |||
@@ -0,0 +1,64 @@ | |||
1 | use unicode_xid::UnicodeXID; | ||
2 | |||
3 | use {Token, SyntaxKind}; | ||
4 | use syntax_kinds::*; | ||
5 | |||
6 | mod ptr; | ||
7 | use self::ptr::Ptr; | ||
8 | |||
9 | pub fn next_token(text: &str) -> Token { | ||
10 | assert!(!text.is_empty()); | ||
11 | let mut ptr = Ptr::new(text); | ||
12 | let c = ptr.bump().unwrap(); | ||
13 | let kind = next_token_inner(c, &mut ptr); | ||
14 | let len = ptr.into_len(); | ||
15 | Token { kind, len } | ||
16 | } | ||
17 | |||
18 | fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
19 | // Note: r as in r" or r#" is part of a raw string literal, | ||
20 | // b as in b' is part of a byte literal. | ||
21 | // They are not identifiers, and are handled further down. | ||
22 | let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); | ||
23 | if ident_start { | ||
24 | loop { | ||
25 | match ptr.next() { | ||
26 | Some(c) if ident_continue(c) => { | ||
27 | ptr.bump(); | ||
28 | }, | ||
29 | _ => break, | ||
30 | } | ||
31 | } | ||
32 | IDENT | ||
33 | } else { | ||
34 | WHITESPACE | ||
35 | } | ||
36 | } | ||
37 | |||
38 | fn ident_start(c: char) -> bool { | ||
39 | (c >= 'a' && c <= 'z') | ||
40 | || (c >= 'A' && c <= 'Z') | ||
41 | || c == '_' | ||
42 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) | ||
43 | } | ||
44 | |||
45 | fn ident_continue(c: char) -> bool { | ||
46 | (c >= 'a' && c <= 'z') | ||
47 | || (c >= 'A' && c <= 'Z') | ||
48 | || (c >= '0' && c <= '9') | ||
49 | || c == '_' | ||
50 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) | ||
51 | } | ||
52 | |||
53 | |||
54 | fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { | ||
55 | match (c, c1, c2) { | ||
56 | ('r', Some('"'), _) | | ||
57 | ('r', Some('#'), _) | | ||
58 | ('b', Some('"'), _) | | ||
59 | ('b', Some('\''), _) | | ||
60 | ('b', Some('r'), Some('"')) | | ||
61 | ('b', Some('r'), Some('#')) => true, | ||
62 | _ => false | ||
63 | } | ||
64 | } | ||
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs new file mode 100644 index 000000000..4638dac21 --- /dev/null +++ b/src/lexer/ptr.rs | |||
@@ -0,0 +1,38 @@ | |||
1 | use {TextUnit}; | ||
2 | |||
3 | use std::str::Chars; | ||
4 | |||
5 | pub(crate) struct Ptr<'s> { | ||
6 | text: &'s str, | ||
7 | len: TextUnit, | ||
8 | } | ||
9 | |||
10 | impl<'s> Ptr<'s> { | ||
11 | pub fn new(text: &'s str) -> Ptr<'s> { | ||
12 | Ptr { text, len: TextUnit::new(0) } | ||
13 | } | ||
14 | |||
15 | pub fn into_len(self) -> TextUnit { | ||
16 | self.len | ||
17 | } | ||
18 | |||
19 | pub fn next(&self) -> Option<char> { | ||
20 | self.chars().next() | ||
21 | } | ||
22 | |||
23 | pub fn nnext(&self) -> Option<char> { | ||
24 | let mut chars = self.chars(); | ||
25 | chars.next()?; | ||
26 | chars.next() | ||
27 | } | ||
28 | |||
29 | pub fn bump(&mut self) -> Option<char> { | ||
30 | let ch = self.chars().next()?; | ||
31 | self.len += TextUnit::len_of_char(ch); | ||
32 | Some(ch) | ||
33 | } | ||
34 | |||
35 | fn chars(&self) -> Chars { | ||
36 | self.text[self.len.0 as usize ..].chars() | ||
37 | } | ||
38 | } | ||
diff --git a/src/lib.rs b/src/lib.rs index 4385c0325..3b9dbc8f7 100644 --- a/src/lib.rs +++ b/src/lib.rs | |||
@@ -1,3 +1,5 @@ | |||
1 | extern crate unicode_xid; | ||
2 | |||
1 | mod text; | 3 | mod text; |
2 | mod tree; | 4 | mod tree; |
3 | mod lexer; | 5 | mod lexer; |
diff --git a/src/text.rs b/src/text.rs index 5297275ed..31e67b456 100644 --- a/src/text.rs +++ b/src/text.rs | |||
@@ -1,7 +1,10 @@ | |||
1 | use std::fmt; | 1 | use std::fmt; |
2 | use std::ops; | ||
2 | 3 | ||
3 | #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] | 4 | #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] |
4 | pub struct TextUnit(u32); | 5 | pub struct TextUnit( |
6 | pub(crate) u32 | ||
7 | ); | ||
5 | 8 | ||
6 | impl TextUnit { | 9 | impl TextUnit { |
7 | pub fn len_of_char(c: char) -> TextUnit { | 10 | pub fn len_of_char(c: char) -> TextUnit { |
@@ -30,3 +33,29 @@ impl From<TextUnit> for u32 { | |||
30 | tu.0 | 33 | tu.0 |
31 | } | 34 | } |
32 | } | 35 | } |
36 | |||
37 | impl ops::Add<TextUnit> for TextUnit { | ||
38 | type Output = TextUnit; | ||
39 | fn add(self, rhs: TextUnit) -> TextUnit { | ||
40 | TextUnit(self.0 + rhs.0) | ||
41 | } | ||
42 | } | ||
43 | |||
44 | impl ops::AddAssign<TextUnit> for TextUnit { | ||
45 | fn add_assign(&mut self, rhs: TextUnit) { | ||
46 | self.0 += rhs.0 | ||
47 | } | ||
48 | } | ||
49 | |||
50 | impl ops::Sub<TextUnit> for TextUnit { | ||
51 | type Output = TextUnit; | ||
52 | fn sub(self, rhs: TextUnit) -> TextUnit { | ||
53 | TextUnit(self.0 - rhs.0) | ||
54 | } | ||
55 | } | ||
56 | |||
57 | impl ops::SubAssign<TextUnit> for TextUnit { | ||
58 | fn sub_assign(&mut self, rhs: TextUnit) { | ||
59 | self.0 -= rhs.0 | ||
60 | } | ||
61 | } \ No newline at end of file | ||
diff --git a/tests/data/lexer/0001_hello.txt b/tests/data/lexer/0001_hello.txt index 5bec9be80..e0b6a1f10 100644 --- a/tests/data/lexer/0001_hello.txt +++ b/tests/data/lexer/0001_hello.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | IDENT 5 | 1 | IDENT 5 |
2 | WHITESPACE 1 | 2 | WHITESPACE 1 |
3 | IDENT 5 \ No newline at end of file | 3 | IDENT 5 |
diff --git a/tests/lexer.rs b/tests/lexer.rs index a27e7c395..a3c8916b1 100644 --- a/tests/lexer.rs +++ b/tests/lexer.rs | |||
@@ -41,13 +41,15 @@ fn lexer_test_case(path: &Path) { | |||
41 | dump_tokens(&tokens) | 41 | dump_tokens(&tokens) |
42 | }; | 42 | }; |
43 | let expected = file::get_text(&path.with_extension("txt")).unwrap(); | 43 | let expected = file::get_text(&path.with_extension("txt")).unwrap(); |
44 | 44 | let expected = expected.as_str(); | |
45 | assert_diff!( | 45 | let actual = actual.as_str(); |
46 | expected.as_str(), | 46 | if expected == actual { |
47 | actual.as_str(), | 47 | return |
48 | "\n", | 48 | } |
49 | 0 | 49 | if expected.trim() == actual.trim() { |
50 | ) | 50 | panic!("Whitespace difference!") |
51 | } | ||
52 | assert_diff!(expected, actual, "\n", 0) | ||
51 | } | 53 | } |
52 | 54 | ||
53 | fn tokenize(text: &str) -> Vec<Token> { | 55 | fn tokenize(text: &str) -> Vec<Token> { |