about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- grammar.ron 2
-rw-r--r-- src/lexer/classes.rs 22
-rw-r--r-- src/lexer/mod.rs 39
-rw-r--r-- src/lexer/ptr.rs 11
-rw-r--r-- src/syntax_kinds.rs 10
-rw-r--r-- tests/data/lexer/0002_whitespace.rs 4
-rw-r--r-- tests/data/lexer/0002_whitespace.txt 12
-rw-r--r-- validation.md 4
8 files changed, 73 insertions, 31 deletions
diff --git a/grammar.ron b/grammar.ron
index 18c382536..49b9c527c 100644
--- a/grammar.ron
+++ b/grammar.ron
@@ -1,6 +1,8 @@
 Grammar(
     syntax_kinds: [
+        "ERROR",
         "IDENT",
+        "UNDERSCORE",
         "WHITESPACE",
     ]
 )
\ No newline at end of file
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs
new file mode 100644
index 000000000..7cc050bde
--- /dev/null
+++ b/src/lexer/classes.rs
@@ -0,0 +1,22 @@
+use unicode_xid::UnicodeXID;
+
+pub fn is_ident_start(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || c == '_'
+        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
+}
+
+pub fn is_ident_continue(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || (c >= '0' && c <= '9')
+        || c == '_'
+        || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
+}
+
+pub fn is_whitespace(c: char) -> bool {
+    //FIXME: use is_pattern_whitespace
+    //https://github.com/behnam/rust-unic/issues/192
+    c.is_whitespace()
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 136afb7b8..dd3e2896d 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -1,11 +1,12 @@
-use unicode_xid::UnicodeXID;
-
 use {Token, SyntaxKind};
 use syntax_kinds::*;
 
 mod ptr;
 use self::ptr::Ptr;
 
+mod classes;
+use self::classes::*;
+
 pub fn next_token(text: &str) -> Token {
     assert!(!text.is_empty());
     let mut ptr = Ptr::new(text);
@@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
     // Note: r as in r" or r#" is part of a raw string literal,
     // b as in b' is part of a byte literal.
     // They are not identifiers, and are handled further down.
-    let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
+    let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
     if ident_start {
-        loop {
-            match ptr.next() {
-                Some(c) if ident_continue(c) => {
-                    ptr.bump();
-                },
-                _ => break,
-            }
-        }
-        IDENT
-    } else {
-        WHITESPACE
+        ptr.bump_while(is_ident_continue);
+        return IDENT;
     }
-}
 
-fn ident_start(c: char) -> bool {
-    (c >= 'a' && c <= 'z')
-        || (c >= 'A' && c <= 'Z')
-        || c == '_'
-        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
-}
+    if is_whitespace(c) {
+        ptr.bump_while(is_whitespace);
+        return WHITESPACE;
+    }
 
-fn ident_continue(c: char) -> bool {
-    (c >= 'a' && c <= 'z')
-        || (c >= 'A' && c <= 'Z')
-        || (c >= '0' && c <= '9')
-        || c == '_'
-        || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
+    return ERROR
 }
 
-
 fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
     match (c, c1, c2) {
         ('r', Some('"'), _) |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
index 4638dac21..e8aa6f37b 100644
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@@ -32,6 +32,17 @@ impl<'s> Ptr<'s> {
         Some(ch)
     }
 
+    pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
+        loop {
+            match self.next() {
+                Some(c) if pred(c) => {
+                    self.bump();
+                },
+                _ => return,
+            }
+        }
+    }
+
     fn chars(&self) -> Chars {
         self.text[self.len.0 as usize ..].chars()
     }
diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs
index 421cae15a..b9b47a2ed 100644
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@@ -1,11 +1,15 @@
 // Generated from grammar.ron
 use tree::{SyntaxKind, SyntaxInfo};
 
-pub const IDENT: SyntaxKind = SyntaxKind(0);
-pub const WHITESPACE: SyntaxKind = SyntaxKind(1);
+pub const ERROR: SyntaxKind = SyntaxKind(0);
+pub const IDENT: SyntaxKind = SyntaxKind(1);
+pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
+pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
 
-static INFOS: [SyntaxInfo; 2] = [
+static INFOS: [SyntaxInfo; 4] = [
+    SyntaxInfo { name: "ERROR" },
     SyntaxInfo { name: "IDENT" },
+    SyntaxInfo { name: "UNDERSCORE" },
     SyntaxInfo { name: "WHITESPACE" },
 ];
 
diff --git a/tests/data/lexer/0002_whitespace.rs b/tests/data/lexer/0002_whitespace.rs
new file mode 100644
index 000000000..08fce1418
--- /dev/null
+++ b/tests/data/lexer/0002_whitespace.rs
@@ -0,0 +1,4 @@
+a b  c
+d
+
+e f
diff --git a/tests/data/lexer/0002_whitespace.txt b/tests/data/lexer/0002_whitespace.txt
new file mode 100644
index 000000000..4b9885e4a
--- /dev/null
+++ b/tests/data/lexer/0002_whitespace.txt
@@ -0,0 +1,12 @@
+IDENT 1
+WHITESPACE 1
+IDENT 1
+WHITESPACE 2
+IDENT 1
+WHITESPACE 1
+IDENT 1
+WHITESPACE 2
+IDENT 1
+WHITESPACE 1
+IDENT 1
+WHITESPACE 1
diff --git a/validation.md b/validation.md
new file mode 100644
index 000000000..9cfec5309
--- /dev/null
+++ b/validation.md
@@ -0,0 +1,4 @@
+Fixmes:
+
+* Fix `is_whitespace`, add more test
+