diff options
-rw-r--r-- | grammar.ron | 2 | ||||
-rw-r--r-- | src/lexer/classes.rs | 22 | ||||
-rw-r--r-- | src/lexer/mod.rs | 39 | ||||
-rw-r--r-- | src/lexer/ptr.rs | 11 | ||||
-rw-r--r-- | src/syntax_kinds.rs | 10 | ||||
-rw-r--r-- | tests/data/lexer/0002_whitespace.rs | 4 | ||||
-rw-r--r-- | tests/data/lexer/0002_whitespace.txt | 12 | ||||
-rw-r--r-- | validation.md | 4 |
8 files changed, 73 insertions, 31 deletions
diff --git a/grammar.ron b/grammar.ron index 18c382536..49b9c527c 100644 --- a/grammar.ron +++ b/grammar.ron | |||
@@ -1,6 +1,8 @@ | |||
1 | Grammar( | 1 | Grammar( |
2 | syntax_kinds: [ | 2 | syntax_kinds: [ |
3 | "ERROR", | ||
3 | "IDENT", | 4 | "IDENT", |
5 | "UNDERSCORE", | ||
4 | "WHITESPACE", | 6 | "WHITESPACE", |
5 | ] | 7 | ] |
6 | ) \ No newline at end of file | 8 | ) \ No newline at end of file |
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs new file mode 100644 index 000000000..7cc050bde --- /dev/null +++ b/src/lexer/classes.rs | |||
@@ -0,0 +1,22 @@ | |||
1 | use unicode_xid::UnicodeXID; | ||
2 | |||
3 | pub fn is_ident_start(c: char) -> bool { | ||
4 | (c >= 'a' && c <= 'z') | ||
5 | || (c >= 'A' && c <= 'Z') | ||
6 | || c == '_' | ||
7 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) | ||
8 | } | ||
9 | |||
10 | pub fn is_ident_continue(c: char) -> bool { | ||
11 | (c >= 'a' && c <= 'z') | ||
12 | || (c >= 'A' && c <= 'Z') | ||
13 | || (c >= '0' && c <= '9') | ||
14 | || c == '_' | ||
15 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) | ||
16 | } | ||
17 | |||
18 | pub fn is_whitespace(c: char) -> bool { | ||
19 | //FIXME: use is_pattern_whitespace | ||
20 | //https://github.com/behnam/rust-unic/issues/192 | ||
21 | c.is_whitespace() | ||
22 | } | ||
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 136afb7b8..dd3e2896d 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs | |||
@@ -1,11 +1,12 @@ | |||
1 | use unicode_xid::UnicodeXID; | ||
2 | |||
3 | use {Token, SyntaxKind}; | 1 | use {Token, SyntaxKind}; |
4 | use syntax_kinds::*; | 2 | use syntax_kinds::*; |
5 | 3 | ||
6 | mod ptr; | 4 | mod ptr; |
7 | use self::ptr::Ptr; | 5 | use self::ptr::Ptr; |
8 | 6 | ||
7 | mod classes; | ||
8 | use self::classes::*; | ||
9 | |||
9 | pub fn next_token(text: &str) -> Token { | 10 | pub fn next_token(text: &str) -> Token { |
10 | assert!(!text.is_empty()); | 11 | assert!(!text.is_empty()); |
11 | let mut ptr = Ptr::new(text); | 12 | let mut ptr = Ptr::new(text); |
@@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | |||
19 | // Note: r as in r" or r#" is part of a raw string literal, | 20 | // Note: r as in r" or r#" is part of a raw string literal, |
20 | // b as in b' is part of a byte literal. | 21 | // b as in b' is part of a byte literal. |
21 | // They are not identifiers, and are handled further down. | 22 | // They are not identifiers, and are handled further down. |
22 | let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); | 23 | let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); |
23 | if ident_start { | 24 | if ident_start { |
24 | loop { | 25 | ptr.bump_while(is_ident_continue); |
25 | match ptr.next() { | 26 | return IDENT; |
26 | Some(c) if ident_continue(c) => { | ||
27 | ptr.bump(); | ||
28 | }, | ||
29 | _ => break, | ||
30 | } | ||
31 | } | ||
32 | IDENT | ||
33 | } else { | ||
34 | WHITESPACE | ||
35 | } | 27 | } |
36 | } | ||
37 | 28 | ||
38 | fn ident_start(c: char) -> bool { | 29 | if is_whitespace(c) { |
39 | (c >= 'a' && c <= 'z') | 30 | ptr.bump_while(is_whitespace); |
40 | || (c >= 'A' && c <= 'Z') | 31 | return WHITESPACE; |
41 | || c == '_' | 32 | } |
42 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) | ||
43 | } | ||
44 | 33 | ||
45 | fn ident_continue(c: char) -> bool { | 34 | return ERROR |
46 | (c >= 'a' && c <= 'z') | ||
47 | || (c >= 'A' && c <= 'Z') | ||
48 | || (c >= '0' && c <= '9') | ||
49 | || c == '_' | ||
50 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) | ||
51 | } | 35 | } |
52 | 36 | ||
53 | |||
54 | fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { | 37 | fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { |
55 | match (c, c1, c2) { | 38 | match (c, c1, c2) { |
56 | ('r', Some('"'), _) | | 39 | ('r', Some('"'), _) | |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index 4638dac21..e8aa6f37b 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs | |||
@@ -32,6 +32,17 @@ impl<'s> Ptr<'s> { | |||
32 | Some(ch) | 32 | Some(ch) |
33 | } | 33 | } |
34 | 34 | ||
35 | pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) { | ||
36 | loop { | ||
37 | match self.next() { | ||
38 | Some(c) if pred(c) => { | ||
39 | self.bump(); | ||
40 | }, | ||
41 | _ => return, | ||
42 | } | ||
43 | } | ||
44 | } | ||
45 | |||
35 | fn chars(&self) -> Chars { | 46 | fn chars(&self) -> Chars { |
36 | self.text[self.len.0 as usize ..].chars() | 47 | self.text[self.len.0 as usize ..].chars() |
37 | } | 48 | } |
diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index 421cae15a..b9b47a2ed 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs | |||
@@ -1,11 +1,15 @@ | |||
1 | // Generated from grammar.ron | 1 | // Generated from grammar.ron |
2 | use tree::{SyntaxKind, SyntaxInfo}; | 2 | use tree::{SyntaxKind, SyntaxInfo}; |
3 | 3 | ||
4 | pub const IDENT: SyntaxKind = SyntaxKind(0); | 4 | pub const ERROR: SyntaxKind = SyntaxKind(0); |
5 | pub const WHITESPACE: SyntaxKind = SyntaxKind(1); | 5 | pub const IDENT: SyntaxKind = SyntaxKind(1); |
6 | pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); | ||
7 | pub const WHITESPACE: SyntaxKind = SyntaxKind(3); | ||
6 | 8 | ||
7 | static INFOS: [SyntaxInfo; 2] = [ | 9 | static INFOS: [SyntaxInfo; 4] = [ |
10 | SyntaxInfo { name: "ERROR" }, | ||
8 | SyntaxInfo { name: "IDENT" }, | 11 | SyntaxInfo { name: "IDENT" }, |
12 | SyntaxInfo { name: "UNDERSCORE" }, | ||
9 | SyntaxInfo { name: "WHITESPACE" }, | 13 | SyntaxInfo { name: "WHITESPACE" }, |
10 | ]; | 14 | ]; |
11 | 15 | ||
diff --git a/tests/data/lexer/0002_whitespace.rs b/tests/data/lexer/0002_whitespace.rs new file mode 100644 index 000000000..08fce1418 --- /dev/null +++ b/tests/data/lexer/0002_whitespace.rs | |||
@@ -0,0 +1,4 @@ | |||
1 | a b c | ||
2 | d | ||
3 | |||
4 | e f | ||
diff --git a/tests/data/lexer/0002_whitespace.txt b/tests/data/lexer/0002_whitespace.txt new file mode 100644 index 000000000..4b9885e4a --- /dev/null +++ b/tests/data/lexer/0002_whitespace.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | IDENT 1 | ||
2 | WHITESPACE 1 | ||
3 | IDENT 1 | ||
4 | WHITESPACE 2 | ||
5 | IDENT 1 | ||
6 | WHITESPACE 1 | ||
7 | IDENT 1 | ||
8 | WHITESPACE 2 | ||
9 | IDENT 1 | ||
10 | WHITESPACE 1 | ||
11 | IDENT 1 | ||
12 | WHITESPACE 1 | ||
diff --git a/validation.md b/validation.md new file mode 100644 index 000000000..9cfec5309 --- /dev/null +++ b/validation.md | |||
@@ -0,0 +1,4 @@ | |||
1 | Fixmes: | ||
2 | |||
3 | * Fix `is_whitespace`, add more test | ||
4 | |||