diff options
author | Aleksey Kladov <[email protected]> | 2017-12-30 12:22:40 +0000 |
---|---|---|
committer | Aleksey Kladov <[email protected]> | 2017-12-30 12:23:38 +0000 |
commit | ddc637c16120fb352183698f635fc93a68580f7b (patch) | |
tree | 288f1497551f2667af693157f2451be40c25d697 | |
parent | 8103772a10f00378c4dcdd09f9af310c23146933 (diff) |
Lexer: start numbers
-rw-r--r-- | grammar.ron | 2 | ||||
-rw-r--r-- | src/lexer/classes.rs | 4 | ||||
-rw-r--r-- | src/lexer/mod.rs | 91 | ||||
-rw-r--r-- | src/lexer/ptr.rs | 12 | ||||
-rw-r--r-- | src/syntax_kinds.rs | 6 | ||||
-rw-r--r-- | tests/data/lexer/0004_number.rs | 7 | ||||
-rw-r--r-- | tests/data/lexer/0004_number.txt | 62 | ||||
-rw-r--r-- | validation.md | 4 |
8 files changed, 176 insertions, 12 deletions
diff --git a/grammar.ron b/grammar.ron index 49b9c527c..a86fe693f 100644 --- a/grammar.ron +++ b/grammar.ron | |||
@@ -4,5 +4,7 @@ Grammar( | |||
4 | "IDENT", | 4 | "IDENT", |
5 | "UNDERSCORE", | 5 | "UNDERSCORE", |
6 | "WHITESPACE", | 6 | "WHITESPACE", |
7 | "INT_NUMBER", | ||
8 | "FLOAT_NUMBER", | ||
7 | ] | 9 | ] |
8 | ) \ No newline at end of file | 10 | ) \ No newline at end of file |
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs index 7cc050bde..4235d2648 100644 --- a/src/lexer/classes.rs +++ b/src/lexer/classes.rs | |||
@@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool { | |||
20 | //https://github.com/behnam/rust-unic/issues/192 | 20 | //https://github.com/behnam/rust-unic/issues/192 |
21 | c.is_whitespace() | 21 | c.is_whitespace() |
22 | } | 22 | } |
23 | |||
24 | pub fn is_dec_digit(c: char) -> bool { | ||
25 | '0' <= c && c <= '9' | ||
26 | } | ||
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 83a411cdd..afbbee4d0 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs | |||
@@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | |||
22 | // They are not identifiers, and are handled further down. | 22 | // They are not identifiers, and are handled further down. |
23 | let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); | 23 | let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); |
24 | if ident_start { | 24 | if ident_start { |
25 | let is_single_letter = match ptr.next() { | 25 | return scan_ident(c, ptr); |
26 | None => true, | ||
27 | Some(c) if !is_ident_continue(c) => true, | ||
28 | _ => false, | ||
29 | }; | ||
30 | if is_single_letter { | ||
31 | return if c == '_' { UNDERSCORE } else { IDENT }; | ||
32 | } | ||
33 | ptr.bump_while(is_ident_continue); | ||
34 | return IDENT; | ||
35 | } | 26 | } |
36 | 27 | ||
37 | if is_whitespace(c) { | 28 | if is_whitespace(c) { |
@@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | |||
39 | return WHITESPACE; | 30 | return WHITESPACE; |
40 | } | 31 | } |
41 | 32 | ||
33 | if is_dec_digit(c) { | ||
34 | return scan_number(c, ptr); | ||
35 | } | ||
36 | |||
42 | ERROR | 37 | ERROR |
43 | } | 38 | } |
44 | 39 | ||
40 | fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
41 | let is_single_letter = match ptr.next() { | ||
42 | None => true, | ||
43 | Some(c) if !is_ident_continue(c) => true, | ||
44 | _ => false, | ||
45 | }; | ||
46 | if is_single_letter { | ||
47 | return if c == '_' { UNDERSCORE } else { IDENT }; | ||
48 | } | ||
49 | ptr.bump_while(is_ident_continue); | ||
50 | IDENT | ||
51 | } | ||
52 | |||
53 | fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
54 | if c == '0' { | ||
55 | match ptr.next().unwrap_or('\0') { | ||
56 | 'b' | 'o' => { | ||
57 | ptr.bump(); | ||
58 | scan_digits(ptr, false); | ||
59 | } | ||
60 | 'x' => { | ||
61 | ptr.bump(); | ||
62 | scan_digits(ptr, true); | ||
63 | } | ||
64 | '0'...'9' | '_' | '.' | 'e' | 'E' => { | ||
65 | scan_digits(ptr, true); | ||
66 | } | ||
67 | _ => return INT_NUMBER, | ||
68 | } | ||
69 | } else { | ||
70 | scan_digits(ptr, false); | ||
71 | } | ||
72 | |||
73 | // might be a float, but don't be greedy if this is actually an | ||
74 | // integer literal followed by field/method access or a range pattern | ||
75 | // (`0..2` and `12.foo()`) | ||
76 | if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) { | ||
77 | // might have stuff after the ., and if it does, it needs to start | ||
78 | // with a number | ||
79 | ptr.bump(); | ||
80 | scan_digits(ptr, false); | ||
81 | scan_float_exponent(ptr); | ||
82 | return FLOAT_NUMBER; | ||
83 | } | ||
84 | // it might be a float if it has an exponent | ||
85 | if ptr.next_is('e') || ptr.next_is('E') { | ||
86 | scan_float_exponent(ptr); | ||
87 | return FLOAT_NUMBER; | ||
88 | } | ||
89 | INT_NUMBER | ||
90 | } | ||
91 | |||
92 | fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { | ||
93 | while let Some(c) = ptr.next() { | ||
94 | match c { | ||
95 | '_' | '0'...'9' => { | ||
96 | ptr.bump(); | ||
97 | } | ||
98 | 'a'...'f' | 'A' ... 'F' if allow_hex => { | ||
99 | ptr.bump(); | ||
100 | } | ||
101 | _ => return | ||
102 | } | ||
103 | } | ||
104 | } | ||
105 | |||
106 | fn scan_float_exponent(ptr: &mut Ptr) { | ||
107 | if ptr.next_is('e') || ptr.next_is('E') { | ||
108 | ptr.bump(); | ||
109 | if ptr.next_is('-') || ptr.next_is('+') { | ||
110 | ptr.bump(); | ||
111 | } | ||
112 | scan_digits(ptr, false); | ||
113 | } | ||
114 | } | ||
115 | |||
45 | fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { | 116 | fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { |
46 | match (c, c1, c2) { | 117 | match (c, c1, c2) { |
47 | ('r', Some('"'), _) | | 118 | ('r', Some('"'), _) | |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index e8aa6f37b..d441b826b 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs | |||
@@ -26,6 +26,18 @@ impl<'s> Ptr<'s> { | |||
26 | chars.next() | 26 | chars.next() |
27 | } | 27 | } |
28 | 28 | ||
29 | pub fn next_is(&self, c: char) -> bool { | ||
30 | self.next() == Some(c) | ||
31 | } | ||
32 | |||
33 | pub fn nnext_is(&self, c: char) -> bool { | ||
34 | self.nnext() == Some(c) | ||
35 | } | ||
36 | |||
37 | pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { | ||
38 | self.nnext().map(p) == Some(true) | ||
39 | } | ||
40 | |||
29 | pub fn bump(&mut self) -> Option<char> { | 41 | pub fn bump(&mut self) -> Option<char> { |
30 | let ch = self.chars().next()?; | 42 | let ch = self.chars().next()?; |
31 | self.len += TextUnit::len_of_char(ch); | 43 | self.len += TextUnit::len_of_char(ch); |
diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index b9b47a2ed..bd1265bde 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs | |||
@@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0); | |||
5 | pub const IDENT: SyntaxKind = SyntaxKind(1); | 5 | pub const IDENT: SyntaxKind = SyntaxKind(1); |
6 | pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); | 6 | pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); |
7 | pub const WHITESPACE: SyntaxKind = SyntaxKind(3); | 7 | pub const WHITESPACE: SyntaxKind = SyntaxKind(3); |
8 | pub const INT_NUMBER: SyntaxKind = SyntaxKind(4); | ||
9 | pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5); | ||
8 | 10 | ||
9 | static INFOS: [SyntaxInfo; 4] = [ | 11 | static INFOS: [SyntaxInfo; 6] = [ |
10 | SyntaxInfo { name: "ERROR" }, | 12 | SyntaxInfo { name: "ERROR" }, |
11 | SyntaxInfo { name: "IDENT" }, | 13 | SyntaxInfo { name: "IDENT" }, |
12 | SyntaxInfo { name: "UNDERSCORE" }, | 14 | SyntaxInfo { name: "UNDERSCORE" }, |
13 | SyntaxInfo { name: "WHITESPACE" }, | 15 | SyntaxInfo { name: "WHITESPACE" }, |
16 | SyntaxInfo { name: "INT_NUMBER" }, | ||
17 | SyntaxInfo { name: "FLOAT_NUMBER" }, | ||
14 | ]; | 18 | ]; |
15 | 19 | ||
16 | pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { | 20 | pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { |
diff --git a/tests/data/lexer/0004_number.rs b/tests/data/lexer/0004_number.rs new file mode 100644 index 000000000..af53ff2cd --- /dev/null +++ b/tests/data/lexer/0004_number.rs | |||
@@ -0,0 +1,7 @@ | |||
1 | 0 0b 0o 0x 00 0_ 0. 0e 0E 0z | ||
2 | 01790 0b1790 0o1790 0x1790aAbBcCdDeEfF 001279 0_1279 0.1279 0e1279 0E1279 | ||
3 | 0..2 | ||
4 | 0.foo() | ||
5 | 0e+1 | ||
6 | 0.e+1 | ||
7 | 0.0E-2 | ||
diff --git a/tests/data/lexer/0004_number.txt b/tests/data/lexer/0004_number.txt new file mode 100644 index 000000000..e9ad8410d --- /dev/null +++ b/tests/data/lexer/0004_number.txt | |||
@@ -0,0 +1,62 @@ | |||
1 | INT_NUMBER 1 | ||
2 | WHITESPACE 1 | ||
3 | INT_NUMBER 2 | ||
4 | WHITESPACE 1 | ||
5 | INT_NUMBER 2 | ||
6 | WHITESPACE 1 | ||
7 | INT_NUMBER 2 | ||
8 | WHITESPACE 1 | ||
9 | INT_NUMBER 2 | ||
10 | WHITESPACE 1 | ||
11 | INT_NUMBER 2 | ||
12 | WHITESPACE 1 | ||
13 | FLOAT_NUMBER 2 | ||
14 | WHITESPACE 1 | ||
15 | INT_NUMBER 2 | ||
16 | WHITESPACE 1 | ||
17 | INT_NUMBER 2 | ||
18 | WHITESPACE 1 | ||
19 | INT_NUMBER 1 | ||
20 | IDENT 1 | ||
21 | WHITESPACE 1 | ||
22 | INT_NUMBER 5 | ||
23 | WHITESPACE 1 | ||
24 | INT_NUMBER 6 | ||
25 | WHITESPACE 1 | ||
26 | INT_NUMBER 6 | ||
27 | WHITESPACE 1 | ||
28 | INT_NUMBER 18 | ||
29 | WHITESPACE 1 | ||
30 | INT_NUMBER 6 | ||
31 | WHITESPACE 1 | ||
32 | INT_NUMBER 6 | ||
33 | WHITESPACE 1 | ||
34 | FLOAT_NUMBER 6 | ||
35 | WHITESPACE 1 | ||
36 | INT_NUMBER 6 | ||
37 | WHITESPACE 1 | ||
38 | INT_NUMBER 6 | ||
39 | WHITESPACE 1 | ||
40 | INT_NUMBER 1 | ||
41 | ERROR 1 | ||
42 | ERROR 1 | ||
43 | INT_NUMBER 1 | ||
44 | WHITESPACE 1 | ||
45 | INT_NUMBER 1 | ||
46 | ERROR 1 | ||
47 | IDENT 3 | ||
48 | ERROR 1 | ||
49 | ERROR 1 | ||
50 | WHITESPACE 1 | ||
51 | INT_NUMBER 2 | ||
52 | ERROR 1 | ||
53 | INT_NUMBER 1 | ||
54 | WHITESPACE 1 | ||
55 | INT_NUMBER 1 | ||
56 | ERROR 1 | ||
57 | IDENT 1 | ||
58 | ERROR 1 | ||
59 | INT_NUMBER 1 | ||
60 | WHITESPACE 1 | ||
61 | FLOAT_NUMBER 6 | ||
62 | WHITESPACE 1 | ||
diff --git a/validation.md b/validation.md index 3706760ba..b21ffebd5 100644 --- a/validation.md +++ b/validation.md | |||
@@ -1,5 +1,7 @@ | |||
1 | Fixmes: | 1 | Fixmes: |
2 | 2 | ||
3 | * Fix `is_whitespace`, add more test | 3 | * Fix `is_whitespace`, add more tests |
4 | * Add more thorough tests for idents for XID_Start & XID_Continue | 4 | * Add more thorough tests for idents for XID_Start & XID_Continue |
5 | * Validate that float and integer literals use digits only of the appropriate | ||
6 | base, and are in range | ||
5 | 7 | ||