aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--grammar.ron2
-rw-r--r--src/lexer/classes.rs4
-rw-r--r--src/lexer/mod.rs91
-rw-r--r--src/lexer/ptr.rs12
-rw-r--r--src/syntax_kinds.rs6
-rw-r--r--tests/data/lexer/0004_number.rs7
-rw-r--r--tests/data/lexer/0004_number.txt62
-rw-r--r--validation.md4
8 files changed, 176 insertions, 12 deletions
diff --git a/grammar.ron b/grammar.ron
index 49b9c527c..a86fe693f 100644
--- a/grammar.ron
+++ b/grammar.ron
@@ -4,5 +4,7 @@ Grammar(
4 "IDENT", 4 "IDENT",
5 "UNDERSCORE", 5 "UNDERSCORE",
6 "WHITESPACE", 6 "WHITESPACE",
7 "INT_NUMBER",
8 "FLOAT_NUMBER",
7 ] 9 ]
8) \ No newline at end of file 10) \ No newline at end of file
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs
index 7cc050bde..4235d2648 100644
--- a/src/lexer/classes.rs
+++ b/src/lexer/classes.rs
@@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool {
20 //https://github.com/behnam/rust-unic/issues/192 20 //https://github.com/behnam/rust-unic/issues/192
21 c.is_whitespace() 21 c.is_whitespace()
22} 22}
23
/// Returns `true` when `c` is one of the ASCII decimal digits `0` through `9`.
///
/// Digits from other scripts are not accepted.
pub fn is_dec_digit(c: char) -> bool {
    // `char::is_digit` only recognises ASCII characters, so radix 10 is
    // exactly the range '0'..'9'.
    c.is_digit(10)
}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 83a411cdd..afbbee4d0 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
22 // They are not identifiers, and are handled further down. 22 // They are not identifiers, and are handled further down.
23 let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); 23 let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
24 if ident_start { 24 if ident_start {
25 let is_single_letter = match ptr.next() { 25 return scan_ident(c, ptr);
26 None => true,
27 Some(c) if !is_ident_continue(c) => true,
28 _ => false,
29 };
30 if is_single_letter {
31 return if c == '_' { UNDERSCORE } else { IDENT };
32 }
33 ptr.bump_while(is_ident_continue);
34 return IDENT;
35 } 26 }
36 27
37 if is_whitespace(c) { 28 if is_whitespace(c) {
@@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
39 return WHITESPACE; 30 return WHITESPACE;
40 } 31 }
41 32
33 if is_dec_digit(c) {
34 return scan_number(c, ptr);
35 }
36
42 ERROR 37 ERROR
43} 38}
44 39
40fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
41 let is_single_letter = match ptr.next() {
42 None => true,
43 Some(c) if !is_ident_continue(c) => true,
44 _ => false,
45 };
46 if is_single_letter {
47 return if c == '_' { UNDERSCORE } else { IDENT };
48 }
49 ptr.bump_while(is_ident_continue);
50 IDENT
51}
52
53fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
54 if c == '0' {
55 match ptr.next().unwrap_or('\0') {
56 'b' | 'o' => {
57 ptr.bump();
58 scan_digits(ptr, false);
59 }
60 'x' => {
61 ptr.bump();
62 scan_digits(ptr, true);
63 }
64 '0'...'9' | '_' | '.' | 'e' | 'E' => {
65 scan_digits(ptr, true);
66 }
67 _ => return INT_NUMBER,
68 }
69 } else {
70 scan_digits(ptr, false);
71 }
72
73 // might be a float, but don't be greedy if this is actually an
74 // integer literal followed by field/method access or a range pattern
75 // (`0..2` and `12.foo()`)
76 if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) {
77 // might have stuff after the ., and if it does, it needs to start
78 // with a number
79 ptr.bump();
80 scan_digits(ptr, false);
81 scan_float_exponent(ptr);
82 return FLOAT_NUMBER;
83 }
84 // it might be a float if it has an exponent
85 if ptr.next_is('e') || ptr.next_is('E') {
86 scan_float_exponent(ptr);
87 return FLOAT_NUMBER;
88 }
89 INT_NUMBER
90}
91
92fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
93 while let Some(c) = ptr.next() {
94 match c {
95 '_' | '0'...'9' => {
96 ptr.bump();
97 }
98 'a'...'f' | 'A' ... 'F' if allow_hex => {
99 ptr.bump();
100 }
101 _ => return
102 }
103 }
104}
105
106fn scan_float_exponent(ptr: &mut Ptr) {
107 if ptr.next_is('e') || ptr.next_is('E') {
108 ptr.bump();
109 if ptr.next_is('-') || ptr.next_is('+') {
110 ptr.bump();
111 }
112 scan_digits(ptr, false);
113 }
114}
115
45fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { 116fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
46 match (c, c1, c2) { 117 match (c, c1, c2) {
47 ('r', Some('"'), _) | 118 ('r', Some('"'), _) |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
index e8aa6f37b..d441b826b 100644
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@@ -26,6 +26,18 @@ impl<'s> Ptr<'s> {
26 chars.next() 26 chars.next()
27 } 27 }
28 28
29 pub fn next_is(&self, c: char) -> bool {
30 self.next() == Some(c)
31 }
32
33 pub fn nnext_is(&self, c: char) -> bool {
34 self.nnext() == Some(c)
35 }
36
37 pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
38 self.nnext().map(p) == Some(true)
39 }
40
29 pub fn bump(&mut self) -> Option<char> { 41 pub fn bump(&mut self) -> Option<char> {
30 let ch = self.chars().next()?; 42 let ch = self.chars().next()?;
31 self.len += TextUnit::len_of_char(ch); 43 self.len += TextUnit::len_of_char(ch);
diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs
index b9b47a2ed..bd1265bde 100644
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0);
5pub const IDENT: SyntaxKind = SyntaxKind(1); 5pub const IDENT: SyntaxKind = SyntaxKind(1);
6pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); 6pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
7pub const WHITESPACE: SyntaxKind = SyntaxKind(3); 7pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
8pub const INT_NUMBER: SyntaxKind = SyntaxKind(4);
9pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5);
8 10
9static INFOS: [SyntaxInfo; 4] = [ 11static INFOS: [SyntaxInfo; 6] = [
10 SyntaxInfo { name: "ERROR" }, 12 SyntaxInfo { name: "ERROR" },
11 SyntaxInfo { name: "IDENT" }, 13 SyntaxInfo { name: "IDENT" },
12 SyntaxInfo { name: "UNDERSCORE" }, 14 SyntaxInfo { name: "UNDERSCORE" },
13 SyntaxInfo { name: "WHITESPACE" }, 15 SyntaxInfo { name: "WHITESPACE" },
16 SyntaxInfo { name: "INT_NUMBER" },
17 SyntaxInfo { name: "FLOAT_NUMBER" },
14]; 18];
15 19
16pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { 20pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {
diff --git a/tests/data/lexer/0004_number.rs b/tests/data/lexer/0004_number.rs
new file mode 100644
index 000000000..af53ff2cd
--- /dev/null
+++ b/tests/data/lexer/0004_number.rs
@@ -0,0 +1,7 @@
10 0b 0o 0x 00 0_ 0. 0e 0E 0z
201790 0b1790 0o1790 0x1790aAbBcCdDeEfF 001279 0_1279 0.1279 0e1279 0E1279
30..2
40.foo()
50e+1
60.e+1
70.0E-2
diff --git a/tests/data/lexer/0004_number.txt b/tests/data/lexer/0004_number.txt
new file mode 100644
index 000000000..e9ad8410d
--- /dev/null
+++ b/tests/data/lexer/0004_number.txt
@@ -0,0 +1,62 @@
1INT_NUMBER 1
2WHITESPACE 1
3INT_NUMBER 2
4WHITESPACE 1
5INT_NUMBER 2
6WHITESPACE 1
7INT_NUMBER 2
8WHITESPACE 1
9INT_NUMBER 2
10WHITESPACE 1
11INT_NUMBER 2
12WHITESPACE 1
13FLOAT_NUMBER 2
14WHITESPACE 1
15INT_NUMBER 2
16WHITESPACE 1
17INT_NUMBER 2
18WHITESPACE 1
19INT_NUMBER 1
20IDENT 1
21WHITESPACE 1
22INT_NUMBER 5
23WHITESPACE 1
24INT_NUMBER 6
25WHITESPACE 1
26INT_NUMBER 6
27WHITESPACE 1
28INT_NUMBER 18
29WHITESPACE 1
30INT_NUMBER 6
31WHITESPACE 1
32INT_NUMBER 6
33WHITESPACE 1
34FLOAT_NUMBER 6
35WHITESPACE 1
36INT_NUMBER 6
37WHITESPACE 1
38INT_NUMBER 6
39WHITESPACE 1
40INT_NUMBER 1
41ERROR 1
42ERROR 1
43INT_NUMBER 1
44WHITESPACE 1
45INT_NUMBER 1
46ERROR 1
47IDENT 3
48ERROR 1
49ERROR 1
50WHITESPACE 1
51INT_NUMBER 2
52ERROR 1
53INT_NUMBER 1
54WHITESPACE 1
55INT_NUMBER 1
56ERROR 1
57IDENT 1
58ERROR 1
59INT_NUMBER 1
60WHITESPACE 1
61FLOAT_NUMBER 6
62WHITESPACE 1
diff --git a/validation.md b/validation.md
index 3706760ba..b21ffebd5 100644
--- a/validation.md
+++ b/validation.md
@@ -1,5 +1,7 @@
1Fixmes: 1Fixmes:
2 2
3* Fix `is_whitespace`, add more test 3* Fix `is_whitespace`, add more tests
4* Add more thorough tests for idents for XID_Start & XID_Continue 4* Add more thorough tests for idents for XID_Start & XID_Continue
5* Validate that float and integer literals use digits only of the appropriate
6 base, and are in range
5 7