Lexer: start numbers

author: Aleksey Kladov <[email protected]> 2017-12-30 12:22:40 +0000
committer: Aleksey Kladov <[email protected]> 2017-12-30 12:23:38 +0000
commit: ddc637c16120fb352183698f635fc93a68580f7b (patch)
tree: 288f1497551f2667af693157f2451be40c25d697
parent: 8103772a10f00378c4dcdd09f9af310c23146933 (diff)
8 files changed, 176 insertions, 12 deletions
diff --git a/grammar.ron b/grammar.ron
index 49b9c527c..a86fe693f 100644
--- a/grammar.ron
+++ b/grammar.ron
@@ -4,5 +4,7 @@ Grammar(
        "IDENT",
        "UNDERSCORE",
        "WHITESPACE",
+        "INT_NUMBER",
+        "FLOAT_NUMBER",
    ]
 )
 \ No newline at end of file
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs
index 7cc050bde..4235d2648 100644
--- a/src/lexer/classes.rs
+++ b/src/lexer/classes.rs
@@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool {
    //https://github.com/behnam/rust-unic/issues/192
    c.is_whitespace()
 }
+pub fn is_dec_digit(c: char) -> bool {
+    '0' <= c && c <= '9'
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 83a411cdd..afbbee4d0 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
    // They are not identifiers, and are handled further down.
    let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
    if ident_start {
-        let is_single_letter = match ptr.next() {
+        return scan_ident(c, ptr);
-            None => true,
-            Some(c) if !is_ident_continue(c) => true,
-            _ => false,
-        };
-        if is_single_letter {
-            return if c == '_' { UNDERSCORE } else { IDENT };
-        }
-        ptr.bump_while(is_ident_continue);
-        return IDENT;
    }
    if is_whitespace(c) {
@@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
        return WHITESPACE;
    }
+    if is_dec_digit(c) {
+        return scan_number(c, ptr);
+    }
    ERROR
 }
+fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    let is_single_letter = match ptr.next() {
+        None => true,
+        Some(c) if !is_ident_continue(c) => true,
+        _ => false,
+    };
+    if is_single_letter {
+        return if c == '_' { UNDERSCORE } else { IDENT };
+    }
+    ptr.bump_while(is_ident_continue);
+    IDENT
+}
+fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    if c == '0' {
+        match ptr.next().unwrap_or('\0') {
+            'b' | 'o' => {
+                ptr.bump();
+                scan_digits(ptr, false);
+            }
+            'x' => {
+                ptr.bump();
+                scan_digits(ptr, true);
+            }
+            '0'...'9' | '_' | '.' | 'e' | 'E' => {
+                scan_digits(ptr, true);
+            }
+            _ => return INT_NUMBER,
+        }
+    } else {
+        scan_digits(ptr, false);
+    }
+    // might be a float, but don't be greedy if this is actually an
+    // integer literal followed by field/method access or a range pattern
+    // (`0..2` and `12.foo()`)
+    if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) {
+        // might have stuff after the ., and if it does, it needs to start
+        // with a number
+        ptr.bump();
+        scan_digits(ptr, false);
+        scan_float_exponent(ptr);
+        return FLOAT_NUMBER;
+    }
+    // it might be a float if it has an exponent
+    if ptr.next_is('e') || ptr.next_is('E') {
+        scan_float_exponent(ptr);
+        return FLOAT_NUMBER;
+    }
+    INT_NUMBER
+}
+fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
+    while let Some(c) = ptr.next() {
+        match c {
+            '_' | '0'...'9' => {
+                ptr.bump();
+            }
+            'a'...'f' | 'A' ... 'F' if allow_hex => {
+                ptr.bump();
+            }
+            _ => return
+        }
+    }
+}
+fn scan_float_exponent(ptr: &mut Ptr) {
+    if ptr.next_is('e') || ptr.next_is('E') {
+        ptr.bump();
+        if ptr.next_is('-') || ptr.next_is('+') {
+            ptr.bump();
+        }
+        scan_digits(ptr, false);
+    }
+}
 fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match (c, c1, c2) {
        ('r', Some('"'), _) |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
index e8aa6f37b..d441b826b 100644
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@@ -26,6 +26,18 @@ impl<'s> Ptr<'s> {
        chars.next()
    }
+    pub fn next_is(&self, c: char) -> bool {
+        self.next() == Some(c)
+    }
+    pub fn nnext_is(&self, c: char) -> bool {
+        self.nnext() == Some(c)
+    }
+    pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
+        self.nnext().map(p) == Some(true)
+    }
    pub fn bump(&mut self) -> Option<char> {
        let ch = self.chars().next()?;
        self.len += TextUnit::len_of_char(ch);
diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs
index b9b47a2ed..bd1265bde 100644
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0);
 pub const IDENT: SyntaxKind = SyntaxKind(1);
 pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
 pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
+pub const INT_NUMBER: SyntaxKind = SyntaxKind(4);
+pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5);
-static INFOS: [SyntaxInfo; 4] = [
+static INFOS: [SyntaxInfo; 6] = [
    SyntaxInfo { name: "ERROR" },
    SyntaxInfo { name: "IDENT" },
    SyntaxInfo { name: "UNDERSCORE" },
    SyntaxInfo { name: "WHITESPACE" },
+    SyntaxInfo { name: "INT_NUMBER" },
+    SyntaxInfo { name: "FLOAT_NUMBER" },
 ];
 pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {
diff --git a/tests/data/lexer/0004_number.rs b/tests/data/lexer/0004_number.rs
new file mode 100644
index 000000000..af53ff2cd
--- /dev/null
+++ b/tests/data/lexer/0004_number.rs
@@ -0,0 +1,7 @@
+0 0b 0o 0x 00 0_ 0. 0e 0E 0z
+01790 0b1790 0o1790 0x1790aAbBcCdDeEfF 001279 0_1279 0.1279 0e1279 0E1279
+0..2
+0.foo()
+0e+1
+0.e+1
+0.0E-2
diff --git a/tests/data/lexer/0004_number.txt b/tests/data/lexer/0004_number.txt
new file mode 100644
index 000000000..e9ad8410d
--- /dev/null
+++ b/tests/data/lexer/0004_number.txt
@@ -0,0 +1,62 @@
+INT_NUMBER 1
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+FLOAT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 1
+IDENT 1
+WHITESPACE 1
+INT_NUMBER 5
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 18
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+FLOAT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 1
+ERROR 1
+ERROR 1
+INT_NUMBER 1
+WHITESPACE 1
+INT_NUMBER 1
+ERROR 1
+IDENT 3
+ERROR 1
+ERROR 1
+WHITESPACE 1
+INT_NUMBER 2
+ERROR 1
+INT_NUMBER 1
+WHITESPACE 1
+INT_NUMBER 1
+ERROR 1
+IDENT 1
+ERROR 1
+INT_NUMBER 1
+WHITESPACE 1
+FLOAT_NUMBER 6
+WHITESPACE 1
diff --git a/validation.md b/validation.md
index 3706760ba..b21ffebd5 100644
--- a/validation.md
+++ b/validation.md
@@ -1,5 +1,7 @@
 Fixmes:
-* Fix `is_whitespace`, add more test
+* Fix `is_whitespace`, add more tests
 * Add more thorough tests for idents for XID_Start & XID_Continue
+* Validate that float and integer literals use digits only of the appropriate
+  base, and are in range
author	Aleksey Kladov <[email protected]>	2017-12-30 12:22:40 +0000
committer	Aleksey Kladov <[email protected]>	2017-12-30 12:23:38 +0000
commit	ddc637c16120fb352183698f635fc93a68580f7b (patch)
tree	288f1497551f2667af693157f2451be40c25d697
parent	8103772a10f00378c4dcdd09f9af310c23146933 (diff)

diff --git a/grammar.ron b/grammar.ron index 49b9c527c..a86fe693f 100644 --- a/grammar.ron +++ b/grammar.ron
@@ -4,5 +4,7 @@ Grammar(
4	"IDENT",	4	"IDENT",
5	"UNDERSCORE",	5	"UNDERSCORE",
6	"WHITESPACE",	6	"WHITESPACE",
		7	"INT_NUMBER",
		8	"FLOAT_NUMBER",
7	]	9	]
8	) \ No newline at end of file	10	) \ No newline at end of file


diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs index 7cc050bde..4235d2648 100644 --- a/src/lexer/classes.rs +++ b/src/lexer/classes.rs
@@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool {
20	//https://github.com/behnam/rust-unic/issues/192	20	//https://github.com/behnam/rust-unic/issues/192
21	c.is_whitespace()	21	c.is_whitespace()
22	}	22	}
		23
		24	pub fn is_dec_digit(c: char) -> bool {
		25	'0' <= c && c <= '9'
		26	}


diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 83a411cdd..afbbee4d0 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs
@@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
22	// They are not identifiers, and are handled further down.	22	// They are not identifiers, and are handled further down.
23	let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());	23	let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
24	if ident_start {	24	if ident_start {
25	let is_single_letter = match ptr.next() {	25	return scan_ident(c, ptr);
26	None => true,
27	Some(c) if !is_ident_continue(c) => true,
28	_ => false,
29	};
30	if is_single_letter {
31	return if c == '_' { UNDERSCORE } else { IDENT };
32	}
33	ptr.bump_while(is_ident_continue);
34	return IDENT;
35	}	26	}
36		27
37	if is_whitespace(c) {	28	if is_whitespace(c) {
@@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
39	return WHITESPACE;	30	return WHITESPACE;
40	}	31	}
41		32
		33	if is_dec_digit(c) {
		34	return scan_number(c, ptr);
		35	}
		36
42	ERROR	37	ERROR
43	}	38	}
44		39
		40	fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
		41	let is_single_letter = match ptr.next() {
		42	None => true,
		43	Some(c) if !is_ident_continue(c) => true,
		44	_ => false,
		45	};
		46	if is_single_letter {
		47	return if c == '_' { UNDERSCORE } else { IDENT };
		48	}
		49	ptr.bump_while(is_ident_continue);
		50	IDENT
		51	}
		52
		53	fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
		54	if c == '0' {
		55	match ptr.next().unwrap_or('\0') {
		56	'b' \| 'o' => {
		57	ptr.bump();
		58	scan_digits(ptr, false);
		59	}
		60	'x' => {
		61	ptr.bump();
		62	scan_digits(ptr, true);
		63	}
		64	'0'...'9' \| '_' \| '.' \| 'e' \| 'E' => {
		65	scan_digits(ptr, true);
		66	}
		67	_ => return INT_NUMBER,
		68	}
		69	} else {
		70	scan_digits(ptr, false);
		71	}
		72
		73	// might be a float, but don't be greedy if this is actually an
		74	// integer literal followed by field/method access or a range pattern
		75	// (`0..2` and `12.foo()`)
		76	if ptr.next_is('.') && !(ptr.nnext_is('.') \|\| ptr.nnext_is_p(is_ident_start)) {
		77	// might have stuff after the ., and if it does, it needs to start
		78	// with a number
		79	ptr.bump();
		80	scan_digits(ptr, false);
		81	scan_float_exponent(ptr);
		82	return FLOAT_NUMBER;
		83	}
		84	// it might be a float if it has an exponent
		85	if ptr.next_is('e') \|\| ptr.next_is('E') {
		86	scan_float_exponent(ptr);
		87	return FLOAT_NUMBER;
		88	}
		89	INT_NUMBER
		90	}
		91
		92	fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
		93	while let Some(c) = ptr.next() {
		94	match c {
		95	'_' \| '0'...'9' => {
		96	ptr.bump();
		97	}
		98	'a'...'f' \| 'A' ... 'F' if allow_hex => {
		99	ptr.bump();
		100	}
		101	_ => return
		102	}
		103	}
		104	}
		105
		106	fn scan_float_exponent(ptr: &mut Ptr) {
		107	if ptr.next_is('e') \|\| ptr.next_is('E') {
		108	ptr.bump();
		109	if ptr.next_is('-') \|\| ptr.next_is('+') {
		110	ptr.bump();
		111	}
		112	scan_digits(ptr, false);
		113	}
		114	}
		115
45	fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {	116	fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
46	match (c, c1, c2) {	117	match (c, c1, c2) {
47	('r', Some('"'), _) \|	118	('r', Some('"'), _) \|


diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index e8aa6f37b..d441b826b 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs
@@ -26,6 +26,18 @@ impl<'s> Ptr<'s> {
26	chars.next()	26	chars.next()
27	}	27	}
28		28
		29	pub fn next_is(&self, c: char) -> bool {
		30	self.next() == Some(c)
		31	}
		32
		33	pub fn nnext_is(&self, c: char) -> bool {
		34	self.nnext() == Some(c)
		35	}
		36
		37	pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
		38	self.nnext().map(p) == Some(true)
		39	}
		40
29	pub fn bump(&mut self) -> Option<char> {	41	pub fn bump(&mut self) -> Option<char> {
30	let ch = self.chars().next()?;	42	let ch = self.chars().next()?;
31	self.len += TextUnit::len_of_char(ch);	43	self.len += TextUnit::len_of_char(ch);


diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index b9b47a2ed..bd1265bde 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs
@@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0);
5	pub const IDENT: SyntaxKind = SyntaxKind(1);	5	pub const IDENT: SyntaxKind = SyntaxKind(1);
6	pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);	6	pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
7	pub const WHITESPACE: SyntaxKind = SyntaxKind(3);	7	pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
		8	pub const INT_NUMBER: SyntaxKind = SyntaxKind(4);
		9	pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5);
8		10
9	static INFOS: [SyntaxInfo; 4] = [	11	static INFOS: [SyntaxInfo; 6] = [
10	SyntaxInfo { name: "ERROR" },	12	SyntaxInfo { name: "ERROR" },
11	SyntaxInfo { name: "IDENT" },	13	SyntaxInfo { name: "IDENT" },
12	SyntaxInfo { name: "UNDERSCORE" },	14	SyntaxInfo { name: "UNDERSCORE" },
13	SyntaxInfo { name: "WHITESPACE" },	15	SyntaxInfo { name: "WHITESPACE" },
		16	SyntaxInfo { name: "INT_NUMBER" },
		17	SyntaxInfo { name: "FLOAT_NUMBER" },
14	];	18	];
15		19
16	pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {	20	pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {


diff --git a/tests/data/lexer/0004_number.rs b/tests/data/lexer/0004_number.rs new file mode 100644 index 000000000..af53ff2cd --- /dev/null +++ b/tests/data/lexer/0004_number.rs
@@ -0,0 +1,7 @@
		1	0 0b 0o 0x 00 0_ 0. 0e 0E 0z
		2	01790 0b1790 0o1790 0x1790aAbBcCdDeEfF 001279 0_1279 0.1279 0e1279 0E1279
		3	0..2
		4	0.foo()
		5	0e+1
		6	0.e+1
		7	0.0E-2


diff --git a/tests/data/lexer/0004_number.txt b/tests/data/lexer/0004_number.txt new file mode 100644 index 000000000..e9ad8410d --- /dev/null +++ b/tests/data/lexer/0004_number.txt
@@ -0,0 +1,62 @@
		1	INT_NUMBER 1
		2	WHITESPACE 1
		3	INT_NUMBER 2
		4	WHITESPACE 1
		5	INT_NUMBER 2
		6	WHITESPACE 1
		7	INT_NUMBER 2
		8	WHITESPACE 1
		9	INT_NUMBER 2
		10	WHITESPACE 1
		11	INT_NUMBER 2
		12	WHITESPACE 1
		13	FLOAT_NUMBER 2
		14	WHITESPACE 1
		15	INT_NUMBER 2
		16	WHITESPACE 1
		17	INT_NUMBER 2
		18	WHITESPACE 1
		19	INT_NUMBER 1
		20	IDENT 1
		21	WHITESPACE 1
		22	INT_NUMBER 5
		23	WHITESPACE 1
		24	INT_NUMBER 6
		25	WHITESPACE 1
		26	INT_NUMBER 6
		27	WHITESPACE 1
		28	INT_NUMBER 18
		29	WHITESPACE 1
		30	INT_NUMBER 6
		31	WHITESPACE 1
		32	INT_NUMBER 6
		33	WHITESPACE 1
		34	FLOAT_NUMBER 6
		35	WHITESPACE 1
		36	INT_NUMBER 6
		37	WHITESPACE 1
		38	INT_NUMBER 6
		39	WHITESPACE 1
		40	INT_NUMBER 1
		41	ERROR 1
		42	ERROR 1
		43	INT_NUMBER 1
		44	WHITESPACE 1
		45	INT_NUMBER 1
		46	ERROR 1
		47	IDENT 3
		48	ERROR 1
		49	ERROR 1
		50	WHITESPACE 1
		51	INT_NUMBER 2
		52	ERROR 1
		53	INT_NUMBER 1
		54	WHITESPACE 1
		55	INT_NUMBER 1
		56	ERROR 1
		57	IDENT 1
		58	ERROR 1
		59	INT_NUMBER 1
		60	WHITESPACE 1
		61	FLOAT_NUMBER 6
		62	WHITESPACE 1


diff --git a/validation.md b/validation.md index 3706760ba..b21ffebd5 100644 --- a/validation.md +++ b/validation.md
@@ -1,5 +1,7 @@
1	Fixmes:	1	Fixmes:
2		2
3	* Fix `is_whitespace`, add more test	3	* Fix `is_whitespace`, add more tests
4	* Add more thorough tests for idents for XID_Start & XID_Continue	4	* Add more thorough tests for idents for XID_Start & XID_Continue
		5	* Validate that float and integer literals use digits only of the appropriate
		6	base, and are in range
5		7