Lexer: start numbers

author: Aleksey Kladov <[email protected]> 2017-12-30 12:22:40 +0000
committer: Aleksey Kladov <[email protected]> 2017-12-30 12:23:38 +0000
commit: ddc637c16120fb352183698f635fc93a68580f7b (patch)
tree: 288f1497551f2667af693157f2451be40c25d697 /src
parent: 8103772a10f00378c4dcdd09f9af310c23146933 (diff)
4 files changed, 102 insertions, 11 deletions
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs
index 7cc050bde..4235d2648 100644
--- a/src/lexer/classes.rs
+++ b/src/lexer/classes.rs
@@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool {
    //https://github.com/behnam/rust-unic/issues/192
    c.is_whitespace()
 }
+pub fn is_dec_digit(c: char) -> bool {
+    '0' <= c && c <= '9'
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 83a411cdd..afbbee4d0 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
    // They are not identifiers, and are handled further down.
    let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
    if ident_start {
-        let is_single_letter = match ptr.next() {
+        return scan_ident(c, ptr);
-            None => true,
-            Some(c) if !is_ident_continue(c) => true,
-            _ => false,
-        };
-        if is_single_letter {
-            return if c == '_' { UNDERSCORE } else { IDENT };
-        }
-        ptr.bump_while(is_ident_continue);
-        return IDENT;
    }
    if is_whitespace(c) {
@@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
        return WHITESPACE;
    }
+    if is_dec_digit(c) {
+        return scan_number(c, ptr);
+    }
    ERROR
 }
+fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    let is_single_letter = match ptr.next() {
+        None => true,
+        Some(c) if !is_ident_continue(c) => true,
+        _ => false,
+    };
+    if is_single_letter {
+        return if c == '_' { UNDERSCORE } else { IDENT };
+    }
+    ptr.bump_while(is_ident_continue);
+    IDENT
+}
+fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    if c == '0' {
+        match ptr.next().unwrap_or('\0') {
+            'b' | 'o' => {
+                ptr.bump();
+                scan_digits(ptr, false);
+            }
+            'x' => {
+                ptr.bump();
+                scan_digits(ptr, true);
+            }
+            '0'...'9' | '_' | '.' | 'e' | 'E' => {
+                scan_digits(ptr, true);
+            }
+            _ => return INT_NUMBER,
+        }
+    } else {
+        scan_digits(ptr, false);
+    }
+    // might be a float, but don't be greedy if this is actually an
+    // integer literal followed by field/method access or a range pattern
+    // (`0..2` and `12.foo()`)
+    if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) {
+        // might have stuff after the ., and if it does, it needs to start
+        // with a number
+        ptr.bump();
+        scan_digits(ptr, false);
+        scan_float_exponent(ptr);
+        return FLOAT_NUMBER;
+    }
+    // it might be a float if it has an exponent
+    if ptr.next_is('e') || ptr.next_is('E') {
+        scan_float_exponent(ptr);
+        return FLOAT_NUMBER;
+    }
+    INT_NUMBER
+}
+fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
+    while let Some(c) = ptr.next() {
+        match c {
+            '_' | '0'...'9' => {
+                ptr.bump();
+            }
+            'a'...'f' | 'A' ... 'F' if allow_hex => {
+                ptr.bump();
+            }
+            _ => return
+        }
+    }
+}
+fn scan_float_exponent(ptr: &mut Ptr) {
+    if ptr.next_is('e') || ptr.next_is('E') {
+        ptr.bump();
+        if ptr.next_is('-') || ptr.next_is('+') {
+            ptr.bump();
+        }
+        scan_digits(ptr, false);
+    }
+}
 fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match (c, c1, c2) {
        ('r', Some('"'), _) |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
index e8aa6f37b..d441b826b 100644
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@@ -26,6 +26,18 @@ impl<'s> Ptr<'s> {
        chars.next()
    }
+    pub fn next_is(&self, c: char) -> bool {
+        self.next() == Some(c)
+    }
+    pub fn nnext_is(&self, c: char) -> bool {
+        self.nnext() == Some(c)
+    }
+    pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
+        self.nnext().map(p) == Some(true)
+    }
    pub fn bump(&mut self) -> Option<char> {
        let ch = self.chars().next()?;
        self.len += TextUnit::len_of_char(ch);
diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs
index b9b47a2ed..bd1265bde 100644
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0);
 pub const IDENT: SyntaxKind = SyntaxKind(1);
 pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
 pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
+pub const INT_NUMBER: SyntaxKind = SyntaxKind(4);
+pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5);
-static INFOS: [SyntaxInfo; 4] = [
+static INFOS: [SyntaxInfo; 6] = [
    SyntaxInfo { name: "ERROR" },
    SyntaxInfo { name: "IDENT" },
    SyntaxInfo { name: "UNDERSCORE" },
    SyntaxInfo { name: "WHITESPACE" },
+    SyntaxInfo { name: "INT_NUMBER" },
+    SyntaxInfo { name: "FLOAT_NUMBER" },
 ];
 pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {
author	Aleksey Kladov <[email protected]>	2017-12-30 12:22:40 +0000
committer	Aleksey Kladov <[email protected]>	2017-12-30 12:23:38 +0000
commit	ddc637c16120fb352183698f635fc93a68580f7b (patch)
tree	288f1497551f2667af693157f2451be40c25d697 /src
parent	8103772a10f00378c4dcdd09f9af310c23146933 (diff)

diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs
index 7cc050bde..4235d2648 100644
--- a/src/lexer/classes.rs
+++ b/src/lexer/classes.rs
@@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool {
20	//https://github.com/behnam/rust-unic/issues/192	20	//https://github.com/behnam/rust-unic/issues/192
21	c.is_whitespace()	21	c.is_whitespace()
22	}	22	}
		23
		24	pub fn is_dec_digit(c: char) -> bool {
		25	'0' <= c && c <= '9'
		26	}


diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 83a411cdd..afbbee4d0 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
22	// They are not identifiers, and are handled further down.	22	// They are not identifiers, and are handled further down.
23	let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());	23	let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
24	if ident_start {	24	if ident_start {
25	let is_single_letter = match ptr.next() {	25	return scan_ident(c, ptr);
26	None => true,
27	Some(c) if !is_ident_continue(c) => true,
28	_ => false,
29	};
30	if is_single_letter {
31	return if c == '_' { UNDERSCORE } else { IDENT };
32	}
33	ptr.bump_while(is_ident_continue);
34	return IDENT;
35	}	26	}
36		27
37	if is_whitespace(c) {	28	if is_whitespace(c) {
@@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
39	return WHITESPACE;	30	return WHITESPACE;
40	}	31	}
41		32
		33	if is_dec_digit(c) {
		34	return scan_number(c, ptr);
		35	}
		36
42	ERROR	37	ERROR
43	}	38	}
44		39
		40	fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
		41	let is_single_letter = match ptr.next() {
		42	None => true,
		43	Some(c) if !is_ident_continue(c) => true,
		44	_ => false,
		45	};
		46	if is_single_letter {
		47	return if c == '_' { UNDERSCORE } else { IDENT };
		48	}
		49	ptr.bump_while(is_ident_continue);
		50	IDENT
		51	}
		52
		53	fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
		54	if c == '0' {
		55	match ptr.next().unwrap_or('\0') {
		56	'b' | 'o' => {
		57	ptr.bump();
		58	scan_digits(ptr, false);
		59	}
		60	'x' => {
		61	ptr.bump();
		62	scan_digits(ptr, true);
		63	}
		64	'0'...'9' | '_' | '.' | 'e' | 'E' => {
		65	scan_digits(ptr, true);
		66	}
		67	_ => return INT_NUMBER,
		68	}
		69	} else {
		70	scan_digits(ptr, false);
		71	}
		72
		73	// might be a float, but don't be greedy if this is actually an
		74	// integer literal followed by field/method access or a range pattern
		75	// (`0..2` and `12.foo()`)
		76	if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) {
		77	// might have stuff after the ., and if it does, it needs to start
		78	// with a number
		79	ptr.bump();
		80	scan_digits(ptr, false);
		81	scan_float_exponent(ptr);
		82	return FLOAT_NUMBER;
		83	}
		84	// it might be a float if it has an exponent
		85	if ptr.next_is('e') || ptr.next_is('E') {
		86	scan_float_exponent(ptr);
		87	return FLOAT_NUMBER;
		88	}
		89	INT_NUMBER
		90	}
		91
		92	fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
		93	while let Some(c) = ptr.next() {
		94	match c {
		95	'_' | '0'...'9' => {
		96	ptr.bump();
		97	}
		98	'a'...'f' | 'A' ... 'F' if allow_hex => {
		99	ptr.bump();
		100	}
		101	_ => return
		102	}
		103	}
		104	}
		105
		106	fn scan_float_exponent(ptr: &mut Ptr) {
		107	if ptr.next_is('e') || ptr.next_is('E') {
		108	ptr.bump();
		109	if ptr.next_is('-') || ptr.next_is('+') {
		110	ptr.bump();
		111	}
		112	scan_digits(ptr, false);
		113	}
		114	}
		115
45	fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {	116	fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
46	match (c, c1, c2) {	117	match (c, c1, c2) {
47	('r', Some('"'), _) |	118	('r', Some('"'), _) |


diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
index e8aa6f37b..d441b826b 100644
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@@ -26,6 +26,18 @@ impl<'s> Ptr<'s> {
26	chars.next()	26	chars.next()
27	}	27	}
28		28
		29	pub fn next_is(&self, c: char) -> bool {
		30	self.next() == Some(c)
		31	}
		32
		33	pub fn nnext_is(&self, c: char) -> bool {
		34	self.nnext() == Some(c)
		35	}
		36
		37	pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
		38	self.nnext().map(p) == Some(true)
		39	}
		40
29	pub fn bump(&mut self) -> Option<char> {	41	pub fn bump(&mut self) -> Option<char> {
30	let ch = self.chars().next()?;	42	let ch = self.chars().next()?;
31	self.len += TextUnit::len_of_char(ch);	43	self.len += TextUnit::len_of_char(ch);


diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs
index b9b47a2ed..bd1265bde 100644
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0);
5	pub const IDENT: SyntaxKind = SyntaxKind(1);	5	pub const IDENT: SyntaxKind = SyntaxKind(1);
6	pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);	6	pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
7	pub const WHITESPACE: SyntaxKind = SyntaxKind(3);	7	pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
		8	pub const INT_NUMBER: SyntaxKind = SyntaxKind(4);
		9	pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5);
8		10
9	static INFOS: [SyntaxInfo; 4] = [	11	static INFOS: [SyntaxInfo; 6] = [
10	SyntaxInfo { name: "ERROR" },	12	SyntaxInfo { name: "ERROR" },
11	SyntaxInfo { name: "IDENT" },	13	SyntaxInfo { name: "IDENT" },
12	SyntaxInfo { name: "UNDERSCORE" },	14	SyntaxInfo { name: "UNDERSCORE" },
13	SyntaxInfo { name: "WHITESPACE" },	15	SyntaxInfo { name: "WHITESPACE" },
		16	SyntaxInfo { name: "INT_NUMBER" },
		17	SyntaxInfo { name: "FLOAT_NUMBER" },
14	];	18	];
15		19
16	pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {	20	pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {