diff options
author | Aleksey Kladov <[email protected]> | 2018-08-10 20:33:29 +0100 |
---|---|---|
committer | Aleksey Kladov <[email protected]> | 2018-08-10 20:33:29 +0100 |
commit | 7c67612b8a894187fa3b64725531a5459f9211bf (patch) | |
tree | 9e2a536efa0c880d921fd8d4d74423afc9451fd4 /src/lexer | |
parent | 26262aaf05983c5b7f41cc438e287523268fe1eb (diff) |
organizize
Diffstat (limited to 'src/lexer')
-rw-r--r-- | src/lexer/classes.rs | 26 | ||||
-rw-r--r-- | src/lexer/comments.rs | 57 | ||||
-rw-r--r-- | src/lexer/mod.rs | 209 | ||||
-rw-r--r-- | src/lexer/numbers.rs | 67 | ||||
-rw-r--r-- | src/lexer/ptr.rs | 74 | ||||
-rw-r--r-- | src/lexer/strings.rs | 106 |
6 files changed, 0 insertions, 539 deletions
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs deleted file mode 100644 index 4235d2648..000000000 --- a/src/lexer/classes.rs +++ /dev/null | |||
@@ -1,26 +0,0 @@ | |||
1 | use unicode_xid::UnicodeXID; | ||
2 | |||
3 | pub fn is_ident_start(c: char) -> bool { | ||
4 | (c >= 'a' && c <= 'z') | ||
5 | || (c >= 'A' && c <= 'Z') | ||
6 | || c == '_' | ||
7 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) | ||
8 | } | ||
9 | |||
10 | pub fn is_ident_continue(c: char) -> bool { | ||
11 | (c >= 'a' && c <= 'z') | ||
12 | || (c >= 'A' && c <= 'Z') | ||
13 | || (c >= '0' && c <= '9') | ||
14 | || c == '_' | ||
15 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) | ||
16 | } | ||
17 | |||
18 | pub fn is_whitespace(c: char) -> bool { | ||
19 | //FIXME: use is_pattern_whitespace | ||
20 | //https://github.com/behnam/rust-unic/issues/192 | ||
21 | c.is_whitespace() | ||
22 | } | ||
23 | |||
24 | pub fn is_dec_digit(c: char) -> bool { | ||
25 | '0' <= c && c <= '9' | ||
26 | } | ||
diff --git a/src/lexer/comments.rs b/src/lexer/comments.rs deleted file mode 100644 index 01acb6515..000000000 --- a/src/lexer/comments.rs +++ /dev/null | |||
@@ -1,57 +0,0 @@ | |||
1 | use lexer::ptr::Ptr; | ||
2 | |||
3 | use SyntaxKind::{self, *}; | ||
4 | |||
5 | pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { | ||
6 | if ptr.next_is('!') && ptr.nnext_is('/') { | ||
7 | ptr.bump(); | ||
8 | ptr.bump(); | ||
9 | bump_until_eol(ptr); | ||
10 | true | ||
11 | } else { | ||
12 | false | ||
13 | } | ||
14 | } | ||
15 | |||
16 | fn scan_block_comment(ptr: &mut Ptr) -> Option<SyntaxKind> { | ||
17 | if ptr.next_is('*') { | ||
18 | ptr.bump(); | ||
19 | let mut depth: u32 = 1; | ||
20 | while depth > 0 { | ||
21 | if ptr.next_is('*') && ptr.nnext_is('/') { | ||
22 | depth -= 1; | ||
23 | ptr.bump(); | ||
24 | ptr.bump(); | ||
25 | } else if ptr.next_is('/') && ptr.nnext_is('*') { | ||
26 | depth += 1; | ||
27 | ptr.bump(); | ||
28 | ptr.bump(); | ||
29 | } else if ptr.bump().is_none() { | ||
30 | break; | ||
31 | } | ||
32 | } | ||
33 | Some(COMMENT) | ||
34 | } else { | ||
35 | None | ||
36 | } | ||
37 | } | ||
38 | |||
39 | pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option<SyntaxKind> { | ||
40 | if ptr.next_is('/') { | ||
41 | bump_until_eol(ptr); | ||
42 | Some(COMMENT) | ||
43 | } else { | ||
44 | scan_block_comment(ptr) | ||
45 | } | ||
46 | } | ||
47 | |||
48 | fn bump_until_eol(ptr: &mut Ptr) { | ||
49 | loop { | ||
50 | if ptr.next_is('\n') || ptr.next_is('\r') && ptr.nnext_is('\n') { | ||
51 | return; | ||
52 | } | ||
53 | if ptr.bump().is_none() { | ||
54 | break; | ||
55 | } | ||
56 | } | ||
57 | } | ||
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs deleted file mode 100644 index f8fdc41ac..000000000 --- a/src/lexer/mod.rs +++ /dev/null | |||
@@ -1,209 +0,0 @@ | |||
1 | mod classes; | ||
2 | mod comments; | ||
3 | mod numbers; | ||
4 | mod ptr; | ||
5 | mod strings; | ||
6 | |||
7 | use { | ||
8 | SyntaxKind::{self, *}, | ||
9 | TextUnit, | ||
10 | }; | ||
11 | |||
12 | use self::{ | ||
13 | classes::*, | ||
14 | comments::{scan_comment, scan_shebang}, | ||
15 | numbers::scan_number, | ||
16 | ptr::Ptr, | ||
17 | strings::{ | ||
18 | is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, | ||
19 | }, | ||
20 | }; | ||
21 | |||
22 | /// A token of Rust source. | ||
23 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||
24 | pub struct Token { | ||
25 | /// The kind of token. | ||
26 | pub kind: SyntaxKind, | ||
27 | /// The length of the token. | ||
28 | pub len: TextUnit, | ||
29 | } | ||
30 | |||
31 | /// Break a string up into its component tokens | ||
32 | pub fn tokenize(text: &str) -> Vec<Token> { | ||
33 | let mut text = text; | ||
34 | let mut acc = Vec::new(); | ||
35 | while !text.is_empty() { | ||
36 | let token = next_token(text); | ||
37 | acc.push(token); | ||
38 | let len: u32 = token.len.into(); | ||
39 | text = &text[len as usize..]; | ||
40 | } | ||
41 | acc | ||
42 | } | ||
43 | |||
44 | /// Get the next token from a string | ||
45 | pub fn next_token(text: &str) -> Token { | ||
46 | assert!(!text.is_empty()); | ||
47 | let mut ptr = Ptr::new(text); | ||
48 | let c = ptr.bump().unwrap(); | ||
49 | let kind = next_token_inner(c, &mut ptr); | ||
50 | let len = ptr.into_len(); | ||
51 | Token { kind, len } | ||
52 | } | ||
53 | |||
54 | fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
55 | if is_whitespace(c) { | ||
56 | ptr.bump_while(is_whitespace); | ||
57 | return WHITESPACE; | ||
58 | } | ||
59 | |||
60 | match c { | ||
61 | '#' => if scan_shebang(ptr) { | ||
62 | return SHEBANG; | ||
63 | }, | ||
64 | '/' => if let Some(kind) = scan_comment(ptr) { | ||
65 | return kind; | ||
66 | }, | ||
67 | _ => (), | ||
68 | } | ||
69 | |||
70 | let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.next(), ptr.nnext()); | ||
71 | if ident_start { | ||
72 | return scan_ident(c, ptr); | ||
73 | } | ||
74 | |||
75 | if is_dec_digit(c) { | ||
76 | let kind = scan_number(c, ptr); | ||
77 | scan_literal_suffix(ptr); | ||
78 | return kind; | ||
79 | } | ||
80 | |||
81 | // One-byte tokens. | ||
82 | if let Some(kind) = SyntaxKind::from_char(c) { | ||
83 | return kind; | ||
84 | } | ||
85 | |||
86 | match c { | ||
87 | // Multi-byte tokens. | ||
88 | '.' => { | ||
89 | return match (ptr.next(), ptr.nnext()) { | ||
90 | (Some('.'), Some('.')) => { | ||
91 | ptr.bump(); | ||
92 | ptr.bump(); | ||
93 | DOTDOTDOT | ||
94 | } | ||
95 | (Some('.'), Some('=')) => { | ||
96 | ptr.bump(); | ||
97 | ptr.bump(); | ||
98 | DOTDOTEQ | ||
99 | } | ||
100 | (Some('.'), _) => { | ||
101 | ptr.bump(); | ||
102 | DOTDOT | ||
103 | } | ||
104 | _ => DOT, | ||
105 | }; | ||
106 | } | ||
107 | ':' => { | ||
108 | return match ptr.next() { | ||
109 | Some(':') => { | ||
110 | ptr.bump(); | ||
111 | COLONCOLON | ||
112 | } | ||
113 | _ => COLON, | ||
114 | }; | ||
115 | } | ||
116 | '=' => { | ||
117 | return match ptr.next() { | ||
118 | Some('=') => { | ||
119 | ptr.bump(); | ||
120 | EQEQ | ||
121 | } | ||
122 | Some('>') => { | ||
123 | ptr.bump(); | ||
124 | FAT_ARROW | ||
125 | } | ||
126 | _ => EQ, | ||
127 | }; | ||
128 | } | ||
129 | '!' => { | ||
130 | return match ptr.next() { | ||
131 | Some('=') => { | ||
132 | ptr.bump(); | ||
133 | NEQ | ||
134 | } | ||
135 | _ => EXCL, | ||
136 | }; | ||
137 | } | ||
138 | '-' => { | ||
139 | return if ptr.next_is('>') { | ||
140 | ptr.bump(); | ||
141 | THIN_ARROW | ||
142 | } else { | ||
143 | MINUS | ||
144 | }; | ||
145 | } | ||
146 | |||
147 | // If the character is an ident start not followed by another single | ||
148 | // quote, then this is a lifetime name: | ||
149 | '\'' => { | ||
150 | return if ptr.next_is_p(is_ident_start) && !ptr.nnext_is('\'') { | ||
151 | ptr.bump(); | ||
152 | while ptr.next_is_p(is_ident_continue) { | ||
153 | ptr.bump(); | ||
154 | } | ||
155 | // lifetimes shouldn't end with a single quote | ||
156 | // if we find one, then this is an invalid character literal | ||
157 | if ptr.next_is('\'') { | ||
158 | ptr.bump(); | ||
159 | return CHAR; // TODO: error reporting | ||
160 | } | ||
161 | LIFETIME | ||
162 | } else { | ||
163 | scan_char(ptr); | ||
164 | scan_literal_suffix(ptr); | ||
165 | CHAR | ||
166 | }; | ||
167 | } | ||
168 | 'b' => { | ||
169 | let kind = scan_byte_char_or_string(ptr); | ||
170 | scan_literal_suffix(ptr); | ||
171 | return kind; | ||
172 | } | ||
173 | '"' => { | ||
174 | scan_string(ptr); | ||
175 | scan_literal_suffix(ptr); | ||
176 | return STRING; | ||
177 | } | ||
178 | 'r' => { | ||
179 | scan_raw_string(ptr); | ||
180 | scan_literal_suffix(ptr); | ||
181 | return RAW_STRING; | ||
182 | } | ||
183 | _ => (), | ||
184 | } | ||
185 | ERROR | ||
186 | } | ||
187 | |||
188 | fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
189 | let is_single_letter = match ptr.next() { | ||
190 | None => true, | ||
191 | Some(c) if !is_ident_continue(c) => true, | ||
192 | _ => false, | ||
193 | }; | ||
194 | if is_single_letter { | ||
195 | return if c == '_' { UNDERSCORE } else { IDENT }; | ||
196 | } | ||
197 | ptr.bump_while(is_ident_continue); | ||
198 | if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { | ||
199 | return kind; | ||
200 | } | ||
201 | IDENT | ||
202 | } | ||
203 | |||
204 | fn scan_literal_suffix(ptr: &mut Ptr) { | ||
205 | if ptr.next_is_p(is_ident_start) { | ||
206 | ptr.bump(); | ||
207 | } | ||
208 | ptr.bump_while(is_ident_continue); | ||
209 | } | ||
diff --git a/src/lexer/numbers.rs b/src/lexer/numbers.rs deleted file mode 100644 index 5c4641a2d..000000000 --- a/src/lexer/numbers.rs +++ /dev/null | |||
@@ -1,67 +0,0 @@ | |||
1 | use lexer::classes::*; | ||
2 | use lexer::ptr::Ptr; | ||
3 | |||
4 | use SyntaxKind::{self, *}; | ||
5 | |||
6 | pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
7 | if c == '0' { | ||
8 | match ptr.next().unwrap_or('\0') { | ||
9 | 'b' | 'o' => { | ||
10 | ptr.bump(); | ||
11 | scan_digits(ptr, false); | ||
12 | } | ||
13 | 'x' => { | ||
14 | ptr.bump(); | ||
15 | scan_digits(ptr, true); | ||
16 | } | ||
17 | '0'...'9' | '_' | '.' | 'e' | 'E' => { | ||
18 | scan_digits(ptr, true); | ||
19 | } | ||
20 | _ => return INT_NUMBER, | ||
21 | } | ||
22 | } else { | ||
23 | scan_digits(ptr, false); | ||
24 | } | ||
25 | |||
26 | // might be a float, but don't be greedy if this is actually an | ||
27 | // integer literal followed by field/method access or a range pattern | ||
28 | // (`0..2` and `12.foo()`) | ||
29 | if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) { | ||
30 | // might have stuff after the ., and if it does, it needs to start | ||
31 | // with a number | ||
32 | ptr.bump(); | ||
33 | scan_digits(ptr, false); | ||
34 | scan_float_exponent(ptr); | ||
35 | return FLOAT_NUMBER; | ||
36 | } | ||
37 | // it might be a float if it has an exponent | ||
38 | if ptr.next_is('e') || ptr.next_is('E') { | ||
39 | scan_float_exponent(ptr); | ||
40 | return FLOAT_NUMBER; | ||
41 | } | ||
42 | INT_NUMBER | ||
43 | } | ||
44 | |||
45 | fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { | ||
46 | while let Some(c) = ptr.next() { | ||
47 | match c { | ||
48 | '_' | '0'...'9' => { | ||
49 | ptr.bump(); | ||
50 | } | ||
51 | 'a'...'f' | 'A'...'F' if allow_hex => { | ||
52 | ptr.bump(); | ||
53 | } | ||
54 | _ => return, | ||
55 | } | ||
56 | } | ||
57 | } | ||
58 | |||
59 | fn scan_float_exponent(ptr: &mut Ptr) { | ||
60 | if ptr.next_is('e') || ptr.next_is('E') { | ||
61 | ptr.bump(); | ||
62 | if ptr.next_is('-') || ptr.next_is('+') { | ||
63 | ptr.bump(); | ||
64 | } | ||
65 | scan_digits(ptr, false); | ||
66 | } | ||
67 | } | ||
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs deleted file mode 100644 index d1391fd5f..000000000 --- a/src/lexer/ptr.rs +++ /dev/null | |||
@@ -1,74 +0,0 @@ | |||
1 | use TextUnit; | ||
2 | |||
3 | use std::str::Chars; | ||
4 | |||
5 | pub(crate) struct Ptr<'s> { | ||
6 | text: &'s str, | ||
7 | len: TextUnit, | ||
8 | } | ||
9 | |||
10 | impl<'s> Ptr<'s> { | ||
11 | pub fn new(text: &'s str) -> Ptr<'s> { | ||
12 | Ptr { | ||
13 | text, | ||
14 | len: 0.into(), | ||
15 | } | ||
16 | } | ||
17 | |||
18 | pub fn into_len(self) -> TextUnit { | ||
19 | self.len | ||
20 | } | ||
21 | |||
22 | pub fn next(&self) -> Option<char> { | ||
23 | self.chars().next() | ||
24 | } | ||
25 | |||
26 | pub fn nnext(&self) -> Option<char> { | ||
27 | let mut chars = self.chars(); | ||
28 | chars.next()?; | ||
29 | chars.next() | ||
30 | } | ||
31 | |||
32 | pub fn next_is(&self, c: char) -> bool { | ||
33 | self.next() == Some(c) | ||
34 | } | ||
35 | |||
36 | pub fn nnext_is(&self, c: char) -> bool { | ||
37 | self.nnext() == Some(c) | ||
38 | } | ||
39 | |||
40 | pub fn next_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { | ||
41 | self.next().map(p) == Some(true) | ||
42 | } | ||
43 | |||
44 | pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { | ||
45 | self.nnext().map(p) == Some(true) | ||
46 | } | ||
47 | |||
48 | pub fn bump(&mut self) -> Option<char> { | ||
49 | let ch = self.chars().next()?; | ||
50 | self.len += TextUnit::of_char(ch); | ||
51 | Some(ch) | ||
52 | } | ||
53 | |||
54 | pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) { | ||
55 | loop { | ||
56 | match self.next() { | ||
57 | Some(c) if pred(c) => { | ||
58 | self.bump(); | ||
59 | } | ||
60 | _ => return, | ||
61 | } | ||
62 | } | ||
63 | } | ||
64 | |||
65 | pub fn current_token_text(&self) -> &str { | ||
66 | let len: u32 = self.len.into(); | ||
67 | &self.text[..len as usize] | ||
68 | } | ||
69 | |||
70 | fn chars(&self) -> Chars { | ||
71 | let len: u32 = self.len.into(); | ||
72 | self.text[len as usize..].chars() | ||
73 | } | ||
74 | } | ||
diff --git a/src/lexer/strings.rs b/src/lexer/strings.rs deleted file mode 100644 index e3704fbb3..000000000 --- a/src/lexer/strings.rs +++ /dev/null | |||
@@ -1,106 +0,0 @@ | |||
1 | use SyntaxKind::{self, *}; | ||
2 | |||
3 | use lexer::ptr::Ptr; | ||
4 | |||
5 | pub(crate) fn is_string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { | ||
6 | match (c, c1, c2) { | ||
7 | ('r', Some('"'), _) | ||
8 | | ('r', Some('#'), _) | ||
9 | | ('b', Some('"'), _) | ||
10 | | ('b', Some('\''), _) | ||
11 | | ('b', Some('r'), Some('"')) | ||
12 | | ('b', Some('r'), Some('#')) => true, | ||
13 | _ => false, | ||
14 | } | ||
15 | } | ||
16 | |||
17 | pub(crate) fn scan_char(ptr: &mut Ptr) { | ||
18 | if ptr.bump().is_none() { | ||
19 | return; // TODO: error reporting is upper in the stack | ||
20 | } | ||
21 | scan_char_or_byte(ptr); | ||
22 | if !ptr.next_is('\'') { | ||
23 | return; // TODO: error reporting | ||
24 | } | ||
25 | ptr.bump(); | ||
26 | } | ||
27 | |||
28 | pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { | ||
29 | // unwrapping and not-exhaustive match are ok | ||
30 | // because of string_literal_start | ||
31 | let c = ptr.bump().unwrap(); | ||
32 | match c { | ||
33 | '\'' => { | ||
34 | scan_byte(ptr); | ||
35 | BYTE | ||
36 | } | ||
37 | '"' => { | ||
38 | scan_byte_string(ptr); | ||
39 | BYTE_STRING | ||
40 | } | ||
41 | 'r' => { | ||
42 | scan_raw_byte_string(ptr); | ||
43 | RAW_BYTE_STRING | ||
44 | } | ||
45 | _ => unreachable!(), | ||
46 | } | ||
47 | } | ||
48 | |||
49 | pub(crate) fn scan_string(ptr: &mut Ptr) { | ||
50 | while let Some(c) = ptr.bump() { | ||
51 | if c == '"' { | ||
52 | return; | ||
53 | } | ||
54 | } | ||
55 | } | ||
56 | |||
57 | pub(crate) fn scan_raw_string(ptr: &mut Ptr) { | ||
58 | if !ptr.next_is('"') { | ||
59 | return; | ||
60 | } | ||
61 | ptr.bump(); | ||
62 | |||
63 | while let Some(c) = ptr.bump() { | ||
64 | if c == '"' { | ||
65 | return; | ||
66 | } | ||
67 | } | ||
68 | } | ||
69 | |||
70 | fn scan_byte(ptr: &mut Ptr) { | ||
71 | if ptr.next_is('\'') { | ||
72 | ptr.bump(); | ||
73 | return; | ||
74 | } | ||
75 | ptr.bump(); | ||
76 | if ptr.next_is('\'') { | ||
77 | ptr.bump(); | ||
78 | return; | ||
79 | } | ||
80 | } | ||
81 | |||
82 | fn scan_byte_string(ptr: &mut Ptr) { | ||
83 | while let Some(c) = ptr.bump() { | ||
84 | if c == '"' { | ||
85 | return; | ||
86 | } | ||
87 | } | ||
88 | } | ||
89 | |||
90 | fn scan_raw_byte_string(ptr: &mut Ptr) { | ||
91 | if !ptr.next_is('"') { | ||
92 | return; | ||
93 | } | ||
94 | ptr.bump(); | ||
95 | |||
96 | while let Some(c) = ptr.bump() { | ||
97 | if c == '"' { | ||
98 | return; | ||
99 | } | ||
100 | } | ||
101 | } | ||
102 | |||
103 | fn scan_char_or_byte(ptr: &mut Ptr) { | ||
104 | //FIXME: deal with escape sequencies | ||
105 | ptr.bump(); | ||
106 | } | ||