diff options
Diffstat (limited to 'crates/libsyntax2/src/lexer/mod.rs')
-rw-r--r-- | crates/libsyntax2/src/lexer/mod.rs | 209 |
1 files changed, 0 insertions, 209 deletions
diff --git a/crates/libsyntax2/src/lexer/mod.rs b/crates/libsyntax2/src/lexer/mod.rs deleted file mode 100644 index 3e11db88b..000000000 --- a/crates/libsyntax2/src/lexer/mod.rs +++ /dev/null | |||
@@ -1,209 +0,0 @@ | |||
1 | mod classes; | ||
2 | mod comments; | ||
3 | mod numbers; | ||
4 | mod ptr; | ||
5 | mod strings; | ||
6 | |||
7 | use { | ||
8 | SyntaxKind::{self, *}, | ||
9 | TextUnit, | ||
10 | }; | ||
11 | |||
12 | use self::{ | ||
13 | classes::*, | ||
14 | comments::{scan_comment, scan_shebang}, | ||
15 | numbers::scan_number, | ||
16 | ptr::Ptr, | ||
17 | strings::{ | ||
18 | is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, | ||
19 | }, | ||
20 | }; | ||
21 | |||
22 | /// A token of Rust source. | ||
23 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||
24 | pub struct Token { | ||
25 | /// The kind of token. | ||
26 | pub kind: SyntaxKind, | ||
27 | /// The length of the token. | ||
28 | pub len: TextUnit, | ||
29 | } | ||
30 | |||
31 | /// Break a string up into its component tokens | ||
32 | pub fn tokenize(text: &str) -> Vec<Token> { | ||
33 | let mut text = text; | ||
34 | let mut acc = Vec::new(); | ||
35 | while !text.is_empty() { | ||
36 | let token = next_token(text); | ||
37 | acc.push(token); | ||
38 | let len: u32 = token.len.into(); | ||
39 | text = &text[len as usize..]; | ||
40 | } | ||
41 | acc | ||
42 | } | ||
43 | |||
44 | /// Get the next token from a string | ||
45 | pub fn next_token(text: &str) -> Token { | ||
46 | assert!(!text.is_empty()); | ||
47 | let mut ptr = Ptr::new(text); | ||
48 | let c = ptr.bump().unwrap(); | ||
49 | let kind = next_token_inner(c, &mut ptr); | ||
50 | let len = ptr.into_len(); | ||
51 | Token { kind, len } | ||
52 | } | ||
53 | |||
54 | fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
55 | if is_whitespace(c) { | ||
56 | ptr.bump_while(is_whitespace); | ||
57 | return WHITESPACE; | ||
58 | } | ||
59 | |||
60 | match c { | ||
61 | '#' => if scan_shebang(ptr) { | ||
62 | return SHEBANG; | ||
63 | }, | ||
64 | '/' => if let Some(kind) = scan_comment(ptr) { | ||
65 | return kind; | ||
66 | }, | ||
67 | _ => (), | ||
68 | } | ||
69 | |||
70 | let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1)); | ||
71 | if ident_start { | ||
72 | return scan_ident(c, ptr); | ||
73 | } | ||
74 | |||
75 | if is_dec_digit(c) { | ||
76 | let kind = scan_number(c, ptr); | ||
77 | scan_literal_suffix(ptr); | ||
78 | return kind; | ||
79 | } | ||
80 | |||
81 | // One-byte tokens. | ||
82 | if let Some(kind) = SyntaxKind::from_char(c) { | ||
83 | return kind; | ||
84 | } | ||
85 | |||
86 | match c { | ||
87 | // Multi-byte tokens. | ||
88 | '.' => { | ||
89 | return match (ptr.current(), ptr.nth(1)) { | ||
90 | (Some('.'), Some('.')) => { | ||
91 | ptr.bump(); | ||
92 | ptr.bump(); | ||
93 | DOTDOTDOT | ||
94 | } | ||
95 | (Some('.'), Some('=')) => { | ||
96 | ptr.bump(); | ||
97 | ptr.bump(); | ||
98 | DOTDOTEQ | ||
99 | } | ||
100 | (Some('.'), _) => { | ||
101 | ptr.bump(); | ||
102 | DOTDOT | ||
103 | } | ||
104 | _ => DOT, | ||
105 | }; | ||
106 | } | ||
107 | ':' => { | ||
108 | return match ptr.current() { | ||
109 | Some(':') => { | ||
110 | ptr.bump(); | ||
111 | COLONCOLON | ||
112 | } | ||
113 | _ => COLON, | ||
114 | }; | ||
115 | } | ||
116 | '=' => { | ||
117 | return match ptr.current() { | ||
118 | Some('=') => { | ||
119 | ptr.bump(); | ||
120 | EQEQ | ||
121 | } | ||
122 | Some('>') => { | ||
123 | ptr.bump(); | ||
124 | FAT_ARROW | ||
125 | } | ||
126 | _ => EQ, | ||
127 | }; | ||
128 | } | ||
129 | '!' => { | ||
130 | return match ptr.current() { | ||
131 | Some('=') => { | ||
132 | ptr.bump(); | ||
133 | NEQ | ||
134 | } | ||
135 | _ => EXCL, | ||
136 | }; | ||
137 | } | ||
138 | '-' => { | ||
139 | return if ptr.at('>') { | ||
140 | ptr.bump(); | ||
141 | THIN_ARROW | ||
142 | } else { | ||
143 | MINUS | ||
144 | }; | ||
145 | } | ||
146 | |||
147 | // If the character is an ident start not followed by another single | ||
148 | // quote, then this is a lifetime name: | ||
149 | '\'' => { | ||
150 | return if ptr.at_p(is_ident_start) && !ptr.at_str("''") { | ||
151 | ptr.bump(); | ||
152 | while ptr.at_p(is_ident_continue) { | ||
153 | ptr.bump(); | ||
154 | } | ||
155 | // lifetimes shouldn't end with a single quote | ||
156 | // if we find one, then this is an invalid character literal | ||
157 | if ptr.at('\'') { | ||
158 | ptr.bump(); | ||
159 | return CHAR; // TODO: error reporting | ||
160 | } | ||
161 | LIFETIME | ||
162 | } else { | ||
163 | scan_char(ptr); | ||
164 | scan_literal_suffix(ptr); | ||
165 | CHAR | ||
166 | }; | ||
167 | } | ||
168 | 'b' => { | ||
169 | let kind = scan_byte_char_or_string(ptr); | ||
170 | scan_literal_suffix(ptr); | ||
171 | return kind; | ||
172 | } | ||
173 | '"' => { | ||
174 | scan_string(ptr); | ||
175 | scan_literal_suffix(ptr); | ||
176 | return STRING; | ||
177 | } | ||
178 | 'r' => { | ||
179 | scan_raw_string(ptr); | ||
180 | scan_literal_suffix(ptr); | ||
181 | return RAW_STRING; | ||
182 | } | ||
183 | _ => (), | ||
184 | } | ||
185 | ERROR | ||
186 | } | ||
187 | |||
188 | fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
189 | let is_single_letter = match ptr.current() { | ||
190 | None => true, | ||
191 | Some(c) if !is_ident_continue(c) => true, | ||
192 | _ => false, | ||
193 | }; | ||
194 | if is_single_letter { | ||
195 | return if c == '_' { UNDERSCORE } else { IDENT }; | ||
196 | } | ||
197 | ptr.bump_while(is_ident_continue); | ||
198 | if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { | ||
199 | return kind; | ||
200 | } | ||
201 | IDENT | ||
202 | } | ||
203 | |||
204 | fn scan_literal_suffix(ptr: &mut Ptr) { | ||
205 | if ptr.at_p(is_ident_start) { | ||
206 | ptr.bump(); | ||
207 | } | ||
208 | ptr.bump_while(is_ident_continue); | ||
209 | } | ||