diff options
Diffstat (limited to 'crates/ra_syntax/src/lexer.rs')
-rw-r--r-- | crates/ra_syntax/src/lexer.rs | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/lexer.rs b/crates/ra_syntax/src/lexer.rs new file mode 100644 index 000000000..f388da273 --- /dev/null +++ b/crates/ra_syntax/src/lexer.rs | |||
@@ -0,0 +1,213 @@ | |||
1 | mod classes; | ||
2 | mod comments; | ||
3 | mod numbers; | ||
4 | mod ptr; | ||
5 | mod strings; | ||
6 | |||
7 | use crate::{ | ||
8 | SyntaxKind::{self, *}, | ||
9 | TextUnit, | ||
10 | }; | ||
11 | |||
12 | use self::{ | ||
13 | classes::*, | ||
14 | comments::{scan_comment, scan_shebang}, | ||
15 | numbers::scan_number, | ||
16 | ptr::Ptr, | ||
17 | strings::{ | ||
18 | is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, | ||
19 | }, | ||
20 | }; | ||
21 | |||
22 | /// A token of Rust source. | ||
23 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||
24 | pub struct Token { | ||
25 | /// The kind of token. | ||
26 | pub kind: SyntaxKind, | ||
27 | /// The length of the token. | ||
28 | pub len: TextUnit, | ||
29 | } | ||
30 | |||
31 | /// Break a string up into its component tokens | ||
32 | pub fn tokenize(text: &str) -> Vec<Token> { | ||
33 | let mut text = text; | ||
34 | let mut acc = Vec::new(); | ||
35 | while !text.is_empty() { | ||
36 | let token = next_token(text); | ||
37 | acc.push(token); | ||
38 | let len: u32 = token.len.into(); | ||
39 | text = &text[len as usize..]; | ||
40 | } | ||
41 | acc | ||
42 | } | ||
43 | |||
44 | /// Get the next token from a string | ||
45 | pub fn next_token(text: &str) -> Token { | ||
46 | assert!(!text.is_empty()); | ||
47 | let mut ptr = Ptr::new(text); | ||
48 | let c = ptr.bump().unwrap(); | ||
49 | let kind = next_token_inner(c, &mut ptr); | ||
50 | let len = ptr.into_len(); | ||
51 | Token { kind, len } | ||
52 | } | ||
53 | |||
54 | fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
55 | if is_whitespace(c) { | ||
56 | ptr.bump_while(is_whitespace); | ||
57 | return WHITESPACE; | ||
58 | } | ||
59 | |||
60 | match c { | ||
61 | '#' => { | ||
62 | if scan_shebang(ptr) { | ||
63 | return SHEBANG; | ||
64 | } | ||
65 | } | ||
66 | '/' => { | ||
67 | if let Some(kind) = scan_comment(ptr) { | ||
68 | return kind; | ||
69 | } | ||
70 | } | ||
71 | _ => (), | ||
72 | } | ||
73 | |||
74 | let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1)); | ||
75 | if ident_start { | ||
76 | return scan_ident(c, ptr); | ||
77 | } | ||
78 | |||
79 | if is_dec_digit(c) { | ||
80 | let kind = scan_number(c, ptr); | ||
81 | scan_literal_suffix(ptr); | ||
82 | return kind; | ||
83 | } | ||
84 | |||
85 | // One-byte tokens. | ||
86 | if let Some(kind) = SyntaxKind::from_char(c) { | ||
87 | return kind; | ||
88 | } | ||
89 | |||
90 | match c { | ||
91 | // Multi-byte tokens. | ||
92 | '.' => { | ||
93 | return match (ptr.current(), ptr.nth(1)) { | ||
94 | (Some('.'), Some('.')) => { | ||
95 | ptr.bump(); | ||
96 | ptr.bump(); | ||
97 | DOTDOTDOT | ||
98 | } | ||
99 | (Some('.'), Some('=')) => { | ||
100 | ptr.bump(); | ||
101 | ptr.bump(); | ||
102 | DOTDOTEQ | ||
103 | } | ||
104 | (Some('.'), _) => { | ||
105 | ptr.bump(); | ||
106 | DOTDOT | ||
107 | } | ||
108 | _ => DOT, | ||
109 | }; | ||
110 | } | ||
111 | ':' => { | ||
112 | return match ptr.current() { | ||
113 | Some(':') => { | ||
114 | ptr.bump(); | ||
115 | COLONCOLON | ||
116 | } | ||
117 | _ => COLON, | ||
118 | }; | ||
119 | } | ||
120 | '=' => { | ||
121 | return match ptr.current() { | ||
122 | Some('=') => { | ||
123 | ptr.bump(); | ||
124 | EQEQ | ||
125 | } | ||
126 | Some('>') => { | ||
127 | ptr.bump(); | ||
128 | FAT_ARROW | ||
129 | } | ||
130 | _ => EQ, | ||
131 | }; | ||
132 | } | ||
133 | '!' => { | ||
134 | return match ptr.current() { | ||
135 | Some('=') => { | ||
136 | ptr.bump(); | ||
137 | NEQ | ||
138 | } | ||
139 | _ => EXCL, | ||
140 | }; | ||
141 | } | ||
142 | '-' => { | ||
143 | return if ptr.at('>') { | ||
144 | ptr.bump(); | ||
145 | THIN_ARROW | ||
146 | } else { | ||
147 | MINUS | ||
148 | }; | ||
149 | } | ||
150 | |||
151 | // If the character is an ident start not followed by another single | ||
152 | // quote, then this is a lifetime name: | ||
153 | '\'' => { | ||
154 | return if ptr.at_p(is_ident_start) && !ptr.at_str("''") { | ||
155 | ptr.bump(); | ||
156 | while ptr.at_p(is_ident_continue) { | ||
157 | ptr.bump(); | ||
158 | } | ||
159 | // lifetimes shouldn't end with a single quote | ||
160 | // if we find one, then this is an invalid character literal | ||
161 | if ptr.at('\'') { | ||
162 | ptr.bump(); | ||
163 | return CHAR; // TODO: error reporting | ||
164 | } | ||
165 | LIFETIME | ||
166 | } else { | ||
167 | scan_char(ptr); | ||
168 | scan_literal_suffix(ptr); | ||
169 | CHAR | ||
170 | }; | ||
171 | } | ||
172 | 'b' => { | ||
173 | let kind = scan_byte_char_or_string(ptr); | ||
174 | scan_literal_suffix(ptr); | ||
175 | return kind; | ||
176 | } | ||
177 | '"' => { | ||
178 | scan_string(ptr); | ||
179 | scan_literal_suffix(ptr); | ||
180 | return STRING; | ||
181 | } | ||
182 | 'r' => { | ||
183 | scan_raw_string(ptr); | ||
184 | scan_literal_suffix(ptr); | ||
185 | return RAW_STRING; | ||
186 | } | ||
187 | _ => (), | ||
188 | } | ||
189 | ERROR | ||
190 | } | ||
191 | |||
192 | fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
193 | let is_single_letter = match ptr.current() { | ||
194 | None => true, | ||
195 | Some(c) if !is_ident_continue(c) => true, | ||
196 | _ => false, | ||
197 | }; | ||
198 | if is_single_letter { | ||
199 | return if c == '_' { UNDERSCORE } else { IDENT }; | ||
200 | } | ||
201 | ptr.bump_while(is_ident_continue); | ||
202 | if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { | ||
203 | return kind; | ||
204 | } | ||
205 | IDENT | ||
206 | } | ||
207 | |||
208 | fn scan_literal_suffix(ptr: &mut Ptr) { | ||
209 | if ptr.at_p(is_ident_start) { | ||
210 | ptr.bump(); | ||
211 | } | ||
212 | ptr.bump_while(is_ident_continue); | ||
213 | } | ||