about summary refs log tree commit diff
path: root/crates/ra_syntax/src/parsing/lexer.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/ra_syntax/src/parsing/lexer.rs')
-rw-r--r-- crates/ra_syntax/src/parsing/lexer.rs 215
1 files changed, 215 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
new file mode 100644
index 000000000..f9362120e
--- /dev/null
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -0,0 +1,215 @@
1mod classes;
2mod comments;
3mod numbers;
4mod ptr;
5mod strings;
6
7use crate::{
8 SyntaxKind::{self, *},
9 TextUnit,
10};
11
12use self::{
13 classes::*,
14 comments::{scan_comment, scan_shebang},
15 numbers::scan_number,
16 ptr::Ptr,
17 strings::{
18 is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string,
19 },
20};
21
22/// A token of Rust source.
23#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
24pub struct Token {
25 /// The kind of token.
26 pub kind: SyntaxKind,
27 /// The length of the token.
28 pub len: TextUnit,
29}
30
31/// Break a string up into its component tokens
32pub fn tokenize(text: &str) -> Vec<Token> {
33 let mut text = text;
34 let mut acc = Vec::new();
35 while !text.is_empty() {
36 let token = next_token(text);
37 acc.push(token);
38 let len: u32 = token.len.into();
39 text = &text[len as usize..];
40 }
41 acc
42}
43
44/// Get the next token from a string
45pub fn next_token(text: &str) -> Token {
46 assert!(!text.is_empty());
47 let mut ptr = Ptr::new(text);
48 let c = ptr.bump().unwrap();
49 let kind = next_token_inner(c, &mut ptr);
50 let len = ptr.into_len();
51 Token { kind, len }
52}
53
54fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
55 if is_whitespace(c) {
56 ptr.bump_while(is_whitespace);
57 return WHITESPACE;
58 }
59
60 match c {
61 '#' => {
62 if scan_shebang(ptr) {
63 return SHEBANG;
64 }
65 }
66 '/' => {
67 if let Some(kind) = scan_comment(ptr) {
68 return kind;
69 }
70 }
71 _ => (),
72 }
73
74 let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1));
75 if ident_start {
76 return scan_ident(c, ptr);
77 }
78
79 if is_dec_digit(c) {
80 let kind = scan_number(c, ptr);
81 scan_literal_suffix(ptr);
82 return kind;
83 }
84
85 // One-byte tokens.
86 if let Some(kind) = SyntaxKind::from_char(c) {
87 return kind;
88 }
89
90 match c {
91 // Multi-byte tokens.
92 '.' => {
93 return match (ptr.current(), ptr.nth(1)) {
94 (Some('.'), Some('.')) => {
95 ptr.bump();
96 ptr.bump();
97 DOTDOTDOT
98 }
99 (Some('.'), Some('=')) => {
100 ptr.bump();
101 ptr.bump();
102 DOTDOTEQ
103 }
104 (Some('.'), _) => {
105 ptr.bump();
106 DOTDOT
107 }
108 _ => DOT,
109 };
110 }
111 ':' => {
112 return match ptr.current() {
113 Some(':') => {
114 ptr.bump();
115 COLONCOLON
116 }
117 _ => COLON,
118 };
119 }
120 '=' => {
121 return match ptr.current() {
122 Some('=') => {
123 ptr.bump();
124 EQEQ
125 }
126 Some('>') => {
127 ptr.bump();
128 FAT_ARROW
129 }
130 _ => EQ,
131 };
132 }
133 '!' => {
134 return match ptr.current() {
135 Some('=') => {
136 ptr.bump();
137 NEQ
138 }
139 _ => EXCL,
140 };
141 }
142 '-' => {
143 return if ptr.at('>') {
144 ptr.bump();
145 THIN_ARROW
146 } else {
147 MINUS
148 };
149 }
150
151 // If the character is an ident start not followed by another single
152 // quote, then this is a lifetime name:
153 '\'' => {
154 return if ptr.at_p(is_ident_start) && !ptr.at_str("''") {
155 ptr.bump();
156 while ptr.at_p(is_ident_continue) {
157 ptr.bump();
158 }
159 // lifetimes shouldn't end with a single quote
160 // if we find one, then this is an invalid character literal
161 if ptr.at('\'') {
162 ptr.bump();
163 return CHAR;
164 }
165 LIFETIME
166 } else {
167 scan_char(ptr);
168 scan_literal_suffix(ptr);
169 CHAR
170 };
171 }
172 'b' => {
173 let kind = scan_byte_char_or_string(ptr);
174 scan_literal_suffix(ptr);
175 return kind;
176 }
177 '"' => {
178 scan_string(ptr);
179 scan_literal_suffix(ptr);
180 return STRING;
181 }
182 'r' => {
183 scan_raw_string(ptr);
184 scan_literal_suffix(ptr);
185 return RAW_STRING;
186 }
187 _ => (),
188 }
189 ERROR
190}
191
192fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
193 let is_raw = match (c, ptr.current()) {
194 ('r', Some('#')) => {
195 ptr.bump();
196 true
197 }
198 ('_', Some(c)) if !is_ident_continue(c) => return UNDERSCORE,
199 _ => false,
200 };
201 ptr.bump_while(is_ident_continue);
202 if !is_raw {
203 if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) {
204 return kind;
205 }
206 }
207 IDENT
208}
209
210fn scan_literal_suffix(ptr: &mut Ptr) {
211 if ptr.at_p(is_ident_start) {
212 ptr.bump();
213 }
214 ptr.bump_while(is_ident_continue);
215}