aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/parsing/lexer.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/ra_syntax/src/parsing/lexer.rs')
-rw-r--r--crates/ra_syntax/src/parsing/lexer.rs271
1 files changed, 120 insertions, 151 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index 60cf37047..2a4343b0a 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -1,22 +1,6 @@
1mod classes;
2mod comments;
3mod numbers;
4mod ptr;
5mod strings;
6
7use crate::{ 1use crate::{
8 SyntaxKind::{self, *}, 2 SyntaxKind::{self, *},
9 TextUnit, T, 3 TextUnit,
10};
11
12use self::{
13 classes::*,
14 comments::{scan_comment, scan_shebang},
15 numbers::scan_number,
16 ptr::Ptr,
17 strings::{
18 is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string,
19 },
20}; 4};
21 5
22/// A token of Rust source. 6/// A token of Rust source.
@@ -30,149 +14,134 @@ pub struct Token {
30 14
31/// Break a string up into its component tokens 15/// Break a string up into its component tokens
32pub fn tokenize(text: &str) -> Vec<Token> { 16pub fn tokenize(text: &str) -> Vec<Token> {
17 if text.is_empty() {
18 return vec![];
19 }
33 let mut text = text; 20 let mut text = text;
34 let mut acc = Vec::new(); 21 let mut acc = Vec::new();
35 while !text.is_empty() { 22 if let Some(len) = ra_rustc_lexer::strip_shebang(text) {
36 let token = next_token(text); 23 acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) });
37 acc.push(token); 24 text = &text[len..];
38 let len: u32 = token.len.into();
39 text = &text[len as usize..];
40 }
41 acc
42}
43
44/// Get the next token from a string
45pub fn next_token(text: &str) -> Token {
46 assert!(!text.is_empty());
47 let mut ptr = Ptr::new(text);
48 let c = ptr.bump().unwrap();
49 let kind = next_token_inner(c, &mut ptr);
50 let len = ptr.into_len();
51 Token { kind, len }
52}
53
54fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
55 if is_whitespace(c) {
56 ptr.bump_while(is_whitespace);
57 return WHITESPACE;
58 } 25 }
59 26 while !text.is_empty() {
60 match c { 27 let rustc_token = ra_rustc_lexer::first_token(text);
61 '#' => { 28 macro_rules! decompose {
62 if scan_shebang(ptr) { 29 ($t1:expr, $t2:expr) => {{
63 return SHEBANG; 30 acc.push(Token { kind: $t1, len: 1.into() });
64 } 31 acc.push(Token { kind: $t2, len: 1.into() });
65 } 32 text = &text[2..];
66 '/' => { 33 continue;
67 if let Some(kind) = scan_comment(ptr) { 34 }};
68 return kind; 35 ($t1:expr, $t2:expr, $t3:expr) => {{
69 } 36 acc.push(Token { kind: $t1, len: 1.into() });
37 acc.push(Token { kind: $t2, len: 1.into() });
38 acc.push(Token { kind: $t3, len: 1.into() });
39 text = &text[3..];
40 continue;
41 }};
70 } 42 }
71 _ => (), 43 let kind = match rustc_token.kind {
72 } 44 ra_rustc_lexer::TokenKind::LineComment => COMMENT,
73 45 ra_rustc_lexer::TokenKind::BlockComment { .. } => COMMENT,
74 let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1)); 46 ra_rustc_lexer::TokenKind::Whitespace => WHITESPACE,
75 if ident_start { 47 ra_rustc_lexer::TokenKind::Ident => {
76 return scan_ident(c, ptr); 48 let token_text = &text[..rustc_token.len];
77 } 49 if token_text == "_" {
78 50 UNDERSCORE
79 if is_dec_digit(c) { 51 } else {
80 let kind = scan_number(c, ptr); 52 SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT)
81 scan_literal_suffix(ptr);
82 return kind;
83 }
84
85 // One-byte tokens.
86 if let Some(kind) = SyntaxKind::from_char(c) {
87 return kind;
88 }
89
90 match c {
91 // Possiblily multi-byte tokens,
92 // but we only produce single byte token now
93 // T![...], T![..], T![..=], T![.]
94 '.' => return T![.],
95 // T![::] T![:]
96 ':' => return T![:],
97 // T![==] FATARROW T![=]
98 '=' => return T![=],
99 // T![!=] T![!]
100 '!' => return T![!],
101 // T![->] T![-]
102 '-' => return T![-],
103
104 // If the character is an ident start not followed by another single
105 // quote, then this is a lifetime name:
106 '\'' => {
107 return if ptr.at_p(is_ident_start) && !ptr.at_str("''") {
108 ptr.bump();
109 while ptr.at_p(is_ident_continue) {
110 ptr.bump();
111 } 53 }
112 // lifetimes shouldn't end with a single quote 54 }
113 // if we find one, then this is an invalid character literal 55 ra_rustc_lexer::TokenKind::RawIdent => IDENT,
114 if ptr.at('\'') { 56 ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind {
115 ptr.bump(); 57 ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER,
116 return CHAR; 58 ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER,
117 } 59 ra_rustc_lexer::LiteralKind::Char { .. } => CHAR,
118 LIFETIME 60 ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE,
119 } else { 61 ra_rustc_lexer::LiteralKind::Str { .. } => STRING,
120 scan_char(ptr); 62 ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING,
121 scan_literal_suffix(ptr); 63 ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING,
122 CHAR 64 ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING,
123 }; 65 },
124 } 66 ra_rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME,
125 'b' => { 67 ra_rustc_lexer::TokenKind::Semi => SEMI,
126 let kind = scan_byte_char_or_string(ptr); 68 ra_rustc_lexer::TokenKind::Comma => COMMA,
127 scan_literal_suffix(ptr); 69 ra_rustc_lexer::TokenKind::DotDotDot => decompose!(DOT, DOT, DOT),
128 return kind; 70 ra_rustc_lexer::TokenKind::DotDotEq => decompose!(DOT, DOT, EQ),
129 } 71 ra_rustc_lexer::TokenKind::DotDot => decompose!(DOT, DOT),
130 '"' => { 72 ra_rustc_lexer::TokenKind::Dot => DOT,
131 scan_string(ptr); 73 ra_rustc_lexer::TokenKind::OpenParen => L_PAREN,
132 scan_literal_suffix(ptr); 74 ra_rustc_lexer::TokenKind::CloseParen => R_PAREN,
133 return STRING; 75 ra_rustc_lexer::TokenKind::OpenBrace => L_CURLY,
134 } 76 ra_rustc_lexer::TokenKind::CloseBrace => R_CURLY,
135 'r' => { 77 ra_rustc_lexer::TokenKind::OpenBracket => L_BRACK,
136 scan_raw_string(ptr); 78 ra_rustc_lexer::TokenKind::CloseBracket => R_BRACK,
137 scan_literal_suffix(ptr); 79 ra_rustc_lexer::TokenKind::At => AT,
138 return RAW_STRING; 80 ra_rustc_lexer::TokenKind::Pound => POUND,
139 } 81 ra_rustc_lexer::TokenKind::Tilde => TILDE,
140 _ => (), 82 ra_rustc_lexer::TokenKind::Question => QUESTION,
141 } 83 ra_rustc_lexer::TokenKind::ColonColon => decompose!(COLON, COLON),
142 ERROR 84 ra_rustc_lexer::TokenKind::Colon => COLON,
143} 85 ra_rustc_lexer::TokenKind::Dollar => DOLLAR,
144 86 ra_rustc_lexer::TokenKind::EqEq => decompose!(EQ, EQ),
145fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { 87 ra_rustc_lexer::TokenKind::Eq => EQ,
146 let is_raw = match (c, ptr.current()) { 88 ra_rustc_lexer::TokenKind::FatArrow => decompose!(EQ, R_ANGLE),
147 ('r', Some('#')) => { 89 ra_rustc_lexer::TokenKind::Ne => decompose!(EXCL, EQ),
148 ptr.bump(); 90 ra_rustc_lexer::TokenKind::Not => EXCL,
149 true 91 ra_rustc_lexer::TokenKind::Le => decompose!(L_ANGLE, EQ),
150 } 92 ra_rustc_lexer::TokenKind::LArrow => decompose!(COLON, MINUS),
151 ('_', None) => return T![_], 93 ra_rustc_lexer::TokenKind::Lt => L_ANGLE,
152 ('_', Some(c)) if !is_ident_continue(c) => return T![_], 94 ra_rustc_lexer::TokenKind::ShlEq => decompose!(L_ANGLE, L_ANGLE, EQ),
153 _ => false, 95 ra_rustc_lexer::TokenKind::Shl => decompose!(L_ANGLE, L_ANGLE),
154 }; 96 ra_rustc_lexer::TokenKind::Ge => decompose!(R_ANGLE, EQ),
155 ptr.bump_while(is_ident_continue); 97 ra_rustc_lexer::TokenKind::Gt => R_ANGLE,
156 if !is_raw { 98 ra_rustc_lexer::TokenKind::ShrEq => decompose!(R_ANGLE, R_ANGLE, EQ),
157 if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { 99 ra_rustc_lexer::TokenKind::Shr => decompose!(R_ANGLE, R_ANGLE),
158 return kind; 100 ra_rustc_lexer::TokenKind::RArrow => decompose!(MINUS, R_ANGLE),
159 } 101 ra_rustc_lexer::TokenKind::Minus => MINUS,
160 } 102 ra_rustc_lexer::TokenKind::MinusEq => decompose!(MINUS, EQ),
161 IDENT 103 ra_rustc_lexer::TokenKind::And => AMP,
162} 104 ra_rustc_lexer::TokenKind::AndAnd => decompose!(AMP, AMP),
163 105 ra_rustc_lexer::TokenKind::AndEq => decompose!(AMP, EQ),
164fn scan_literal_suffix(ptr: &mut Ptr) { 106 ra_rustc_lexer::TokenKind::Or => PIPE,
165 if ptr.at_p(is_ident_start) { 107 ra_rustc_lexer::TokenKind::OrOr => decompose!(PIPE, PIPE),
166 ptr.bump(); 108 ra_rustc_lexer::TokenKind::OrEq => decompose!(PIPE, EQ),
109 ra_rustc_lexer::TokenKind::PlusEq => decompose!(PLUS, EQ),
110 ra_rustc_lexer::TokenKind::Plus => PLUS,
111 ra_rustc_lexer::TokenKind::StarEq => decompose!(STAR, EQ),
112 ra_rustc_lexer::TokenKind::Star => STAR,
113 ra_rustc_lexer::TokenKind::SlashEq => decompose!(SLASH, EQ),
114 ra_rustc_lexer::TokenKind::Slash => SLASH,
115 ra_rustc_lexer::TokenKind::CaretEq => decompose!(CARET, EQ),
116 ra_rustc_lexer::TokenKind::Caret => CARET,
117 ra_rustc_lexer::TokenKind::PercentEq => decompose!(PERCENT, EQ),
118 ra_rustc_lexer::TokenKind::Percent => PERCENT,
119 ra_rustc_lexer::TokenKind::Unknown => ERROR,
120 };
121 let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) };
122 acc.push(token);
123 text = &text[rustc_token.len..];
167 } 124 }
168 ptr.bump_while(is_ident_continue); 125 acc
169} 126}
170 127
171pub fn classify_literal(text: &str) -> Option<Token> { 128pub fn classify_literal(text: &str) -> Option<Token> {
172 let tkn = next_token(text); 129 let t = ra_rustc_lexer::first_token(text);
173 if !tkn.kind.is_literal() || tkn.len.to_usize() != text.len() { 130 if t.len != text.len() {
174 return None; 131 return None;
175 } 132 }
176 133 let kind = match t.kind {
177 Some(tkn) 134 ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind {
135 ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER,
136 ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER,
137 ra_rustc_lexer::LiteralKind::Char { .. } => CHAR,
138 ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE,
139 ra_rustc_lexer::LiteralKind::Str { .. } => STRING,
140 ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING,
141 ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING,
142 ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING,
143 },
144 _ => return None,
145 };
146 Some(Token { kind, len: TextUnit::from_usize(t.len) })
178} 147}