diff options
author | bors[bot] <26634292+bors[bot]@users.noreply.github.com> | 2019-07-22 15:59:48 +0100 |
---|---|---|
committer | bors[bot] <26634292+bors[bot]@users.noreply.github.com> | 2019-07-22 15:59:48 +0100 |
commit | 7d0713e8d2500e6f56116965b93b47d0ef552515 (patch) | |
tree | 20ee49ed4ee94e463cd81f3f8142d64cde0ca134 /crates/ra_syntax/src/parsing | |
parent | d690249bc81bc265cb3d1836c2922325f4fdb8af (diff) | |
parent | 700669bbd0ab3ae0c5a56985ce13ca896d342a3a (diff) |
Merge #1575
1575: Use the same lexer as `rustc` r=matklad a=matklad
This is :zap: !
bors r+
Co-authored-by: Aleksey Kladov <[email protected]>
Diffstat (limited to 'crates/ra_syntax/src/parsing')
-rw-r--r-- | crates/ra_syntax/src/parsing/lexer.rs | 271 | ||||
-rw-r--r-- | crates/ra_syntax/src/parsing/lexer/classes.rs | 26 | ||||
-rw-r--r-- | crates/ra_syntax/src/parsing/lexer/comments.rs | 57 | ||||
-rw-r--r-- | crates/ra_syntax/src/parsing/lexer/numbers.rs | 66 | ||||
-rw-r--r-- | crates/ra_syntax/src/parsing/lexer/ptr.rs | 162 | ||||
-rw-r--r-- | crates/ra_syntax/src/parsing/lexer/strings.rs | 112 |
6 files changed, 120 insertions, 574 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs index 60cf37047..2a4343b0a 100644 --- a/crates/ra_syntax/src/parsing/lexer.rs +++ b/crates/ra_syntax/src/parsing/lexer.rs | |||
@@ -1,22 +1,6 @@ | |||
1 | mod classes; | ||
2 | mod comments; | ||
3 | mod numbers; | ||
4 | mod ptr; | ||
5 | mod strings; | ||
6 | |||
7 | use crate::{ | 1 | use crate::{ |
8 | SyntaxKind::{self, *}, | 2 | SyntaxKind::{self, *}, |
9 | TextUnit, T, | 3 | TextUnit, |
10 | }; | ||
11 | |||
12 | use self::{ | ||
13 | classes::*, | ||
14 | comments::{scan_comment, scan_shebang}, | ||
15 | numbers::scan_number, | ||
16 | ptr::Ptr, | ||
17 | strings::{ | ||
18 | is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, | ||
19 | }, | ||
20 | }; | 4 | }; |
21 | 5 | ||
22 | /// A token of Rust source. | 6 | /// A token of Rust source. |
@@ -30,149 +14,134 @@ pub struct Token { | |||
30 | 14 | ||
31 | /// Break a string up into its component tokens | 15 | /// Break a string up into its component tokens |
32 | pub fn tokenize(text: &str) -> Vec<Token> { | 16 | pub fn tokenize(text: &str) -> Vec<Token> { |
17 | if text.is_empty() { | ||
18 | return vec![]; | ||
19 | } | ||
33 | let mut text = text; | 20 | let mut text = text; |
34 | let mut acc = Vec::new(); | 21 | let mut acc = Vec::new(); |
35 | while !text.is_empty() { | 22 | if let Some(len) = ra_rustc_lexer::strip_shebang(text) { |
36 | let token = next_token(text); | 23 | acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) }); |
37 | acc.push(token); | 24 | text = &text[len..]; |
38 | let len: u32 = token.len.into(); | ||
39 | text = &text[len as usize..]; | ||
40 | } | ||
41 | acc | ||
42 | } | ||
43 | |||
44 | /// Get the next token from a string | ||
45 | pub fn next_token(text: &str) -> Token { | ||
46 | assert!(!text.is_empty()); | ||
47 | let mut ptr = Ptr::new(text); | ||
48 | let c = ptr.bump().unwrap(); | ||
49 | let kind = next_token_inner(c, &mut ptr); | ||
50 | let len = ptr.into_len(); | ||
51 | Token { kind, len } | ||
52 | } | ||
53 | |||
54 | fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
55 | if is_whitespace(c) { | ||
56 | ptr.bump_while(is_whitespace); | ||
57 | return WHITESPACE; | ||
58 | } | 25 | } |
59 | 26 | while !text.is_empty() { | |
60 | match c { | 27 | let rustc_token = ra_rustc_lexer::first_token(text); |
61 | '#' => { | 28 | macro_rules! decompose { |
62 | if scan_shebang(ptr) { | 29 | ($t1:expr, $t2:expr) => {{ |
63 | return SHEBANG; | 30 | acc.push(Token { kind: $t1, len: 1.into() }); |
64 | } | 31 | acc.push(Token { kind: $t2, len: 1.into() }); |
65 | } | 32 | text = &text[2..]; |
66 | '/' => { | 33 | continue; |
67 | if let Some(kind) = scan_comment(ptr) { | 34 | }}; |
68 | return kind; | 35 | ($t1:expr, $t2:expr, $t3:expr) => {{ |
69 | } | 36 | acc.push(Token { kind: $t1, len: 1.into() }); |
37 | acc.push(Token { kind: $t2, len: 1.into() }); | ||
38 | acc.push(Token { kind: $t3, len: 1.into() }); | ||
39 | text = &text[3..]; | ||
40 | continue; | ||
41 | }}; | ||
70 | } | 42 | } |
71 | _ => (), | 43 | let kind = match rustc_token.kind { |
72 | } | 44 | ra_rustc_lexer::TokenKind::LineComment => COMMENT, |
73 | 45 | ra_rustc_lexer::TokenKind::BlockComment { .. } => COMMENT, | |
74 | let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1)); | 46 | ra_rustc_lexer::TokenKind::Whitespace => WHITESPACE, |
75 | if ident_start { | 47 | ra_rustc_lexer::TokenKind::Ident => { |
76 | return scan_ident(c, ptr); | 48 | let token_text = &text[..rustc_token.len]; |
77 | } | 49 | if token_text == "_" { |
78 | 50 | UNDERSCORE | |
79 | if is_dec_digit(c) { | 51 | } else { |
80 | let kind = scan_number(c, ptr); | 52 | SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT) |
81 | scan_literal_suffix(ptr); | ||
82 | return kind; | ||
83 | } | ||
84 | |||
85 | // One-byte tokens. | ||
86 | if let Some(kind) = SyntaxKind::from_char(c) { | ||
87 | return kind; | ||
88 | } | ||
89 | |||
90 | match c { | ||
91 | // Possibly multi-byte tokens, | 54 | }
92 | // but we only produce single-byte tokens now | 55 | ra_rustc_lexer::TokenKind::RawIdent => IDENT,
93 | // T![...], T![..], T![..=], T![.] | ||
94 | '.' => return T![.], | ||
95 | // T![::] T![:] | ||
96 | ':' => return T![:], | ||
97 | // T![==] FATARROW T![=] | ||
98 | '=' => return T![=], | ||
99 | // T![!=] T![!] | ||
100 | '!' => return T![!], | ||
101 | // T![->] T![-] | ||
102 | '-' => return T![-], | ||
103 | |||
104 | // If the character is an ident start not followed by another single | ||
105 | // quote, then this is a lifetime name: | ||
106 | '\'' => { | ||
107 | return if ptr.at_p(is_ident_start) && !ptr.at_str("''") { | ||
108 | ptr.bump(); | ||
109 | while ptr.at_p(is_ident_continue) { | ||
110 | ptr.bump(); | ||
111 | } | 53 | } |
112 | // lifetimes shouldn't end with a single quote | 54 | } |
113 | // if we find one, then this is an invalid character literal | 55 | ra_rustc_lexer::TokenKind::RawIdent => IDENT, |
114 | if ptr.at('\'') { | 56 | ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind { |
115 | ptr.bump(); | 57 | ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, |
116 | return CHAR; | 58 | ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, |
117 | } | 59 | ra_rustc_lexer::LiteralKind::Char { .. } => CHAR, |
118 | LIFETIME | 60 | ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE, |
119 | } else { | 61 | ra_rustc_lexer::LiteralKind::Str { .. } => STRING, |
120 | scan_char(ptr); | 62 | ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, |
121 | scan_literal_suffix(ptr); | 63 | ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, |
122 | CHAR | 64 | ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, |
123 | }; | 65 | }, |
124 | } | 66 | ra_rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME, |
125 | 'b' => { | 67 | ra_rustc_lexer::TokenKind::Semi => SEMI, |
126 | let kind = scan_byte_char_or_string(ptr); | 68 | ra_rustc_lexer::TokenKind::Comma => COMMA, |
127 | scan_literal_suffix(ptr); | 69 | ra_rustc_lexer::TokenKind::DotDotDot => decompose!(DOT, DOT, DOT), |
128 | return kind; | 70 | ra_rustc_lexer::TokenKind::DotDotEq => decompose!(DOT, DOT, EQ), |
129 | } | 71 | ra_rustc_lexer::TokenKind::DotDot => decompose!(DOT, DOT), |
130 | '"' => { | 72 | ra_rustc_lexer::TokenKind::Dot => DOT, |
131 | scan_string(ptr); | 73 | ra_rustc_lexer::TokenKind::OpenParen => L_PAREN, |
132 | scan_literal_suffix(ptr); | 74 | ra_rustc_lexer::TokenKind::CloseParen => R_PAREN, |
133 | return STRING; | 75 | ra_rustc_lexer::TokenKind::OpenBrace => L_CURLY, |
134 | } | 76 | ra_rustc_lexer::TokenKind::CloseBrace => R_CURLY, |
135 | 'r' => { | 77 | ra_rustc_lexer::TokenKind::OpenBracket => L_BRACK, |
136 | scan_raw_string(ptr); | 78 | ra_rustc_lexer::TokenKind::CloseBracket => R_BRACK, |
137 | scan_literal_suffix(ptr); | 79 | ra_rustc_lexer::TokenKind::At => AT, |
138 | return RAW_STRING; | 80 | ra_rustc_lexer::TokenKind::Pound => POUND, |
139 | } | 81 | ra_rustc_lexer::TokenKind::Tilde => TILDE, |
140 | _ => (), | 82 | ra_rustc_lexer::TokenKind::Question => QUESTION, |
141 | } | 83 | ra_rustc_lexer::TokenKind::ColonColon => decompose!(COLON, COLON), |
142 | ERROR | 84 | ra_rustc_lexer::TokenKind::Colon => COLON, |
143 | } | 85 | ra_rustc_lexer::TokenKind::Dollar => DOLLAR, |
144 | 86 | ra_rustc_lexer::TokenKind::EqEq => decompose!(EQ, EQ), | |
145 | fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { | 87 | ra_rustc_lexer::TokenKind::Eq => EQ, |
146 | let is_raw = match (c, ptr.current()) { | 88 | ra_rustc_lexer::TokenKind::FatArrow => decompose!(EQ, R_ANGLE), |
147 | ('r', Some('#')) => { | 89 | ra_rustc_lexer::TokenKind::Ne => decompose!(EXCL, EQ), |
148 | ptr.bump(); | 90 | ra_rustc_lexer::TokenKind::Not => EXCL, |
149 | true | 91 | ra_rustc_lexer::TokenKind::Le => decompose!(L_ANGLE, EQ), |
150 | } | 92 | ra_rustc_lexer::TokenKind::LArrow => decompose!(COLON, MINUS), |
151 | ('_', None) => return T![_], | 93 | ra_rustc_lexer::TokenKind::Lt => L_ANGLE, |
152 | ('_', Some(c)) if !is_ident_continue(c) => return T![_], | 94 | ra_rustc_lexer::TokenKind::ShlEq => decompose!(L_ANGLE, L_ANGLE, EQ), |
153 | _ => false, | 95 | ra_rustc_lexer::TokenKind::Shl => decompose!(L_ANGLE, L_ANGLE), |
154 | }; | 96 | ra_rustc_lexer::TokenKind::Ge => decompose!(R_ANGLE, EQ), |
155 | ptr.bump_while(is_ident_continue); | 97 | ra_rustc_lexer::TokenKind::Gt => R_ANGLE, |
156 | if !is_raw { | 98 | ra_rustc_lexer::TokenKind::ShrEq => decompose!(R_ANGLE, R_ANGLE, EQ), |
157 | if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { | 99 | ra_rustc_lexer::TokenKind::Shr => decompose!(R_ANGLE, R_ANGLE), |
158 | return kind; | 100 | ra_rustc_lexer::TokenKind::RArrow => decompose!(MINUS, R_ANGLE), |
159 | } | 101 | ra_rustc_lexer::TokenKind::Minus => MINUS, |
160 | } | 102 | ra_rustc_lexer::TokenKind::MinusEq => decompose!(MINUS, EQ), |
161 | IDENT | 103 | ra_rustc_lexer::TokenKind::And => AMP, |
162 | } | 104 | ra_rustc_lexer::TokenKind::AndAnd => decompose!(AMP, AMP), |
163 | 105 | ra_rustc_lexer::TokenKind::AndEq => decompose!(AMP, EQ), | |
164 | fn scan_literal_suffix(ptr: &mut Ptr) { | 106 | ra_rustc_lexer::TokenKind::Or => PIPE, |
165 | if ptr.at_p(is_ident_start) { | 107 | ra_rustc_lexer::TokenKind::OrOr => decompose!(PIPE, PIPE), |
166 | ptr.bump(); | 108 | ra_rustc_lexer::TokenKind::OrEq => decompose!(PIPE, EQ), |
109 | ra_rustc_lexer::TokenKind::PlusEq => decompose!(PLUS, EQ), | ||
110 | ra_rustc_lexer::TokenKind::Plus => PLUS, | ||
111 | ra_rustc_lexer::TokenKind::StarEq => decompose!(STAR, EQ), | ||
112 | ra_rustc_lexer::TokenKind::Star => STAR, | ||
113 | ra_rustc_lexer::TokenKind::SlashEq => decompose!(SLASH, EQ), | ||
114 | ra_rustc_lexer::TokenKind::Slash => SLASH, | ||
115 | ra_rustc_lexer::TokenKind::CaretEq => decompose!(CARET, EQ), | ||
116 | ra_rustc_lexer::TokenKind::Caret => CARET, | ||
117 | ra_rustc_lexer::TokenKind::PercentEq => decompose!(PERCENT, EQ), | ||
118 | ra_rustc_lexer::TokenKind::Percent => PERCENT, | ||
119 | ra_rustc_lexer::TokenKind::Unknown => ERROR, | ||
120 | }; | ||
121 | let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) }; | ||
122 | acc.push(token); | ||
123 | text = &text[rustc_token.len..]; | ||
167 | } | 124 | } |
168 | ptr.bump_while(is_ident_continue); | 125 | acc |
169 | } | 126 | } |
170 | 127 | ||
171 | pub fn classify_literal(text: &str) -> Option<Token> { | 128 | pub fn classify_literal(text: &str) -> Option<Token> { |
172 | let tkn = next_token(text); | 129 | let t = ra_rustc_lexer::first_token(text); |
173 | if !tkn.kind.is_literal() || tkn.len.to_usize() != text.len() { | 130 | if t.len != text.len() { |
174 | return None; | 131 | return None; |
175 | } | 132 | } |
176 | 133 | let kind = match t.kind { | |
177 | Some(tkn) | 134 | ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind { |
135 | ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, | ||
136 | ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, | ||
137 | ra_rustc_lexer::LiteralKind::Char { .. } => CHAR, | ||
138 | ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE, | ||
139 | ra_rustc_lexer::LiteralKind::Str { .. } => STRING, | ||
140 | ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, | ||
141 | ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, | ||
142 | ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, | ||
143 | }, | ||
144 | _ => return None, | ||
145 | }; | ||
146 | Some(Token { kind, len: TextUnit::from_usize(t.len) }) | ||
178 | } | 147 | } |
diff --git a/crates/ra_syntax/src/parsing/lexer/classes.rs b/crates/ra_syntax/src/parsing/lexer/classes.rs deleted file mode 100644 index 4235d2648..000000000 --- a/crates/ra_syntax/src/parsing/lexer/classes.rs +++ /dev/null | |||
@@ -1,26 +0,0 @@ | |||
1 | use unicode_xid::UnicodeXID; | ||
2 | |||
3 | pub fn is_ident_start(c: char) -> bool { | ||
4 | (c >= 'a' && c <= 'z') | ||
5 | || (c >= 'A' && c <= 'Z') | ||
6 | || c == '_' | ||
7 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) | ||
8 | } | ||
9 | |||
10 | pub fn is_ident_continue(c: char) -> bool { | ||
11 | (c >= 'a' && c <= 'z') | ||
12 | || (c >= 'A' && c <= 'Z') | ||
13 | || (c >= '0' && c <= '9') | ||
14 | || c == '_' | ||
15 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) | ||
16 | } | ||
17 | |||
18 | pub fn is_whitespace(c: char) -> bool { | ||
19 | //FIXME: use is_pattern_whitespace | ||
20 | //https://github.com/behnam/rust-unic/issues/192 | ||
21 | c.is_whitespace() | ||
22 | } | ||
23 | |||
24 | pub fn is_dec_digit(c: char) -> bool { | ||
25 | '0' <= c && c <= '9' | ||
26 | } | ||
diff --git a/crates/ra_syntax/src/parsing/lexer/comments.rs b/crates/ra_syntax/src/parsing/lexer/comments.rs deleted file mode 100644 index 8bbbe659b..000000000 --- a/crates/ra_syntax/src/parsing/lexer/comments.rs +++ /dev/null | |||
@@ -1,57 +0,0 @@ | |||
1 | use crate::parsing::lexer::ptr::Ptr; | ||
2 | |||
3 | use crate::SyntaxKind::{self, *}; | ||
4 | |||
5 | pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { | ||
6 | if ptr.at_str("!/") { | ||
7 | ptr.bump(); | ||
8 | ptr.bump(); | ||
9 | bump_until_eol(ptr); | ||
10 | true | ||
11 | } else { | ||
12 | false | ||
13 | } | ||
14 | } | ||
15 | |||
16 | fn scan_block_comment(ptr: &mut Ptr) -> Option<SyntaxKind> { | ||
17 | if ptr.at('*') { | ||
18 | ptr.bump(); | ||
19 | let mut depth: u32 = 1; | ||
20 | while depth > 0 { | ||
21 | if ptr.at_str("*/") { | ||
22 | depth -= 1; | ||
23 | ptr.bump(); | ||
24 | ptr.bump(); | ||
25 | } else if ptr.at_str("/*") { | ||
26 | depth += 1; | ||
27 | ptr.bump(); | ||
28 | ptr.bump(); | ||
29 | } else if ptr.bump().is_none() { | ||
30 | break; | ||
31 | } | ||
32 | } | ||
33 | Some(COMMENT) | ||
34 | } else { | ||
35 | None | ||
36 | } | ||
37 | } | ||
38 | |||
39 | pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option<SyntaxKind> { | ||
40 | if ptr.at('/') { | ||
41 | bump_until_eol(ptr); | ||
42 | Some(COMMENT) | ||
43 | } else { | ||
44 | scan_block_comment(ptr) | ||
45 | } | ||
46 | } | ||
47 | |||
48 | fn bump_until_eol(ptr: &mut Ptr) { | ||
49 | loop { | ||
50 | if ptr.at('\n') || ptr.at_str("\r\n") { | ||
51 | return; | ||
52 | } | ||
53 | if ptr.bump().is_none() { | ||
54 | break; | ||
55 | } | ||
56 | } | ||
57 | } | ||
diff --git a/crates/ra_syntax/src/parsing/lexer/numbers.rs b/crates/ra_syntax/src/parsing/lexer/numbers.rs deleted file mode 100644 index e53ae231b..000000000 --- a/crates/ra_syntax/src/parsing/lexer/numbers.rs +++ /dev/null | |||
@@ -1,66 +0,0 @@ | |||
1 | use crate::parsing::lexer::{classes::*, ptr::Ptr}; | ||
2 | |||
3 | use crate::SyntaxKind::{self, *}; | ||
4 | |||
5 | pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { | ||
6 | if c == '0' { | ||
7 | match ptr.current().unwrap_or('\0') { | ||
8 | 'b' | 'o' => { | ||
9 | ptr.bump(); | ||
10 | scan_digits(ptr, false); | ||
11 | } | ||
12 | 'x' => { | ||
13 | ptr.bump(); | ||
14 | scan_digits(ptr, true); | ||
15 | } | ||
16 | '0'..='9' | '_' | '.' | 'e' | 'E' => { | ||
17 | scan_digits(ptr, true); | ||
18 | } | ||
19 | _ => return INT_NUMBER, | ||
20 | } | ||
21 | } else { | ||
22 | scan_digits(ptr, false); | ||
23 | } | ||
24 | |||
25 | // might be a float, but don't be greedy if this is actually an | ||
26 | // integer literal followed by field/method access or a range pattern | ||
27 | // (`0..2` and `12.foo()`) | ||
28 | if ptr.at('.') && !(ptr.at_str("..") || ptr.nth_is_p(1, is_ident_start)) { | ||
29 | // might have stuff after the ., and if it does, it needs to start | ||
30 | // with a number | ||
31 | ptr.bump(); | ||
32 | scan_digits(ptr, false); | ||
33 | scan_float_exponent(ptr); | ||
34 | return FLOAT_NUMBER; | ||
35 | } | ||
36 | // it might be a float if it has an exponent | ||
37 | if ptr.at('e') || ptr.at('E') { | ||
38 | scan_float_exponent(ptr); | ||
39 | return FLOAT_NUMBER; | ||
40 | } | ||
41 | INT_NUMBER | ||
42 | } | ||
43 | |||
44 | fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { | ||
45 | while let Some(c) = ptr.current() { | ||
46 | match c { | ||
47 | '_' | '0'..='9' => { | ||
48 | ptr.bump(); | ||
49 | } | ||
50 | 'a'..='f' | 'A'..='F' if allow_hex => { | ||
51 | ptr.bump(); | ||
52 | } | ||
53 | _ => return, | ||
54 | } | ||
55 | } | ||
56 | } | ||
57 | |||
58 | fn scan_float_exponent(ptr: &mut Ptr) { | ||
59 | if ptr.at('e') || ptr.at('E') { | ||
60 | ptr.bump(); | ||
61 | if ptr.at('-') || ptr.at('+') { | ||
62 | ptr.bump(); | ||
63 | } | ||
64 | scan_digits(ptr, false); | ||
65 | } | ||
66 | } | ||
diff --git a/crates/ra_syntax/src/parsing/lexer/ptr.rs b/crates/ra_syntax/src/parsing/lexer/ptr.rs deleted file mode 100644 index c341c4176..000000000 --- a/crates/ra_syntax/src/parsing/lexer/ptr.rs +++ /dev/null | |||
@@ -1,162 +0,0 @@ | |||
1 | use crate::TextUnit; | ||
2 | |||
3 | use std::str::Chars; | ||
4 | |||
5 | /// A simple view into the characters of a string. | ||
6 | pub(crate) struct Ptr<'s> { | ||
7 | text: &'s str, | ||
8 | len: TextUnit, | ||
9 | } | ||
10 | |||
11 | impl<'s> Ptr<'s> { | ||
12 | /// Creates a new `Ptr` from a string. | ||
13 | pub fn new(text: &'s str) -> Ptr<'s> { | ||
14 | Ptr { text, len: 0.into() } | ||
15 | } | ||
16 | |||
17 | /// Gets the length of the text consumed so far. | ||
18 | pub fn into_len(self) -> TextUnit { | ||
19 | self.len | ||
20 | } | ||
21 | |||
22 | /// Gets the current character, if one exists. | ||
23 | pub fn current(&self) -> Option<char> { | ||
24 | self.chars().next() | ||
25 | } | ||
26 | |||
27 | /// Gets the nth character from the current. | ||
28 | /// For example, 0 will return the current character, 1 will return the next, etc. | ||
29 | pub fn nth(&self, n: u32) -> Option<char> { | ||
30 | self.chars().nth(n as usize) | ||
31 | } | ||
32 | |||
33 | /// Checks whether the current character is `c`. | ||
34 | pub fn at(&self, c: char) -> bool { | ||
35 | self.current() == Some(c) | ||
36 | } | ||
37 | |||
38 | /// Checks whether the next characters match `s`. | ||
39 | pub fn at_str(&self, s: &str) -> bool { | ||
40 | let chars = self.chars(); | ||
41 | chars.as_str().starts_with(s) | ||
42 | } | ||
43 | |||
44 | /// Checks whether the current character satisfies the predicate `p`. | ||
45 | pub fn at_p<P: Fn(char) -> bool>(&self, p: P) -> bool { | ||
46 | self.current().map(p) == Some(true) | ||
47 | } | ||
48 | |||
49 | /// Checks whether the nth character satisfies the predicate `p`. | ||
50 | pub fn nth_is_p<P: Fn(char) -> bool>(&self, n: u32, p: P) -> bool { | ||
51 | self.nth(n).map(p) == Some(true) | ||
52 | } | ||
53 | |||
54 | /// Moves to the next character. | ||
55 | pub fn bump(&mut self) -> Option<char> { | ||
56 | let ch = self.chars().next()?; | ||
57 | self.len += TextUnit::of_char(ch); | ||
58 | Some(ch) | ||
59 | } | ||
60 | |||
61 | /// Moves to the next character as long as `pred` is satisfied. | ||
62 | pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) { | ||
63 | loop { | ||
64 | match self.current() { | ||
65 | Some(c) if pred(c) => { | ||
66 | self.bump(); | ||
67 | } | ||
68 | _ => return, | ||
69 | } | ||
70 | } | ||
71 | } | ||
72 | |||
73 | /// Returns the text up to the current point. | ||
74 | pub fn current_token_text(&self) -> &str { | ||
75 | let len: u32 = self.len.into(); | ||
76 | &self.text[..len as usize] | ||
77 | } | ||
78 | |||
79 | /// Returns an iterator over the remaining characters. | ||
80 | fn chars(&self) -> Chars { | ||
81 | let len: u32 = self.len.into(); | ||
82 | self.text[len as usize..].chars() | ||
83 | } | ||
84 | } | ||
85 | |||
86 | #[cfg(test)] | ||
87 | mod tests { | ||
88 | use super::*; | ||
89 | |||
90 | #[test] | ||
91 | fn test_current() { | ||
92 | let ptr = Ptr::new("test"); | ||
93 | assert_eq!(ptr.current(), Some('t')); | ||
94 | } | ||
95 | |||
96 | #[test] | ||
97 | fn test_nth() { | ||
98 | let ptr = Ptr::new("test"); | ||
99 | assert_eq!(ptr.nth(0), Some('t')); | ||
100 | assert_eq!(ptr.nth(1), Some('e')); | ||
101 | assert_eq!(ptr.nth(2), Some('s')); | ||
102 | assert_eq!(ptr.nth(3), Some('t')); | ||
103 | assert_eq!(ptr.nth(4), None); | ||
104 | } | ||
105 | |||
106 | #[test] | ||
107 | fn test_at() { | ||
108 | let ptr = Ptr::new("test"); | ||
109 | assert!(ptr.at('t')); | ||
110 | assert!(!ptr.at('a')); | ||
111 | } | ||
112 | |||
113 | #[test] | ||
114 | fn test_at_str() { | ||
115 | let ptr = Ptr::new("test"); | ||
116 | assert!(ptr.at_str("t")); | ||
117 | assert!(ptr.at_str("te")); | ||
118 | assert!(ptr.at_str("test")); | ||
119 | assert!(!ptr.at_str("tests")); | ||
120 | assert!(!ptr.at_str("rust")); | ||
121 | } | ||
122 | |||
123 | #[test] | ||
124 | fn test_at_p() { | ||
125 | let ptr = Ptr::new("test"); | ||
126 | assert!(ptr.at_p(|c| c == 't')); | ||
127 | assert!(!ptr.at_p(|c| c == 'e')); | ||
128 | } | ||
129 | |||
130 | #[test] | ||
131 | fn test_nth_is_p() { | ||
132 | let ptr = Ptr::new("test"); | ||
133 | assert!(ptr.nth_is_p(0, |c| c == 't')); | ||
134 | assert!(!ptr.nth_is_p(1, |c| c == 't')); | ||
135 | assert!(ptr.nth_is_p(3, |c| c == 't')); | ||
136 | assert!(!ptr.nth_is_p(150, |c| c == 't')); | ||
137 | } | ||
138 | |||
139 | #[test] | ||
140 | fn test_bump() { | ||
141 | let mut ptr = Ptr::new("test"); | ||
142 | assert_eq!(ptr.current(), Some('t')); | ||
143 | ptr.bump(); | ||
144 | assert_eq!(ptr.current(), Some('e')); | ||
145 | ptr.bump(); | ||
146 | assert_eq!(ptr.current(), Some('s')); | ||
147 | ptr.bump(); | ||
148 | assert_eq!(ptr.current(), Some('t')); | ||
149 | ptr.bump(); | ||
150 | assert_eq!(ptr.current(), None); | ||
151 | ptr.bump(); | ||
152 | assert_eq!(ptr.current(), None); | ||
153 | } | ||
154 | |||
155 | #[test] | ||
156 | fn test_bump_while() { | ||
157 | let mut ptr = Ptr::new("test"); | ||
158 | assert_eq!(ptr.current(), Some('t')); | ||
159 | ptr.bump_while(|c| c != 's'); | ||
160 | assert_eq!(ptr.current(), Some('s')); | ||
161 | } | ||
162 | } | ||
diff --git a/crates/ra_syntax/src/parsing/lexer/strings.rs b/crates/ra_syntax/src/parsing/lexer/strings.rs deleted file mode 100644 index f74acff9e..000000000 --- a/crates/ra_syntax/src/parsing/lexer/strings.rs +++ /dev/null | |||
@@ -1,112 +0,0 @@ | |||
1 | use crate::{ | ||
2 | parsing::lexer::ptr::Ptr, | ||
3 | SyntaxKind::{self, *}, | ||
4 | }; | ||
5 | |||
6 | pub(crate) fn is_string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { | ||
7 | match (c, c1, c2) { | ||
8 | ('r', Some('"'), _) | ||
9 | | ('r', Some('#'), Some('"')) | ||
10 | | ('r', Some('#'), Some('#')) | ||
11 | | ('b', Some('"'), _) | ||
12 | | ('b', Some('\''), _) | ||
13 | | ('b', Some('r'), Some('"')) | ||
14 | | ('b', Some('r'), Some('#')) => true, | ||
15 | _ => false, | ||
16 | } | ||
17 | } | ||
18 | |||
19 | pub(crate) fn scan_char(ptr: &mut Ptr) { | ||
20 | while let Some(c) = ptr.current() { | ||
21 | match c { | ||
22 | '\\' => { | ||
23 | ptr.bump(); | ||
24 | if ptr.at('\\') || ptr.at('\'') { | ||
25 | ptr.bump(); | ||
26 | } | ||
27 | } | ||
28 | '\'' => { | ||
29 | ptr.bump(); | ||
30 | return; | ||
31 | } | ||
32 | '\n' => return, | ||
33 | _ => { | ||
34 | ptr.bump(); | ||
35 | } | ||
36 | } | ||
37 | } | ||
38 | } | ||
39 | |||
40 | pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { | ||
41 | // unwrapping and not-exhaustive match are ok | ||
42 | // because of string_literal_start | ||
43 | let c = ptr.bump().unwrap(); | ||
44 | match c { | ||
45 | '\'' => { | ||
46 | scan_byte(ptr); | ||
47 | BYTE | ||
48 | } | ||
49 | '"' => { | ||
50 | scan_byte_string(ptr); | ||
51 | BYTE_STRING | ||
52 | } | ||
53 | 'r' => { | ||
54 | scan_raw_string(ptr); | ||
55 | RAW_BYTE_STRING | ||
56 | } | ||
57 | _ => unreachable!(), | ||
58 | } | ||
59 | } | ||
60 | |||
61 | pub(crate) fn scan_string(ptr: &mut Ptr) { | ||
62 | while let Some(c) = ptr.current() { | ||
63 | match c { | ||
64 | '\\' => { | ||
65 | ptr.bump(); | ||
66 | if ptr.at('\\') || ptr.at('"') { | ||
67 | ptr.bump(); | ||
68 | } | ||
69 | } | ||
70 | '"' => { | ||
71 | ptr.bump(); | ||
72 | return; | ||
73 | } | ||
74 | _ => { | ||
75 | ptr.bump(); | ||
76 | } | ||
77 | } | ||
78 | } | ||
79 | } | ||
80 | |||
81 | pub(crate) fn scan_raw_string(ptr: &mut Ptr) { | ||
82 | let mut hashes = 0; | ||
83 | while ptr.at('#') { | ||
84 | hashes += 1; | ||
85 | ptr.bump(); | ||
86 | } | ||
87 | if !ptr.at('"') { | ||
88 | return; | ||
89 | } | ||
90 | ptr.bump(); | ||
91 | |||
92 | while let Some(c) = ptr.bump() { | ||
93 | if c == '"' { | ||
94 | let mut hashes_left = hashes; | ||
95 | while ptr.at('#') && hashes_left > 0 { | ||
96 | hashes_left -= 1; | ||
97 | ptr.bump(); | ||
98 | } | ||
99 | if hashes_left == 0 { | ||
100 | return; | ||
101 | } | ||
102 | } | ||
103 | } | ||
104 | } | ||
105 | |||
106 | fn scan_byte(ptr: &mut Ptr) { | ||
107 | scan_char(ptr) | ||
108 | } | ||
109 | |||
110 | fn scan_byte_string(ptr: &mut Ptr) { | ||
111 | scan_string(ptr) | ||
112 | } | ||