aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/string_lexing.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/ra_syntax/src/string_lexing.rs')
-rw-r--r--crates/ra_syntax/src/string_lexing.rs338
1 files changed, 332 insertions, 6 deletions
diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs
index 349733f3f..4c3eea3d2 100644
--- a/crates/ra_syntax/src/string_lexing.rs
+++ b/crates/ra_syntax/src/string_lexing.rs
@@ -1,7 +1,333 @@
1mod parser; 1use crate::{TextRange, TextUnit};
2mod string; 2use self::StringComponentKind::*;
3 3
4pub use self::{ 4#[derive(Debug, Eq, PartialEq, Clone)]
5 parser::{StringComponent, StringComponentKind}, 5pub(crate) struct StringComponent {
6 string::{parse_string_literal, parse_char_literal, parse_byte_literal, parse_byte_string_literal}, 6 pub(crate) range: TextRange,
7}; 7 pub(crate) kind: StringComponentKind,
8}
9
/// Classification of the pieces a quoted literal is split into.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) enum StringComponentKind {
    /// A `\` directly followed by a line break, together with the whitespace run after it.
    /// Only produced for `"`-quoted literals.
    IgnoreNewline,
    /// A single character that does not start an escape.
    CodePoint,
    /// A `\` followed by one character other than `x` or `u` (or by end of input).
    AsciiEscape,
    /// A `\x` escape and the characters lexed as its code.
    AsciiCodeEscape,
    /// A `\u` escape, with its `{...}` body when one follows.
    UnicodeEscape,
}
18
19pub(crate) fn parse_quoted_literal(
20 prefix: Option<char>,
21 quote: char,
22 src: &str,
23) -> StringComponentIter {
24 let prefix = prefix.map(|p| match p {
25 'b' => b'b',
26 _ => panic!("invalid prefix"),
27 });
28 let quote = match quote {
29 '\'' => b'\'',
30 '"' => b'"',
31 _ => panic!("invalid quote"),
32 };
33 StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None }
34}
35
36pub(crate) struct StringComponentIter<'a> {
37 src: &'a str,
38 prefix: Option<u8>,
39 quote: u8,
40 pos: usize,
41 pub(crate) has_closing_quote: bool,
42 pub(crate) suffix: Option<TextRange>,
43}
44
45impl<'a> Iterator for StringComponentIter<'a> {
46 type Item = StringComponent;
47 fn next(&mut self) -> Option<StringComponent> {
48 if self.pos == 0 {
49 if let Some(prefix) = self.prefix {
50 assert!(
51 self.advance() == prefix as char,
52 "literal should start with a {:?}",
53 prefix as char,
54 );
55 }
56 assert!(
57 self.advance() == self.quote as char,
58 "literal should start with a {:?}",
59 self.quote as char,
60 );
61 }
62
63 if let Some(component) = self.parse_component() {
64 return Some(component);
65 }
66
67 // We get here when there are no char components left to parse
68 if self.peek() == Some(self.quote as char) {
69 self.advance();
70 self.has_closing_quote = true;
71 if let Some(range) = self.parse_suffix() {
72 self.suffix = Some(range);
73 }
74 }
75
76 assert!(
77 self.peek() == None,
78 "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}",
79 self.src,
80 self.pos,
81 self.src.len()
82 );
83
84 None
85 }
86}
87
88impl<'a> StringComponentIter<'a> {
89 fn peek(&self) -> Option<char> {
90 if self.pos == self.src.len() {
91 return None;
92 }
93
94 self.src[self.pos..].chars().next()
95 }
96
97 fn advance(&mut self) -> char {
98 let next = self.peek().expect("cannot advance if end of input is reached");
99 self.pos += next.len_utf8();
100 next
101 }
102
103 fn parse_component(&mut self) -> Option<StringComponent> {
104 let next = self.peek()?;
105
106 // Ignore string close
107 if next == self.quote as char {
108 return None;
109 }
110
111 let start = self.start_range();
112 self.advance();
113
114 if next == '\\' {
115 // Strings can use `\` to ignore newlines, so we first try to parse one of those
116 // before falling back to parsing char escapes
117 if self.quote == b'"' {
118 if let Some(component) = self.parse_ignore_newline(start) {
119 return Some(component);
120 }
121 }
122
123 Some(self.parse_escape(start))
124 } else {
125 Some(self.finish_component(start, CodePoint))
126 }
127 }
128
129 fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {
130 // In string literals, when a `\` occurs immediately before the newline, the `\`,
131 // the newline, and all whitespace at the beginning of the next line are ignored
132 match self.peek() {
133 Some('\n') | Some('\r') => {
134 self.skip_whitespace();
135 Some(self.finish_component(start, IgnoreNewline))
136 }
137 _ => None,
138 }
139 }
140
141 fn skip_whitespace(&mut self) {
142 while self.peek().map(|c| c.is_whitespace()) == Some(true) {
143 self.advance();
144 }
145 }
146
147 fn parse_escape(&mut self, start: TextUnit) -> StringComponent {
148 if self.peek().is_none() {
149 return self.finish_component(start, AsciiEscape);
150 }
151
152 let next = self.advance();
153 match next {
154 'x' => self.parse_ascii_code_escape(start),
155 'u' => self.parse_unicode_escape(start),
156 _ => self.finish_component(start, AsciiEscape),
157 }
158 }
159
160 fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent {
161 match self.peek() {
162 Some('{') => {
163 self.advance();
164
165 // Parse anything until we reach `}`
166 while let Some(next) = self.peek() {
167 self.advance();
168 if next == '}' {
169 break;
170 }
171 }
172
173 self.finish_component(start, UnicodeEscape)
174 }
175 Some(_) | None => self.finish_component(start, UnicodeEscape),
176 }
177 }
178
179 fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent {
180 let code_start = self.pos;
181 while let Some(next) = self.peek() {
182 if next == '\'' || (self.pos - code_start == 2) {
183 break;
184 }
185
186 self.advance();
187 }
188 self.finish_component(start, AsciiCodeEscape)
189 }
190
191 fn parse_suffix(&mut self) -> Option<TextRange> {
192 let start = self.start_range();
193 let _ = self.peek()?;
194 while let Some(_) = self.peek() {
195 self.advance();
196 }
197 Some(self.finish_range(start))
198 }
199
200 fn start_range(&self) -> TextUnit {
201 TextUnit::from_usize(self.pos)
202 }
203
204 fn finish_range(&self, start: TextUnit) -> TextRange {
205 TextRange::from_to(start, TextUnit::from_usize(self.pos))
206 }
207
208 fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent {
209 let range = self.finish_range(start);
210 StringComponent { range, kind }
211 }
212}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` as an unprefixed char literal and returns whether a closing quote
    /// was seen, along with every component produced.
    fn parse(src: &str) -> (bool, Vec<StringComponent>) {
        let mut lexer = parse_quoted_literal(None, '\'', src);
        let components: Vec<_> = lexer.by_ref().collect();
        (lexer.has_closing_quote, components)
    }

    /// Lexes `src`, requiring it to be unclosed and to hold exactly one component.
    fn unclosed_char_component(src: &str) -> StringComponent {
        let (closed, mut components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert_eq!(components.len(), 1);
        components.remove(0)
    }

    /// Lexes `src`, requiring it to be closed and to hold exactly one component.
    fn closed_char_component(src: &str) -> StringComponent {
        let (closed, mut components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components);
        components.remove(0)
    }

    /// Lexes `src`, requiring it to be closed, and returns all of its components.
    fn closed_char_components(src: &str) -> Vec<StringComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and the closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range of everything after the opening quote of `src`, up to its end.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = closed_char_component(&escape_sequence);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_closed(&escape_sequence));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&escape_sequence);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&escape_sequence));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let lit_text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, AsciiEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }

    #[test]
    fn test_no_escapes() {
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CodePoint);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }
}