diff options
Diffstat (limited to 'crates/ra_syntax/src')
-rw-r--r-- | crates/ra_syntax/src/ast/generated.rs | 74 | ||||
-rw-r--r-- | crates/ra_syntax/src/ast/mod.rs | 18 | ||||
-rw-r--r-- | crates/ra_syntax/src/grammar.ron | 2 | ||||
-rw-r--r-- | crates/ra_syntax/src/grammar/items/mod.rs | 2 | ||||
-rw-r--r-- | crates/ra_syntax/src/grammar/paths.rs | 2 | ||||
-rw-r--r-- | crates/ra_syntax/src/reparsing.rs | 6 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing.rs | 414 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/byte.rs | 51 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/byte_string.rs | 51 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/char.rs | 176 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/mod.rs | 13 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/parser.rs | 201 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing/string.rs | 46 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte.rs | 211 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte_string.rs | 178 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 192 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/mod.rs | 4 | ||||
-rw-r--r-- | crates/ra_syntax/src/yellow/syntax_error.rs | 23 |
18 files changed, 1153 insertions, 511 deletions
diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs index 2e9ae263a..bf056131e 100644 --- a/crates/ra_syntax/src/ast/generated.rs +++ b/crates/ra_syntax/src/ast/generated.rs | |||
@@ -372,6 +372,80 @@ impl<R: TreeRoot<RaTypes>> BreakExprNode<R> { | |||
372 | 372 | ||
373 | impl<'a> BreakExpr<'a> {} | 373 | impl<'a> BreakExpr<'a> {} |
374 | 374 | ||
375 | // Byte | ||
376 | #[derive(Debug, Clone, Copy,)] | ||
377 | pub struct ByteNode<R: TreeRoot<RaTypes> = OwnedRoot> { | ||
378 | pub(crate) syntax: SyntaxNode<R>, | ||
379 | } | ||
380 | pub type Byte<'a> = ByteNode<RefRoot<'a>>; | ||
381 | |||
382 | impl<R1: TreeRoot<RaTypes>, R2: TreeRoot<RaTypes>> PartialEq<ByteNode<R1>> for ByteNode<R2> { | ||
383 | fn eq(&self, other: &ByteNode<R1>) -> bool { self.syntax == other.syntax } | ||
384 | } | ||
385 | impl<R: TreeRoot<RaTypes>> Eq for ByteNode<R> {} | ||
386 | impl<R: TreeRoot<RaTypes>> Hash for ByteNode<R> { | ||
387 | fn hash<H: Hasher>(&self, state: &mut H) { self.syntax.hash(state) } | ||
388 | } | ||
389 | |||
390 | impl<'a> AstNode<'a> for Byte<'a> { | ||
391 | fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> { | ||
392 | match syntax.kind() { | ||
393 | BYTE => Some(Byte { syntax }), | ||
394 | _ => None, | ||
395 | } | ||
396 | } | ||
397 | fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } | ||
398 | } | ||
399 | |||
400 | impl<R: TreeRoot<RaTypes>> ByteNode<R> { | ||
401 | pub fn borrowed(&self) -> Byte { | ||
402 | ByteNode { syntax: self.syntax.borrowed() } | ||
403 | } | ||
404 | pub fn owned(&self) -> ByteNode { | ||
405 | ByteNode { syntax: self.syntax.owned() } | ||
406 | } | ||
407 | } | ||
408 | |||
409 | |||
410 | impl<'a> Byte<'a> {} | ||
411 | |||
412 | // ByteString | ||
413 | #[derive(Debug, Clone, Copy,)] | ||
414 | pub struct ByteStringNode<R: TreeRoot<RaTypes> = OwnedRoot> { | ||
415 | pub(crate) syntax: SyntaxNode<R>, | ||
416 | } | ||
417 | pub type ByteString<'a> = ByteStringNode<RefRoot<'a>>; | ||
418 | |||
419 | impl<R1: TreeRoot<RaTypes>, R2: TreeRoot<RaTypes>> PartialEq<ByteStringNode<R1>> for ByteStringNode<R2> { | ||
420 | fn eq(&self, other: &ByteStringNode<R1>) -> bool { self.syntax == other.syntax } | ||
421 | } | ||
422 | impl<R: TreeRoot<RaTypes>> Eq for ByteStringNode<R> {} | ||
423 | impl<R: TreeRoot<RaTypes>> Hash for ByteStringNode<R> { | ||
424 | fn hash<H: Hasher>(&self, state: &mut H) { self.syntax.hash(state) } | ||
425 | } | ||
426 | |||
427 | impl<'a> AstNode<'a> for ByteString<'a> { | ||
428 | fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> { | ||
429 | match syntax.kind() { | ||
430 | BYTE_STRING => Some(ByteString { syntax }), | ||
431 | _ => None, | ||
432 | } | ||
433 | } | ||
434 | fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } | ||
435 | } | ||
436 | |||
437 | impl<R: TreeRoot<RaTypes>> ByteStringNode<R> { | ||
438 | pub fn borrowed(&self) -> ByteString { | ||
439 | ByteStringNode { syntax: self.syntax.borrowed() } | ||
440 | } | ||
441 | pub fn owned(&self) -> ByteStringNode { | ||
442 | ByteStringNode { syntax: self.syntax.owned() } | ||
443 | } | ||
444 | } | ||
445 | |||
446 | |||
447 | impl<'a> ByteString<'a> {} | ||
448 | |||
375 | // CallExpr | 449 | // CallExpr |
376 | #[derive(Debug, Clone, Copy,)] | 450 | #[derive(Debug, Clone, Copy,)] |
377 | pub struct CallExprNode<R: TreeRoot<RaTypes> = OwnedRoot> { | 451 | pub struct CallExprNode<R: TreeRoot<RaTypes> = OwnedRoot> { |
diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs index f20714ede..91c67119f 100644 --- a/crates/ra_syntax/src/ast/mod.rs +++ b/crates/ra_syntax/src/ast/mod.rs | |||
@@ -134,6 +134,18 @@ impl<'a> Char<'a> { | |||
134 | } | 134 | } |
135 | } | 135 | } |
136 | 136 | ||
137 | impl<'a> Byte<'a> { | ||
138 | pub fn text(&self) -> &SmolStr { | ||
139 | &self.syntax().leaf_text().unwrap() | ||
140 | } | ||
141 | } | ||
142 | |||
143 | impl<'a> ByteString<'a> { | ||
144 | pub fn text(&self) -> &SmolStr { | ||
145 | &self.syntax().leaf_text().unwrap() | ||
146 | } | ||
147 | } | ||
148 | |||
137 | impl<'a> String<'a> { | 149 | impl<'a> String<'a> { |
138 | pub fn text(&self) -> &SmolStr { | 150 | pub fn text(&self) -> &SmolStr { |
139 | &self.syntax().leaf_text().unwrap() | 151 | &self.syntax().leaf_text().unwrap() |
@@ -303,6 +315,12 @@ impl<'a> PathSegment<'a> { | |||
303 | } | 315 | } |
304 | } | 316 | } |
305 | 317 | ||
318 | impl<'a> UseTree<'a> { | ||
319 | pub fn has_star(self) -> bool { | ||
320 | self.syntax().children().any(|it| it.kind() == STAR) | ||
321 | } | ||
322 | } | ||
323 | |||
306 | impl<'a> UseTreeList<'a> { | 324 | impl<'a> UseTreeList<'a> { |
307 | pub fn parent_use_tree(self) -> UseTree<'a> { | 325 | pub fn parent_use_tree(self) -> UseTree<'a> { |
308 | self.syntax() | 326 | self.syntax() |
diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron index c3184667e..53cd2118f 100644 --- a/crates/ra_syntax/src/grammar.ron +++ b/crates/ra_syntax/src/grammar.ron | |||
@@ -412,6 +412,8 @@ Grammar( | |||
412 | "RangeExpr": (), | 412 | "RangeExpr": (), |
413 | "BinExpr": (), | 413 | "BinExpr": (), |
414 | "String": (), | 414 | "String": (), |
415 | "Byte": (), | ||
416 | "ByteString": (), | ||
415 | "Char": (), | 417 | "Char": (), |
416 | "Literal": (), | 418 | "Literal": (), |
417 | 419 | ||
diff --git a/crates/ra_syntax/src/grammar/items/mod.rs b/crates/ra_syntax/src/grammar/items/mod.rs index 06c6b5e6e..682266908 100644 --- a/crates/ra_syntax/src/grammar/items/mod.rs +++ b/crates/ra_syntax/src/grammar/items/mod.rs | |||
@@ -29,7 +29,7 @@ pub(super) enum ItemFlavor { | |||
29 | Trait, | 29 | Trait, |
30 | } | 30 | } |
31 | 31 | ||
// Set of token kinds used as a recovery set; `pub(super)` so that sibling grammar
// modules can pass it to `Parser::err_recover` (the `paths` module does so as
// `items::ITEM_RECOVERY_SET`).
pub(super) const ITEM_RECOVERY_SET: TokenSet = token_set![
    FN_KW, STRUCT_KW, ENUM_KW, IMPL_KW, TRAIT_KW, CONST_KW, STATIC_KW, LET_KW, MOD_KW, PUB_KW,
    CRATE_KW
];
diff --git a/crates/ra_syntax/src/grammar/paths.rs b/crates/ra_syntax/src/grammar/paths.rs index a35a339cc..33a11886c 100644 --- a/crates/ra_syntax/src/grammar/paths.rs +++ b/crates/ra_syntax/src/grammar/paths.rs | |||
@@ -78,7 +78,7 @@ fn path_segment(p: &mut Parser, mode: Mode, first: bool) { | |||
78 | // use crate::foo; | 78 | // use crate::foo; |
79 | SELF_KW | SUPER_KW | CRATE_KW => p.bump(), | 79 | SELF_KW | SUPER_KW | CRATE_KW => p.bump(), |
80 | _ => { | 80 | _ => { |
81 | p.err_and_bump("expected identifier"); | 81 | p.err_recover("expected identifier", items::ITEM_RECOVERY_SET); |
82 | } | 82 | } |
83 | }; | 83 | }; |
84 | } | 84 | } |
diff --git a/crates/ra_syntax/src/reparsing.rs b/crates/ra_syntax/src/reparsing.rs index d48133166..ddcb8f6f6 100644 --- a/crates/ra_syntax/src/reparsing.rs +++ b/crates/ra_syntax/src/reparsing.rs | |||
@@ -186,8 +186,10 @@ mod tests { | |||
186 | 186 | ||
187 | fn do_check<F>(before: &str, replace_with: &str, reparser: F) | 187 | fn do_check<F>(before: &str, replace_with: &str, reparser: F) |
188 | where | 188 | where |
189 | for<'a> F: Fn(SyntaxNodeRef<'a>, &AtomEdit) | 189 | for<'a> F: Fn( |
190 | -> Option<(SyntaxNodeRef<'a>, GreenNode, Vec<SyntaxError>)>, | 190 | SyntaxNodeRef<'a>, |
191 | &AtomEdit, | ||
192 | ) -> Option<(SyntaxNodeRef<'a>, GreenNode, Vec<SyntaxError>)>, | ||
191 | { | 193 | { |
192 | let (range, before) = extract_range(before); | 194 | let (range, before) = extract_range(before); |
193 | let after = replace_range(before.clone(), range, replace_with); | 195 | let after = replace_range(before.clone(), range, replace_with); |
diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs deleted file mode 100644 index d613bb042..000000000 --- a/crates/ra_syntax/src/string_lexing.rs +++ /dev/null | |||
@@ -1,414 +0,0 @@ | |||
1 | use self::CharComponentKind::*; | ||
2 | use rowan::{TextRange, TextUnit}; | ||
3 | |||
4 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { | ||
5 | StringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
12 | pub struct StringComponent { | ||
13 | pub range: TextRange, | ||
14 | pub kind: StringComponentKind, | ||
15 | } | ||
16 | |||
17 | impl StringComponent { | ||
18 | fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { | ||
19 | StringComponent { range, kind } | ||
20 | } | ||
21 | } | ||
22 | |||
/// Kind of a single string literal component.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum StringComponentKind {
    /// A `\` directly followed by `\n` or `\r`, together with the whitespace
    /// that follows it.
    IgnoreNewline,
    /// Any component that is classified the same way as in a char literal.
    Char(CharComponentKind),
}
28 | |||
29 | pub struct StringComponentIterator<'a> { | ||
30 | parser: Parser<'a>, | ||
31 | pub has_closing_quote: bool, | ||
32 | } | ||
33 | |||
34 | impl<'a> Iterator for StringComponentIterator<'a> { | ||
35 | type Item = StringComponent; | ||
36 | fn next(&mut self) -> Option<StringComponent> { | ||
37 | if self.parser.pos == 0 { | ||
38 | assert!( | ||
39 | self.parser.advance() == '"', | ||
40 | "string literal should start with double quotes" | ||
41 | ); | ||
42 | } | ||
43 | |||
44 | if let Some(component) = self.parser.parse_string_component() { | ||
45 | return Some(component); | ||
46 | } | ||
47 | |||
48 | // We get here when there are no char components left to parse | ||
49 | if self.parser.peek() == Some('"') { | ||
50 | self.parser.advance(); | ||
51 | self.has_closing_quote = true; | ||
52 | } | ||
53 | |||
54 | assert!( | ||
55 | self.parser.peek() == None, | ||
56 | "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
57 | self.parser.src, | ||
58 | self.parser.pos, | ||
59 | self.parser.src.len() | ||
60 | ); | ||
61 | |||
62 | None | ||
63 | } | ||
64 | } | ||
65 | |||
66 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { | ||
67 | CharComponentIterator { | ||
68 | parser: Parser::new(src), | ||
69 | has_closing_quote: false, | ||
70 | } | ||
71 | } | ||
72 | |||
73 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
74 | pub struct CharComponent { | ||
75 | pub range: TextRange, | ||
76 | pub kind: CharComponentKind, | ||
77 | } | ||
78 | |||
79 | impl CharComponent { | ||
80 | fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { | ||
81 | CharComponent { range, kind } | ||
82 | } | ||
83 | } | ||
84 | |||
/// Kind of a single char literal component.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum CharComponentKind {
    /// A single character that is not introduced by a backslash.
    CodePoint,
    /// A backslash followed by any character other than `x` or `u`
    /// (also produced for a backslash at the very end of the input).
    AsciiEscape,
    /// A `\x` escape.
    AsciiCodeEscape,
    /// A `\u` escape.
    UnicodeEscape,
}
92 | |||
93 | pub struct CharComponentIterator<'a> { | ||
94 | parser: Parser<'a>, | ||
95 | pub has_closing_quote: bool, | ||
96 | } | ||
97 | |||
98 | impl<'a> Iterator for CharComponentIterator<'a> { | ||
99 | type Item = CharComponent; | ||
100 | fn next(&mut self) -> Option<CharComponent> { | ||
101 | if self.parser.pos == 0 { | ||
102 | assert!( | ||
103 | self.parser.advance() == '\'', | ||
104 | "char literal should start with a quote" | ||
105 | ); | ||
106 | } | ||
107 | |||
108 | if let Some(component) = self.parser.parse_char_component() { | ||
109 | return Some(component); | ||
110 | } | ||
111 | |||
112 | // We get here when there are no char components left to parse | ||
113 | if self.parser.peek() == Some('\'') { | ||
114 | self.parser.advance(); | ||
115 | self.has_closing_quote = true; | ||
116 | } | ||
117 | |||
118 | assert!( | ||
119 | self.parser.peek() == None, | ||
120 | "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
121 | self.parser.src, | ||
122 | self.parser.pos, | ||
123 | self.parser.src.len() | ||
124 | ); | ||
125 | |||
126 | None | ||
127 | } | ||
128 | } | ||
129 | |||
130 | pub struct Parser<'a> { | ||
131 | src: &'a str, | ||
132 | pos: usize, | ||
133 | } | ||
134 | |||
135 | impl<'a> Parser<'a> { | ||
136 | pub fn new(src: &'a str) -> Parser<'a> { | ||
137 | Parser { src, pos: 0 } | ||
138 | } | ||
139 | |||
140 | // Utility methods | ||
141 | |||
142 | pub fn peek(&self) -> Option<char> { | ||
143 | if self.pos == self.src.len() { | ||
144 | return None; | ||
145 | } | ||
146 | |||
147 | self.src[self.pos..].chars().next() | ||
148 | } | ||
149 | |||
150 | pub fn advance(&mut self) -> char { | ||
151 | let next = self | ||
152 | .peek() | ||
153 | .expect("cannot advance if end of input is reached"); | ||
154 | self.pos += next.len_utf8(); | ||
155 | next | ||
156 | } | ||
157 | |||
158 | pub fn skip_whitespace(&mut self) { | ||
159 | while self.peek().map(|c| c.is_whitespace()) == Some(true) { | ||
160 | self.advance(); | ||
161 | } | ||
162 | } | ||
163 | |||
164 | pub fn get_pos(&self) -> TextUnit { | ||
165 | (self.pos as u32).into() | ||
166 | } | ||
167 | |||
168 | // Char parsing methods | ||
169 | |||
170 | fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { | ||
171 | match self.peek() { | ||
172 | Some('{') => { | ||
173 | self.advance(); | ||
174 | |||
175 | // Parse anything until we reach `}` | ||
176 | while let Some(next) = self.peek() { | ||
177 | self.advance(); | ||
178 | if next == '}' { | ||
179 | break; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | let end = self.get_pos(); | ||
184 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
185 | } | ||
186 | Some(_) | None => { | ||
187 | let end = self.get_pos(); | ||
188 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
189 | } | ||
190 | } | ||
191 | } | ||
192 | |||
193 | fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { | ||
194 | let code_start = self.get_pos(); | ||
195 | while let Some(next) = self.peek() { | ||
196 | if next == '\'' || (self.get_pos() - code_start == 2.into()) { | ||
197 | break; | ||
198 | } | ||
199 | |||
200 | self.advance(); | ||
201 | } | ||
202 | |||
203 | let end = self.get_pos(); | ||
204 | CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) | ||
205 | } | ||
206 | |||
207 | fn parse_escape(&mut self, start: TextUnit) -> CharComponent { | ||
208 | if self.peek().is_none() { | ||
209 | return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); | ||
210 | } | ||
211 | |||
212 | let next = self.advance(); | ||
213 | let end = self.get_pos(); | ||
214 | let range = TextRange::from_to(start, end); | ||
215 | match next { | ||
216 | 'x' => self.parse_ascii_code_escape(start), | ||
217 | 'u' => self.parse_unicode_escape(start), | ||
218 | _ => CharComponent::new(range, AsciiEscape), | ||
219 | } | ||
220 | } | ||
221 | |||
222 | pub fn parse_char_component(&mut self) -> Option<CharComponent> { | ||
223 | let next = self.peek()?; | ||
224 | |||
225 | // Ignore character close | ||
226 | if next == '\'' { | ||
227 | return None; | ||
228 | } | ||
229 | |||
230 | let start = self.get_pos(); | ||
231 | self.advance(); | ||
232 | |||
233 | if next == '\\' { | ||
234 | Some(self.parse_escape(start)) | ||
235 | } else { | ||
236 | let end = self.get_pos(); | ||
237 | Some(CharComponent::new( | ||
238 | TextRange::from_to(start, end), | ||
239 | CodePoint, | ||
240 | )) | ||
241 | } | ||
242 | } | ||
243 | |||
244 | pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { | ||
245 | // In string literals, when a `\` occurs immediately before the newline, the `\`, | ||
246 | // the newline, and all whitespace at the beginning of the next line are ignored | ||
247 | match self.peek() { | ||
248 | Some('\n') | Some('\r') => { | ||
249 | self.skip_whitespace(); | ||
250 | Some(StringComponent::new( | ||
251 | TextRange::from_to(start, self.get_pos()), | ||
252 | StringComponentKind::IgnoreNewline, | ||
253 | )) | ||
254 | } | ||
255 | _ => None, | ||
256 | } | ||
257 | } | ||
258 | |||
259 | pub fn parse_string_component(&mut self) -> Option<StringComponent> { | ||
260 | let next = self.peek()?; | ||
261 | |||
262 | // Ignore string close | ||
263 | if next == '"' { | ||
264 | return None; | ||
265 | } | ||
266 | |||
267 | let start = self.get_pos(); | ||
268 | self.advance(); | ||
269 | |||
270 | if next == '\\' { | ||
271 | // Strings can use `\` to ignore newlines, so we first try to parse one of those | ||
272 | // before falling back to parsing char escapes | ||
273 | self.parse_ignore_newline(start).or_else(|| { | ||
274 | let char_component = self.parse_escape(start); | ||
275 | Some(StringComponent::new( | ||
276 | char_component.range, | ||
277 | StringComponentKind::Char(char_component.kind), | ||
278 | )) | ||
279 | }) | ||
280 | } else { | ||
281 | let end = self.get_pos(); | ||
282 | Some(StringComponent::new( | ||
283 | TextRange::from_to(start, end), | ||
284 | StringComponentKind::Char(CodePoint), | ||
285 | )) | ||
286 | } | ||
287 | } | ||
288 | } | ||
289 | |||
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the char literal iterator to completion and reports whether a closing
    /// quote was seen, along with every component produced.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut iter = super::parse_char_literal(src);
        let components: Vec<_> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Parses `src`, expecting exactly one component and no closing quote.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (closed, components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert_eq!(components.len(), 1);
        components.into_iter().next().unwrap()
    }

    /// Parses `src`, expecting exactly one component and a closing quote.
    fn closed_char_component(src: &str) -> CharComponent {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert_eq!(
            components.len(),
            1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components.into_iter().next().unwrap()
    }

    /// Parses `src`, expecting a closing quote, and returns all components.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range from just after the first byte to just before the last byte of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range from just after the first byte to the end of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
        for escape in unicode_escapes {
            let text = format!(r"'\u{}'", escape);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_closed(&text));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            let text = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&text);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&text));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, CharComponentKind::AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CharComponentKind::CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let text = format!("'{}'", literal);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, CharComponentKind::AsciiEscape);
            assert_eq!(component.range, range_closed(&text));
        }
    }

    #[test]
    fn test_no_escapes() {
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let text = format!("'{}'", literal);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, CharComponentKind::CodePoint);
            assert_eq!(component.range, range_closed(&text));
        }
    }
}
diff --git a/crates/ra_syntax/src/string_lexing/byte.rs b/crates/ra_syntax/src/string_lexing/byte.rs new file mode 100644 index 000000000..24424349c --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte.rs | |||
@@ -0,0 +1,51 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::CharComponent; | ||
3 | |||
4 | pub fn parse_byte_literal(src: &str) -> ByteComponentIterator { | ||
5 | ByteComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct ByteComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for ByteComponentIterator<'a> { | ||
17 | type Item = CharComponent; | ||
18 | fn next(&mut self) -> Option<CharComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == 'b', | ||
22 | "Byte literal should start with a `b`" | ||
23 | ); | ||
24 | |||
25 | assert!( | ||
26 | self.parser.advance() == '\'', | ||
27 | "Byte literal should start with a `b`, followed by a quote" | ||
28 | ); | ||
29 | } | ||
30 | |||
31 | if let Some(component) = self.parser.parse_char_component() { | ||
32 | return Some(component); | ||
33 | } | ||
34 | |||
35 | // We get here when there are no char components left to parse | ||
36 | if self.parser.peek() == Some('\'') { | ||
37 | self.parser.advance(); | ||
38 | self.has_closing_quote = true; | ||
39 | } | ||
40 | |||
41 | assert!( | ||
42 | self.parser.peek() == None, | ||
43 | "byte literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
44 | self.parser.src, | ||
45 | self.parser.pos, | ||
46 | self.parser.src.len() | ||
47 | ); | ||
48 | |||
49 | None | ||
50 | } | ||
51 | } | ||
diff --git a/crates/ra_syntax/src/string_lexing/byte_string.rs b/crates/ra_syntax/src/string_lexing/byte_string.rs new file mode 100644 index 000000000..5b6dda760 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/byte_string.rs | |||
@@ -0,0 +1,51 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::StringComponent; | ||
3 | |||
4 | pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { | ||
5 | ByteStringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct ByteStringComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for ByteStringComponentIterator<'a> { | ||
17 | type Item = StringComponent; | ||
18 | fn next(&mut self) -> Option<StringComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == 'b', | ||
22 | "byte string literal should start with a `b`" | ||
23 | ); | ||
24 | |||
25 | assert!( | ||
26 | self.parser.advance() == '"', | ||
27 | "byte string literal should start with a `b`, followed by double quotes" | ||
28 | ); | ||
29 | } | ||
30 | |||
31 | if let Some(component) = self.parser.parse_string_component() { | ||
32 | return Some(component); | ||
33 | } | ||
34 | |||
35 | // We get here when there are no char components left to parse | ||
36 | if self.parser.peek() == Some('"') { | ||
37 | self.parser.advance(); | ||
38 | self.has_closing_quote = true; | ||
39 | } | ||
40 | |||
41 | assert!( | ||
42 | self.parser.peek() == None, | ||
43 | "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
44 | self.parser.src, | ||
45 | self.parser.pos, | ||
46 | self.parser.src.len() | ||
47 | ); | ||
48 | |||
49 | None | ||
50 | } | ||
51 | } | ||
diff --git a/crates/ra_syntax/src/string_lexing/char.rs b/crates/ra_syntax/src/string_lexing/char.rs new file mode 100644 index 000000000..885c03b14 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/char.rs | |||
@@ -0,0 +1,176 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::CharComponent; | ||
3 | |||
4 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { | ||
5 | CharComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct CharComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for CharComponentIterator<'a> { | ||
17 | type Item = CharComponent; | ||
18 | fn next(&mut self) -> Option<CharComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == '\'', | ||
22 | "char literal should start with a quote" | ||
23 | ); | ||
24 | } | ||
25 | |||
26 | if let Some(component) = self.parser.parse_char_component() { | ||
27 | return Some(component); | ||
28 | } | ||
29 | |||
30 | // We get here when there are no char components left to parse | ||
31 | if self.parser.peek() == Some('\'') { | ||
32 | self.parser.advance(); | ||
33 | self.has_closing_quote = true; | ||
34 | } | ||
35 | |||
36 | assert!( | ||
37 | self.parser.peek() == None, | ||
38 | "char literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
39 | self.parser.src, | ||
40 | self.parser.pos, | ||
41 | self.parser.src.len() | ||
42 | ); | ||
43 | |||
44 | None | ||
45 | } | ||
46 | } | ||
47 | |||
#[cfg(test)]
mod tests {
    use rowan::TextRange;
    use crate::string_lexing::{
        CharComponent,
        CharComponentKind::*,
    };

    /// Runs the char literal iterator to completion and reports whether a closing
    /// quote was seen, along with every component produced.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut iter = super::parse_char_literal(src);
        let components: Vec<_> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Parses `src`, expecting exactly one component and no closing quote.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (closed, components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert_eq!(components.len(), 1);
        components.into_iter().next().unwrap()
    }

    /// Parses `src`, expecting exactly one component and a closing quote.
    fn closed_char_component(src: &str) -> CharComponent {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert_eq!(
            components.len(),
            1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components.into_iter().next().unwrap()
    }

    /// Parses `src`, expecting a closing quote, and returns all components.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range from just after the first byte to just before the last byte of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range from just after the first byte to the end of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
        for escape in unicode_escapes {
            let text = format!(r"'\u{}'", escape);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_closed(&text));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            let text = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&text);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&text));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let text = format!("'{}'", literal);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, AsciiEscape);
            assert_eq!(component.range, range_closed(&text));
        }
    }

    #[test]
    fn test_no_escapes() {
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let text = format!("'{}'", literal);
            let component = closed_char_component(&text);
            assert_eq!(component.kind, CodePoint);
            assert_eq!(component.range, range_closed(&text));
        }
    }
}
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs new file mode 100644 index 000000000..94853331f --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/mod.rs | |||
@@ -0,0 +1,13 @@ | |||
1 | mod parser; | ||
2 | mod byte; | ||
3 | mod byte_string; | ||
4 | mod char; | ||
5 | mod string; | ||
6 | |||
7 | pub use self::{ | ||
8 | byte::parse_byte_literal, | ||
9 | byte_string::parse_byte_string_literal, | ||
10 | char::parse_char_literal, | ||
11 | parser::{CharComponent, CharComponentKind, StringComponent, StringComponentKind}, | ||
12 | string::parse_string_literal, | ||
13 | }; | ||
diff --git a/crates/ra_syntax/src/string_lexing/parser.rs b/crates/ra_syntax/src/string_lexing/parser.rs new file mode 100644 index 000000000..4a6d5bc93 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/parser.rs | |||
@@ -0,0 +1,201 @@ | |||
1 | use rowan::{TextRange, TextUnit}; | ||
2 | |||
3 | use self::CharComponentKind::*; | ||
4 | |||
5 | pub struct Parser<'a> { | ||
6 | pub(super) src: &'a str, | ||
7 | pub(super) pos: usize, | ||
8 | } | ||
9 | |||
10 | impl<'a> Parser<'a> { | ||
11 | pub fn new(src: &'a str) -> Parser<'a> { | ||
12 | Parser { src, pos: 0 } | ||
13 | } | ||
14 | |||
15 | // Utility methods | ||
16 | |||
17 | pub fn peek(&self) -> Option<char> { | ||
18 | if self.pos == self.src.len() { | ||
19 | return None; | ||
20 | } | ||
21 | |||
22 | self.src[self.pos..].chars().next() | ||
23 | } | ||
24 | |||
25 | pub fn advance(&mut self) -> char { | ||
26 | let next = self | ||
27 | .peek() | ||
28 | .expect("cannot advance if end of input is reached"); | ||
29 | self.pos += next.len_utf8(); | ||
30 | next | ||
31 | } | ||
32 | |||
33 | pub fn skip_whitespace(&mut self) { | ||
34 | while self.peek().map(|c| c.is_whitespace()) == Some(true) { | ||
35 | self.advance(); | ||
36 | } | ||
37 | } | ||
38 | |||
39 | pub fn get_pos(&self) -> TextUnit { | ||
40 | (self.pos as u32).into() | ||
41 | } | ||
42 | |||
43 | // Char parsing methods | ||
44 | |||
45 | fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent { | ||
46 | match self.peek() { | ||
47 | Some('{') => { | ||
48 | self.advance(); | ||
49 | |||
50 | // Parse anything until we reach `}` | ||
51 | while let Some(next) = self.peek() { | ||
52 | self.advance(); | ||
53 | if next == '}' { | ||
54 | break; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | let end = self.get_pos(); | ||
59 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
60 | } | ||
61 | Some(_) | None => { | ||
62 | let end = self.get_pos(); | ||
63 | CharComponent::new(TextRange::from_to(start, end), UnicodeEscape) | ||
64 | } | ||
65 | } | ||
66 | } | ||
67 | |||
68 | fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent { | ||
69 | let code_start = self.get_pos(); | ||
70 | while let Some(next) = self.peek() { | ||
71 | if next == '\'' || (self.get_pos() - code_start == 2.into()) { | ||
72 | break; | ||
73 | } | ||
74 | |||
75 | self.advance(); | ||
76 | } | ||
77 | |||
78 | let end = self.get_pos(); | ||
79 | CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape) | ||
80 | } | ||
81 | |||
82 | fn parse_escape(&mut self, start: TextUnit) -> CharComponent { | ||
83 | if self.peek().is_none() { | ||
84 | return CharComponent::new(TextRange::from_to(start, start), AsciiEscape); | ||
85 | } | ||
86 | |||
87 | let next = self.advance(); | ||
88 | let end = self.get_pos(); | ||
89 | let range = TextRange::from_to(start, end); | ||
90 | match next { | ||
91 | 'x' => self.parse_ascii_code_escape(start), | ||
92 | 'u' => self.parse_unicode_escape(start), | ||
93 | _ => CharComponent::new(range, AsciiEscape), | ||
94 | } | ||
95 | } | ||
96 | |||
97 | pub fn parse_char_component(&mut self) -> Option<CharComponent> { | ||
98 | let next = self.peek()?; | ||
99 | |||
100 | // Ignore character close | ||
101 | if next == '\'' { | ||
102 | return None; | ||
103 | } | ||
104 | |||
105 | let start = self.get_pos(); | ||
106 | self.advance(); | ||
107 | |||
108 | if next == '\\' { | ||
109 | Some(self.parse_escape(start)) | ||
110 | } else { | ||
111 | let end = self.get_pos(); | ||
112 | Some(CharComponent::new( | ||
113 | TextRange::from_to(start, end), | ||
114 | CodePoint, | ||
115 | )) | ||
116 | } | ||
117 | } | ||
118 | |||
119 | pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { | ||
120 | // In string literals, when a `\` occurs immediately before the newline, the `\`, | ||
121 | // the newline, and all whitespace at the beginning of the next line are ignored | ||
122 | match self.peek() { | ||
123 | Some('\n') | Some('\r') => { | ||
124 | self.skip_whitespace(); | ||
125 | Some(StringComponent::new( | ||
126 | TextRange::from_to(start, self.get_pos()), | ||
127 | StringComponentKind::IgnoreNewline, | ||
128 | )) | ||
129 | } | ||
130 | _ => None, | ||
131 | } | ||
132 | } | ||
133 | |||
134 | pub fn parse_string_component(&mut self) -> Option<StringComponent> { | ||
135 | let next = self.peek()?; | ||
136 | |||
137 | // Ignore string close | ||
138 | if next == '"' { | ||
139 | return None; | ||
140 | } | ||
141 | |||
142 | let start = self.get_pos(); | ||
143 | self.advance(); | ||
144 | |||
145 | if next == '\\' { | ||
146 | // Strings can use `\` to ignore newlines, so we first try to parse one of those | ||
147 | // before falling back to parsing char escapes | ||
148 | self.parse_ignore_newline(start).or_else(|| { | ||
149 | let char_component = self.parse_escape(start); | ||
150 | Some(StringComponent::new( | ||
151 | char_component.range, | ||
152 | StringComponentKind::Char(char_component.kind), | ||
153 | )) | ||
154 | }) | ||
155 | } else { | ||
156 | let end = self.get_pos(); | ||
157 | Some(StringComponent::new( | ||
158 | TextRange::from_to(start, end), | ||
159 | StringComponentKind::Char(CodePoint), | ||
160 | )) | ||
161 | } | ||
162 | } | ||
163 | } | ||
164 | |||
165 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
166 | pub struct StringComponent { | ||
167 | pub range: TextRange, | ||
168 | pub kind: StringComponentKind, | ||
169 | } | ||
170 | |||
171 | impl StringComponent { | ||
172 | fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { | ||
173 | StringComponent { range, kind } | ||
174 | } | ||
175 | } | ||
176 | |||
/// The kinds of component a string literal is split into.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum StringComponentKind {
    /// A `\` directly followed by a line break, together with the whitespace after it.
    IgnoreNewline,
    /// Anything else, classified the same way as the contents of a char literal.
    Char(CharComponentKind),
}
182 | |||
183 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
184 | pub struct CharComponent { | ||
185 | pub range: TextRange, | ||
186 | pub kind: CharComponentKind, | ||
187 | } | ||
188 | |||
189 | impl CharComponent { | ||
190 | fn new(range: TextRange, kind: CharComponentKind) -> CharComponent { | ||
191 | CharComponent { range, kind } | ||
192 | } | ||
193 | } | ||
194 | |||
/// The kinds of component found inside a char literal; the characters of string
/// literals are classified with the same kinds.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum CharComponentKind {
    /// A single character that is not part of an escape sequence.
    CodePoint,
    /// A `\` escape that is neither `\x` nor `\u`, for example `\n`. The parser does not
    /// check whether the escaped character is a permitted one.
    AsciiEscape,
    /// An escape introduced by `\x`.
    AsciiCodeEscape,
    /// An escape introduced by `\u`.
    UnicodeEscape,
}
diff --git a/crates/ra_syntax/src/string_lexing/string.rs b/crates/ra_syntax/src/string_lexing/string.rs new file mode 100644 index 000000000..1b23029c6 --- /dev/null +++ b/crates/ra_syntax/src/string_lexing/string.rs | |||
@@ -0,0 +1,46 @@ | |||
1 | use super::parser::Parser; | ||
2 | use super::StringComponent; | ||
3 | |||
4 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { | ||
5 | StringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct StringComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for StringComponentIterator<'a> { | ||
17 | type Item = StringComponent; | ||
18 | fn next(&mut self) -> Option<StringComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == '"', | ||
22 | "string literal should start with double quotes" | ||
23 | ); | ||
24 | } | ||
25 | |||
26 | if let Some(component) = self.parser.parse_string_component() { | ||
27 | return Some(component); | ||
28 | } | ||
29 | |||
30 | // We get here when there are no char components left to parse | ||
31 | if self.parser.peek() == Some('"') { | ||
32 | self.parser.advance(); | ||
33 | self.has_closing_quote = true; | ||
34 | } | ||
35 | |||
36 | assert!( | ||
37 | self.parser.peek() == None, | ||
38 | "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
39 | self.parser.src, | ||
40 | self.parser.pos, | ||
41 | self.parser.src.len() | ||
42 | ); | ||
43 | |||
44 | None | ||
45 | } | ||
46 | } | ||
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs new file mode 100644 index 000000000..43c0d7edd --- /dev/null +++ b/crates/ra_syntax/src/validation/byte.rs | |||
@@ -0,0 +1,211 @@ | |||
1 | //! Validation of byte literals | ||
2 | |||
3 | use crate::{ | ||
4 | ast::{self, AstNode}, | ||
5 | string_lexing::{self, CharComponentKind}, | ||
6 | TextRange, | ||
7 | validation::char, | ||
8 | yellow::{ | ||
9 | SyntaxError, | ||
10 | SyntaxErrorKind::*, | ||
11 | }, | ||
12 | }; | ||
13 | |||
14 | pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec<SyntaxError>) { | ||
15 | let literal_text = node.text(); | ||
16 | let literal_range = node.syntax().range(); | ||
17 | let mut components = string_lexing::parse_byte_literal(literal_text); | ||
18 | let mut len = 0; | ||
19 | for component in &mut components { | ||
20 | len += 1; | ||
21 | let text = &literal_text[component.range]; | ||
22 | let range = component.range + literal_range.start(); | ||
23 | validate_byte_component(text, component.kind, range, errors); | ||
24 | } | ||
25 | |||
26 | if !components.has_closing_quote { | ||
27 | errors.push(SyntaxError::new(UnclosedByte, literal_range)); | ||
28 | } | ||
29 | |||
30 | if len == 0 { | ||
31 | errors.push(SyntaxError::new(EmptyByte, literal_range)); | ||
32 | } | ||
33 | |||
34 | if len > 1 { | ||
35 | errors.push(SyntaxError::new(OverlongByte, literal_range)); | ||
36 | } | ||
37 | } | ||
38 | |||
39 | pub(super) fn validate_byte_component( | ||
40 | text: &str, | ||
41 | kind: CharComponentKind, | ||
42 | range: TextRange, | ||
43 | errors: &mut Vec<SyntaxError>, | ||
44 | ) { | ||
45 | use self::CharComponentKind::*; | ||
46 | match kind { | ||
47 | AsciiEscape => validate_byte_escape(text, range, errors), | ||
48 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), | ||
49 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), | ||
50 | CodePoint => { | ||
51 | let c = text | ||
52 | .chars() | ||
53 | .next() | ||
54 | .expect("Code points should be one character long"); | ||
55 | |||
56 | // These bytes must always be escaped | ||
57 | if c == '\t' || c == '\r' || c == '\n' { | ||
58 | errors.push(SyntaxError::new(UnescapedByte, range)); | ||
59 | } | ||
60 | |||
61 | // Only ASCII bytes are allowed | ||
62 | if c > 0x7F as char { | ||
63 | errors.push(SyntaxError::new(ByteOutOfRange, range)); | ||
64 | } | ||
65 | } | ||
66 | } | ||
67 | } | ||
68 | |||
69 | fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
70 | if text.len() == 1 { | ||
71 | // Escape sequence consists only of leading `\` | ||
72 | errors.push(SyntaxError::new(EmptyByteEscape, range)); | ||
73 | } else { | ||
74 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
75 | if !char::is_ascii_escape(escape_code) { | ||
76 | errors.push(SyntaxError::new(InvalidByteEscape, range)); | ||
77 | } | ||
78 | } | ||
79 | } | ||
80 | |||
81 | fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | ||
82 | // A ByteCodeEscape has 4 chars, example: `\xDD` | ||
83 | if text.len() < 4 { | ||
84 | errors.push(SyntaxError::new(TooShortByteCodeEscape, range)); | ||
85 | } else { | ||
86 | assert!( | ||
87 | text.chars().count() == 4, | ||
88 | "ByteCodeEscape cannot be longer than 4 chars" | ||
89 | ); | ||
90 | |||
91 | if u8::from_str_radix(&text[2..], 16).is_err() { | ||
92 | errors.push(SyntaxError::new(MalformedByteCodeEscape, range)); | ||
93 | } | ||
94 | } | ||
95 | } | ||
96 | |||
#[cfg(test)]
mod test {
    use crate::SourceFileNode;

    /// Parses a one-line source file whose initializer is the byte literal `b'<literal>'`.
    fn build_file(literal: &str) -> SourceFileNode {
        SourceFileNode::parse(&format!("const C: u8 = b'{}';", literal))
    }

    /// Asserts that a file containing `b'<literal>'` has no errors.
    fn assert_valid_byte(literal: &str) {
        let file = build_file(literal);
        let errors = file.errors();
        assert!(
            errors.is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            errors
        );
    }

    /// Asserts that a file containing `b'<literal>'` has at least one error.
    fn assert_invalid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(!file.errors().is_empty());
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..128u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\n' | b'\r' | b'\t' => assert_invalid_byte(&literal),
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                _ => assert_valid_byte(&literal),
            }
        }

        for byte in 128..=255u8 {
            let literal = (byte as char).to_string();
            assert_invalid_byte(&literal);
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        ["Ƒ", "バ", "メ", "﷽"]
            .iter()
            .for_each(|literal| assert_invalid_byte(literal));
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        ["नी", "👨👨"]
            .iter()
            .for_each(|literal| assert_invalid_byte(literal));
    }

    #[test]
    fn test_valid_byte_escape() {
        let escapes = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
        escapes.iter().for_each(|escape| assert_valid_byte(escape));
    }

    #[test]
    fn test_invalid_byte_escape() {
        let escapes = [r"\a", r"\?", r"\"];
        escapes.iter().for_each(|escape| assert_invalid_byte(escape));
    }

    #[test]
    fn test_valid_byte_code_escape() {
        let escapes = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
        escapes.iter().for_each(|escape| assert_valid_byte(escape));
    }

    #[test]
    fn test_invalid_byte_code_escape() {
        let escapes = [r"\x", r"\x7"];
        escapes.iter().for_each(|escape| assert_invalid_byte(escape));
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Well-formed as unicode escapes, but still not allowed in a byte literal
        let well_formed = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        // Not even well-formed unicode escapes
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];

        for escape in well_formed.iter().chain(invalid.iter()) {
            assert_invalid_byte(escape);
        }
    }
}
diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs new file mode 100644 index 000000000..7b830e97c --- /dev/null +++ b/crates/ra_syntax/src/validation/byte_string.rs | |||
@@ -0,0 +1,178 @@ | |||
1 | use crate::{ | ||
2 | ast::{self, AstNode}, | ||
3 | string_lexing::{self, StringComponentKind}, | ||
4 | yellow::{ | ||
5 | SyntaxError, | ||
6 | SyntaxErrorKind::*, | ||
7 | }, | ||
8 | }; | ||
9 | |||
10 | use super::byte; | ||
11 | |||
12 | pub(crate) fn validate_byte_string_node(node: ast::ByteString, errors: &mut Vec<SyntaxError>) { | ||
13 | let literal_text = node.text(); | ||
14 | let literal_range = node.syntax().range(); | ||
15 | let mut components = string_lexing::parse_byte_string_literal(literal_text); | ||
16 | for component in &mut components { | ||
17 | let range = component.range + literal_range.start(); | ||
18 | |||
19 | match component.kind { | ||
20 | StringComponentKind::Char(kind) => { | ||
21 | // Chars must escape \t, \n and \r codepoints, but strings don't | ||
22 | let text = &literal_text[component.range]; | ||
23 | match text { | ||
24 | "\t" | "\n" | "\r" => { /* always valid */ } | ||
25 | _ => byte::validate_byte_component(text, kind, range, errors), | ||
26 | } | ||
27 | } | ||
28 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
29 | } | ||
30 | } | ||
31 | |||
32 | if !components.has_closing_quote { | ||
33 | errors.push(SyntaxError::new(UnclosedString, literal_range)); | ||
34 | } | ||
35 | } | ||
36 | |||
#[cfg(test)]
mod test {
    use crate::SourceFileNode;

    /// Parses a one-line source file whose initializer is the byte string `b"<literal>"`.
    fn build_file(literal: &str) -> SourceFileNode {
        let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
        println!("Source: {}", src);
        SourceFileNode::parse(&src)
    }

    /// Asserts that a file containing `b"<literal>"` has no errors.
    fn assert_valid_str(literal: &str) {
        let file = build_file(literal);
        let errors = file.errors();
        assert!(
            errors.is_empty(),
            "Errors for literal '{}': {:?}",
            literal,
            errors
        );
    }

    /// Asserts that a file containing `b"<literal>"` has at least one error.
    fn assert_invalid_str(literal: &str) {
        let file = build_file(literal);
        assert!(!file.errors().is_empty());
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..128u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\"' | b'\\' => { /* Ignore string close and backslash */ }
                _ => assert_valid_str(&literal),
            }
        }

        for byte in 128..=255u8 {
            let literal = (byte as char).to_string();
            assert_invalid_str(&literal);
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        ["Ƒ", "バ", "メ", "﷽"]
            .iter()
            .for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        ["नी", "👨👨"]
            .iter()
            .for_each(|literal| assert_invalid_str(literal));
    }

    #[test]
    fn test_valid_ascii_escape() {
        let literals = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
        literals.iter().for_each(|literal| assert_valid_str(literal));
    }

    #[test]
    fn test_invalid_ascii_escape() {
        let escapes = [r"\a", r"\?", r"\"];
        escapes.iter().for_each(|escape| assert_invalid_str(escape));
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        let escapes = [r"\x00", r"\x7F", r"\x55", r"\xF0"];
        escapes.iter().for_each(|escape| assert_valid_str(escape));
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        let escapes = [r"\x", r"\x7"];
        escapes.iter().for_each(|escape| assert_invalid_str(escape));
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Well-formed as unicode escapes, but still not allowed in a byte string
        let well_formed = [
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ];
        // Not even well-formed unicode escapes
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];

        for escape in well_formed.iter().chain(invalid.iter()) {
            assert_invalid_str(escape);
        }
    }

    #[test]
    fn test_mixed_invalid() {
        let literal = r"This is the tale of a string
with a newline in between, some emoji (👨👨) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ﷽";
        assert_invalid_str(literal);
    }

    #[test]
    fn test_mixed_valid() {
        let literal = r"This is the tale of a string
with a newline in between, no emoji at all,
nor unicode escapes or weird stuff";
        assert_valid_str(literal);
    }

    #[test]
    fn test_ignore_newline() {
        let literal = "Hello \
World";
        assert_valid_str(literal);
    }
}
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 63f9bad24..4728c85e6 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs | |||
@@ -1,3 +1,5 @@ | |||
1 | //! Validation of char literals | ||
2 | |||
1 | use std::u32; | 3 | use std::u32; |
2 | 4 | ||
3 | use arrayvec::ArrayString; | 5 | use arrayvec::ArrayString; |
@@ -12,7 +14,7 @@ use crate::{ | |||
12 | }, | 14 | }, |
13 | }; | 15 | }; |
14 | 16 | ||
15 | pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) { | 17 | pub(super) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) { |
16 | let literal_text = node.text(); | 18 | let literal_text = node.text(); |
17 | let literal_range = node.syntax().range(); | 19 | let literal_range = node.syntax().range(); |
18 | let mut components = string_lexing::parse_char_literal(literal_text); | 20 | let mut components = string_lexing::parse_char_literal(literal_text); |
@@ -37,7 +39,7 @@ pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) | |||
37 | } | 39 | } |
38 | } | 40 | } |
39 | 41 | ||
40 | pub(crate) fn validate_char_component( | 42 | pub(super) fn validate_char_component( |
41 | text: &str, | 43 | text: &str, |
42 | kind: CharComponentKind, | 44 | kind: CharComponentKind, |
43 | range: TextRange, | 45 | range: TextRange, |
@@ -46,109 +48,115 @@ pub(crate) fn validate_char_component( | |||
46 | // Validate escapes | 48 | // Validate escapes |
47 | use self::CharComponentKind::*; | 49 | use self::CharComponentKind::*; |
48 | match kind { | 50 | match kind { |
49 | AsciiEscape => { | 51 | AsciiEscape => validate_ascii_escape(text, range, errors), |
50 | if text.len() == 1 { | 52 | AsciiCodeEscape => validate_ascii_code_escape(text, range, errors), |
51 | // Escape sequence consists only of leading `\` | 53 | UnicodeEscape => validate_unicode_escape(text, range, errors), |
52 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); | 54 | CodePoint => { |
53 | } else { | 55 | // These code points must always be escaped |
54 | let escape_code = text.chars().skip(1).next().unwrap(); | 56 | if text == "\t" || text == "\r" || text == "\n" { |
55 | if !is_ascii_escape(escape_code) { | 57 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); |
56 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); | ||
57 | } | ||
58 | } | 58 | } |
59 | } | 59 | } |
60 | AsciiCodeEscape => { | 60 | } |
61 | // An AsciiCodeEscape has 4 chars, example: `\xDD` | 61 | } |
62 | if text.len() < 4 { | 62 | |
63 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); | 63 | fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { |
64 | } else { | 64 | if text.len() == 1 { |
65 | assert!( | 65 | // Escape sequence consists only of leading `\` |
66 | text.chars().count() == 4, | 66 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); |
67 | "AsciiCodeEscape cannot be longer than 4 chars" | 67 | } else { |
68 | ); | 68 | let escape_code = text.chars().skip(1).next().unwrap(); |
69 | 69 | if !is_ascii_escape(escape_code) { | |
70 | match u8::from_str_radix(&text[2..], 16) { | 70 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); |
71 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
72 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
73 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
74 | } | ||
75 | } | ||
76 | } | 71 | } |
77 | UnicodeEscape => { | 72 | } |
78 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); | 73 | } |
79 | 74 | ||
80 | if text.len() == 2 { | 75 | pub(super) fn is_ascii_escape(code: char) -> bool { |
81 | // No starting `{` | 76 | match code { |
82 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | 77 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, |
83 | return; | 78 | _ => false, |
84 | } | 79 | } |
80 | } | ||
85 | 81 | ||
86 | if text.len() == 3 { | 82 | fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { |
87 | // Only starting `{` | 83 | // An AsciiCodeEscape has 4 chars, example: `\xDD` |
88 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | 84 | if text.len() < 4 { |
89 | return; | 85 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); |
90 | } | 86 | } else { |
87 | assert!( | ||
88 | text.chars().count() == 4, | ||
89 | "AsciiCodeEscape cannot be longer than 4 chars" | ||
90 | ); | ||
91 | 91 | ||
92 | let mut code = ArrayString::<[_; 6]>::new(); | 92 | match u8::from_str_radix(&text[2..], 16) { |
93 | let mut closed = false; | 93 | Ok(code) if code < 128 => { /* Escape code is valid */ } |
94 | for c in text[3..].chars() { | 94 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), |
95 | assert!(!closed, "no characters after escape is closed"); | 95 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), |
96 | 96 | } | |
97 | if c.is_digit(16) { | 97 | } |
98 | if code.len() == 6 { | 98 | } |
99 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
100 | return; | ||
101 | } | ||
102 | |||
103 | code.push(c); | ||
104 | } else if c == '_' { | ||
105 | // Reject leading _ | ||
106 | if code.len() == 0 { | ||
107 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
108 | return; | ||
109 | } | ||
110 | } else if c == '}' { | ||
111 | closed = true; | ||
112 | } else { | ||
113 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
114 | return; | ||
115 | } | ||
116 | } | ||
117 | 99 | ||
118 | if !closed { | 100 | fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { |
119 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | 101 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); |
120 | } | ||
121 | 102 | ||
122 | if code.len() == 0 { | 103 | if text.len() == 2 { |
123 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | 104 | // No starting `{` |
105 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
106 | return; | ||
107 | } | ||
108 | |||
109 | if text.len() == 3 { | ||
110 | // Only starting `{` | ||
111 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | let mut code = ArrayString::<[_; 6]>::new(); | ||
116 | let mut closed = false; | ||
117 | for c in text[3..].chars() { | ||
118 | assert!(!closed, "no characters after escape is closed"); | ||
119 | |||
120 | if c.is_digit(16) { | ||
121 | if code.len() == 6 { | ||
122 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
124 | return; | 123 | return; |
125 | } | 124 | } |
126 | 125 | ||
127 | match u32::from_str_radix(&code, 16) { | 126 | code.push(c); |
128 | Ok(code_u32) if code_u32 > 0x10FFFF => { | 127 | } else if c == '_' { |
129 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | 128 | // Reject leading _ |
130 | } | 129 | if code.len() == 0 { |
131 | Ok(_) => { | 130 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); |
132 | // Valid escape code | 131 | return; |
133 | } | ||
134 | Err(_) => { | ||
135 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
136 | } | ||
137 | } | ||
138 | } | ||
139 | CodePoint => { | ||
140 | // These code points must always be escaped | ||
141 | if text == "\t" || text == "\r" { | ||
142 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
143 | } | 132 | } |
133 | } else if c == '}' { | ||
134 | closed = true; | ||
135 | } else { | ||
136 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
137 | return; | ||
144 | } | 138 | } |
145 | } | 139 | } |
146 | } | ||
147 | 140 | ||
148 | fn is_ascii_escape(code: char) -> bool { | 141 | if !closed { |
149 | match code { | 142 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) |
150 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | 143 | } |
151 | _ => false, | 144 | |
145 | if code.len() == 0 { | ||
146 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
147 | return; | ||
148 | } | ||
149 | |||
150 | match u32::from_str_radix(&code, 16) { | ||
151 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
152 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
153 | } | ||
154 | Ok(_) => { | ||
155 | // Valid escape code | ||
156 | } | ||
157 | Err(_) => { | ||
158 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
159 | } | ||
152 | } | 160 | } |
153 | } | 161 | } |
154 | 162 | ||
@@ -205,9 +213,7 @@ mod test { | |||
205 | 213 | ||
206 | #[test] | 214 | #[test] |
207 | fn test_valid_ascii_escape() { | 215 | fn test_valid_ascii_escape() { |
208 | let valid = [ | 216 | let valid = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]; |
209 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", | ||
210 | ]; | ||
211 | for c in &valid { | 217 | for c in &valid { |
212 | assert_valid_char(c); | 218 | assert_valid_char(c); |
213 | } | 219 | } |
diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs index 2ff0bc26d..bdee8120c 100644 --- a/crates/ra_syntax/src/validation/mod.rs +++ b/crates/ra_syntax/src/validation/mod.rs | |||
@@ -5,6 +5,8 @@ use crate::{ | |||
5 | yellow::SyntaxError, | 5 | yellow::SyntaxError, |
6 | }; | 6 | }; |
7 | 7 | ||
8 | mod byte; | ||
9 | mod byte_string; | ||
8 | mod char; | 10 | mod char; |
9 | mod string; | 11 | mod string; |
10 | 12 | ||
@@ -12,6 +14,8 @@ pub(crate) fn validate(file: &SourceFileNode) -> Vec<SyntaxError> { | |||
12 | let mut errors = Vec::new(); | 14 | let mut errors = Vec::new(); |
13 | for node in file.syntax().descendants() { | 15 | for node in file.syntax().descendants() { |
14 | let _ = visitor_ctx(&mut errors) | 16 | let _ = visitor_ctx(&mut errors) |
17 | .visit::<ast::Byte, _>(self::byte::validate_byte_node) | ||
18 | .visit::<ast::ByteString, _>(self::byte_string::validate_byte_string_node) | ||
15 | .visit::<ast::Char, _>(self::char::validate_char_node) | 19 | .visit::<ast::Char, _>(self::char::validate_char_node) |
16 | .visit::<ast::String, _>(self::string::validate_string_node) | 20 | .visit::<ast::String, _>(self::string::validate_string_node) |
17 | .accept(node); | 21 | .accept(node); |
diff --git a/crates/ra_syntax/src/yellow/syntax_error.rs b/crates/ra_syntax/src/yellow/syntax_error.rs index cf7b1d495..c32ee650d 100644 --- a/crates/ra_syntax/src/yellow/syntax_error.rs +++ b/crates/ra_syntax/src/yellow/syntax_error.rs | |||
@@ -72,6 +72,16 @@ pub enum SyntaxErrorKind { | |||
72 | EmptyChar, | 72 | EmptyChar, |
73 | UnclosedChar, | 73 | UnclosedChar, |
74 | OverlongChar, | 74 | OverlongChar, |
75 | EmptyByte, | ||
76 | UnclosedByte, | ||
77 | OverlongByte, | ||
78 | ByteOutOfRange, | ||
79 | UnescapedByte, | ||
80 | EmptyByteEscape, | ||
81 | InvalidByteEscape, | ||
82 | TooShortByteCodeEscape, | ||
83 | MalformedByteCodeEscape, | ||
84 | UnicodeEscapeForbidden, | ||
75 | EmptyAsciiEscape, | 85 | EmptyAsciiEscape, |
76 | InvalidAsciiEscape, | 86 | InvalidAsciiEscape, |
77 | TooShortAsciiCodeEscape, | 87 | TooShortAsciiCodeEscape, |
@@ -98,6 +108,19 @@ impl fmt::Display for SyntaxErrorKind { | |||
98 | EmptyChar => write!(f, "Empty char literal"), | 108 | EmptyChar => write!(f, "Empty char literal"), |
99 | UnclosedChar => write!(f, "Unclosed char literal"), | 109 | UnclosedChar => write!(f, "Unclosed char literal"), |
100 | OverlongChar => write!(f, "Char literal should be one character long"), | 110 | OverlongChar => write!(f, "Char literal should be one character long"), |
111 | EmptyByte => write!(f, "Empty byte literal"), | ||
112 | UnclosedByte => write!(f, "Unclosed byte literal"), | ||
113 | OverlongByte => write!(f, "Byte literal should be one character long"), | ||
114 | ByteOutOfRange => write!(f, "Byte should be a valid ASCII character"), | ||
115 | UnescapedByte => write!(f, "This byte should always be escaped"), | ||
116 | EmptyByteEscape => write!(f, "Empty escape sequence"), | ||
117 | InvalidByteEscape => write!(f, "Invalid escape sequence"), | ||
118 | TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"), | ||
119 | MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"), | ||
120 | UnicodeEscapeForbidden => write!( | ||
121 | f, | ||
122 | "Unicode escapes are not allowed in byte literals or byte strings" | ||
123 | ), | ||
101 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), | 124 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), |
102 | AsciiCodeEscapeOutOfRange => { | 125 | AsciiCodeEscapeOutOfRange => { |
103 | write!(f, "Escape sequence should be between \\x00 and \\x7F") | 126 | write!(f, "Escape sequence should be between \\x00 and \\x7F") |