diff options
author | bors[bot] <bors[bot]@users.noreply.github.com> | 2018-11-10 14:36:45 +0000 |
---|---|---|
committer | bors[bot] <bors[bot]@users.noreply.github.com> | 2018-11-10 14:36:45 +0000 |
commit | 477de790b0211196256a772befe4f577d1a8ba14 (patch) | |
tree | 42c40e9201adf64d1c06bc1c69524f5688ee6e9f /crates/ra_syntax/src | |
parent | 5a9150df9bcdaf5faed5b500c22333f1f7c99f32 (diff) | |
parent | 3b4c02c19e4af645fd37e8bff774b05d546dc0b6 (diff) |
Merge #222
222: Validate string literals r=aochagavia a=aochagavia
Related: #6 (some validators are still missing), fixes #27
Co-authored-by: Adolfo Ochagavía <[email protected]>
Diffstat (limited to 'crates/ra_syntax/src')
-rw-r--r-- | crates/ra_syntax/src/ast/generated.rs | 37 | ||||
-rw-r--r-- | crates/ra_syntax/src/ast/mod.rs | 9 | ||||
-rw-r--r-- | crates/ra_syntax/src/grammar.ron | 1 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing.rs (renamed from crates/ra_syntax/src/string_lexing/mod.rs) | 113 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation.rs | 271 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 270 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/mod.rs | 20 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/string.rs | 168 | ||||
-rw-r--r-- | crates/ra_syntax/src/yellow/syntax_error.rs | 6 |
9 files changed, 621 insertions, 274 deletions
diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs index 5b5f71ee7..2e9ae263a 100644 --- a/crates/ra_syntax/src/ast/generated.rs +++ b/crates/ra_syntax/src/ast/generated.rs | |||
@@ -3236,6 +3236,43 @@ impl<'a> AstNode<'a> for Stmt<'a> { | |||
3236 | 3236 | ||
3237 | impl<'a> Stmt<'a> {} | 3237 | impl<'a> Stmt<'a> {} |
3238 | 3238 | ||
3239 | // String | ||
3240 | #[derive(Debug, Clone, Copy,)] | ||
3241 | pub struct StringNode<R: TreeRoot<RaTypes> = OwnedRoot> { | ||
3242 | pub(crate) syntax: SyntaxNode<R>, | ||
3243 | } | ||
3244 | pub type String<'a> = StringNode<RefRoot<'a>>; | ||
3245 | |||
3246 | impl<R1: TreeRoot<RaTypes>, R2: TreeRoot<RaTypes>> PartialEq<StringNode<R1>> for StringNode<R2> { | ||
3247 | fn eq(&self, other: &StringNode<R1>) -> bool { self.syntax == other.syntax } | ||
3248 | } | ||
3249 | impl<R: TreeRoot<RaTypes>> Eq for StringNode<R> {} | ||
3250 | impl<R: TreeRoot<RaTypes>> Hash for StringNode<R> { | ||
3251 | fn hash<H: Hasher>(&self, state: &mut H) { self.syntax.hash(state) } | ||
3252 | } | ||
3253 | |||
3254 | impl<'a> AstNode<'a> for String<'a> { | ||
3255 | fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> { | ||
3256 | match syntax.kind() { | ||
3257 | STRING => Some(String { syntax }), | ||
3258 | _ => None, | ||
3259 | } | ||
3260 | } | ||
3261 | fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } | ||
3262 | } | ||
3263 | |||
3264 | impl<R: TreeRoot<RaTypes>> StringNode<R> { | ||
3265 | pub fn borrowed(&self) -> String { | ||
3266 | StringNode { syntax: self.syntax.borrowed() } | ||
3267 | } | ||
3268 | pub fn owned(&self) -> StringNode { | ||
3269 | StringNode { syntax: self.syntax.owned() } | ||
3270 | } | ||
3271 | } | ||
3272 | |||
3273 | |||
3274 | impl<'a> String<'a> {} | ||
3275 | |||
3239 | // StructDef | 3276 | // StructDef |
3240 | #[derive(Debug, Clone, Copy,)] | 3277 | #[derive(Debug, Clone, Copy,)] |
3241 | pub struct StructDefNode<R: TreeRoot<RaTypes> = OwnedRoot> { | 3278 | pub struct StructDefNode<R: TreeRoot<RaTypes> = OwnedRoot> { |
diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs index 6b0d62610..f20714ede 100644 --- a/crates/ra_syntax/src/ast/mod.rs +++ b/crates/ra_syntax/src/ast/mod.rs | |||
@@ -1,6 +1,7 @@ | |||
1 | mod generated; | 1 | mod generated; |
2 | 2 | ||
3 | use std::marker::PhantomData; | 3 | use std::marker::PhantomData; |
4 | use std::string::String as RustString; | ||
4 | 5 | ||
5 | use itertools::Itertools; | 6 | use itertools::Itertools; |
6 | 7 | ||
@@ -76,7 +77,7 @@ pub trait DocCommentsOwner<'a>: AstNode<'a> { | |||
76 | 77 | ||
77 | /// Returns the textual content of a doc comment block as a single string. | 78 | /// Returns the textual content of a doc comment block as a single string. |
78 | /// That is, strips leading `///` and joins lines | 79 | /// That is, strips leading `///` and joins lines |
79 | fn doc_comment_text(self) -> String { | 80 | fn doc_comment_text(self) -> RustString { |
80 | self.doc_comments() | 81 | self.doc_comments() |
81 | .map(|comment| { | 82 | .map(|comment| { |
82 | let prefix = comment.prefix(); | 83 | let prefix = comment.prefix(); |
@@ -133,6 +134,12 @@ impl<'a> Char<'a> { | |||
133 | } | 134 | } |
134 | } | 135 | } |
135 | 136 | ||
137 | impl<'a> String<'a> { | ||
138 | pub fn text(&self) -> &SmolStr { | ||
139 | &self.syntax().leaf_text().unwrap() | ||
140 | } | ||
141 | } | ||
142 | |||
136 | impl<'a> Comment<'a> { | 143 | impl<'a> Comment<'a> { |
137 | pub fn text(&self) -> &SmolStr { | 144 | pub fn text(&self) -> &SmolStr { |
138 | self.syntax().leaf_text().unwrap() | 145 | self.syntax().leaf_text().unwrap() |
diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron index a92844415..c3184667e 100644 --- a/crates/ra_syntax/src/grammar.ron +++ b/crates/ra_syntax/src/grammar.ron | |||
@@ -411,6 +411,7 @@ Grammar( | |||
411 | "PrefixExpr": (), | 411 | "PrefixExpr": (), |
412 | "RangeExpr": (), | 412 | "RangeExpr": (), |
413 | "BinExpr": (), | 413 | "BinExpr": (), |
414 | "String": (), | ||
414 | "Char": (), | 415 | "Char": (), |
415 | "Literal": (), | 416 | "Literal": (), |
416 | 417 | ||
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing.rs index cc53e0aba..d613bb042 100644 --- a/crates/ra_syntax/src/string_lexing/mod.rs +++ b/crates/ra_syntax/src/string_lexing.rs | |||
@@ -1,6 +1,68 @@ | |||
1 | use self::CharComponentKind::*; | 1 | use self::CharComponentKind::*; |
2 | use rowan::{TextRange, TextUnit}; | 2 | use rowan::{TextRange, TextUnit}; |
3 | 3 | ||
4 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { | ||
5 | StringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
12 | pub struct StringComponent { | ||
13 | pub range: TextRange, | ||
14 | pub kind: StringComponentKind, | ||
15 | } | ||
16 | |||
17 | impl StringComponent { | ||
18 | fn new(range: TextRange, kind: StringComponentKind) -> StringComponent { | ||
19 | StringComponent { range, kind } | ||
20 | } | ||
21 | } | ||
22 | |||
23 | #[derive(Debug, Eq, PartialEq, Clone)] | ||
24 | pub enum StringComponentKind { | ||
25 | IgnoreNewline, | ||
26 | Char(CharComponentKind), | ||
27 | } | ||
28 | |||
29 | pub struct StringComponentIterator<'a> { | ||
30 | parser: Parser<'a>, | ||
31 | pub has_closing_quote: bool, | ||
32 | } | ||
33 | |||
34 | impl<'a> Iterator for StringComponentIterator<'a> { | ||
35 | type Item = StringComponent; | ||
36 | fn next(&mut self) -> Option<StringComponent> { | ||
37 | if self.parser.pos == 0 { | ||
38 | assert!( | ||
39 | self.parser.advance() == '"', | ||
40 | "string literal should start with double quotes" | ||
41 | ); | ||
42 | } | ||
43 | |||
44 | if let Some(component) = self.parser.parse_string_component() { | ||
45 | return Some(component); | ||
46 | } | ||
47 | |||
48 | // We get here when there are no char components left to parse | ||
49 | if self.parser.peek() == Some('"') { | ||
50 | self.parser.advance(); | ||
51 | self.has_closing_quote = true; | ||
52 | } | ||
53 | |||
54 | assert!( | ||
55 | self.parser.peek() == None, | ||
56 | "string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
57 | self.parser.src, | ||
58 | self.parser.pos, | ||
59 | self.parser.src.len() | ||
60 | ); | ||
61 | |||
62 | None | ||
63 | } | ||
64 | } | ||
65 | |||
4 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { | 66 | pub fn parse_char_literal(src: &str) -> CharComponentIterator { |
5 | CharComponentIterator { | 67 | CharComponentIterator { |
6 | parser: Parser::new(src), | 68 | parser: Parser::new(src), |
@@ -93,6 +155,12 @@ impl<'a> Parser<'a> { | |||
93 | next | 155 | next |
94 | } | 156 | } |
95 | 157 | ||
158 | pub fn skip_whitespace(&mut self) { | ||
159 | while self.peek().map(|c| c.is_whitespace()) == Some(true) { | ||
160 | self.advance(); | ||
161 | } | ||
162 | } | ||
163 | |||
96 | pub fn get_pos(&self) -> TextUnit { | 164 | pub fn get_pos(&self) -> TextUnit { |
97 | (self.pos as u32).into() | 165 | (self.pos as u32).into() |
98 | } | 166 | } |
@@ -172,6 +240,51 @@ impl<'a> Parser<'a> { | |||
172 | )) | 240 | )) |
173 | } | 241 | } |
174 | } | 242 | } |
243 | |||
244 | pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { | ||
245 | // In string literals, when a `\` occurs immediately before the newline, the `\`, | ||
246 | // the newline, and all whitespace at the beginning of the next line are ignored | ||
247 | match self.peek() { | ||
248 | Some('\n') | Some('\r') => { | ||
249 | self.skip_whitespace(); | ||
250 | Some(StringComponent::new( | ||
251 | TextRange::from_to(start, self.get_pos()), | ||
252 | StringComponentKind::IgnoreNewline, | ||
253 | )) | ||
254 | } | ||
255 | _ => None, | ||
256 | } | ||
257 | } | ||
258 | |||
259 | pub fn parse_string_component(&mut self) -> Option<StringComponent> { | ||
260 | let next = self.peek()?; | ||
261 | |||
262 | // Ignore string close | ||
263 | if next == '"' { | ||
264 | return None; | ||
265 | } | ||
266 | |||
267 | let start = self.get_pos(); | ||
268 | self.advance(); | ||
269 | |||
270 | if next == '\\' { | ||
271 | // Strings can use `\` to ignore newlines, so we first try to parse one of those | ||
272 | // before falling back to parsing char escapes | ||
273 | self.parse_ignore_newline(start).or_else(|| { | ||
274 | let char_component = self.parse_escape(start); | ||
275 | Some(StringComponent::new( | ||
276 | char_component.range, | ||
277 | StringComponentKind::Char(char_component.kind), | ||
278 | )) | ||
279 | }) | ||
280 | } else { | ||
281 | let end = self.get_pos(); | ||
282 | Some(StringComponent::new( | ||
283 | TextRange::from_to(start, end), | ||
284 | StringComponentKind::Char(CodePoint), | ||
285 | )) | ||
286 | } | ||
287 | } | ||
175 | } | 288 | } |
176 | 289 | ||
177 | #[cfg(test)] | 290 | #[cfg(test)] |
diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs deleted file mode 100644 index a10b297c0..000000000 --- a/crates/ra_syntax/src/validation.rs +++ /dev/null | |||
@@ -1,271 +0,0 @@ | |||
1 | use std::u32; | ||
2 | |||
3 | use arrayvec::ArrayString; | ||
4 | |||
5 | use crate::{ | ||
6 | algo::visit::{visitor_ctx, VisitorCtx}, | ||
7 | ast::{self, AstNode}, | ||
8 | SourceFileNode, | ||
9 | string_lexing::{self, CharComponentKind}, | ||
10 | yellow::{ | ||
11 | SyntaxError, | ||
12 | SyntaxErrorKind::*, | ||
13 | }, | ||
14 | }; | ||
15 | |||
16 | pub(crate) fn validate(file: &SourceFileNode) -> Vec<SyntaxError> { | ||
17 | let mut errors = Vec::new(); | ||
18 | for node in file.syntax().descendants() { | ||
19 | let _ = visitor_ctx(&mut errors) | ||
20 | .visit::<ast::Char, _>(validate_char) | ||
21 | .accept(node); | ||
22 | } | ||
23 | errors | ||
24 | } | ||
25 | |||
26 | fn validate_char(node: ast::Char, errors: &mut Vec<SyntaxError>) { | ||
27 | let mut components = string_lexing::parse_char_literal(node.text()); | ||
28 | let mut len = 0; | ||
29 | for component in &mut components { | ||
30 | len += 1; | ||
31 | |||
32 | // Validate escapes | ||
33 | let text = &node.text()[component.range]; | ||
34 | let range = component.range + node.syntax().range().start(); | ||
35 | use self::CharComponentKind::*; | ||
36 | match component.kind { | ||
37 | AsciiEscape => { | ||
38 | if text.len() == 1 { | ||
39 | // Escape sequence consists only of leading `\` | ||
40 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); | ||
41 | } else { | ||
42 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
43 | if !is_ascii_escape(escape_code) { | ||
44 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); | ||
45 | } | ||
46 | } | ||
47 | } | ||
48 | AsciiCodeEscape => { | ||
49 | // An AsciiCodeEscape has 4 chars, example: `\xDD` | ||
50 | if text.len() < 4 { | ||
51 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); | ||
52 | } else { | ||
53 | assert!( | ||
54 | text.chars().count() == 4, | ||
55 | "AsciiCodeEscape cannot be longer than 4 chars" | ||
56 | ); | ||
57 | |||
58 | match u8::from_str_radix(&text[2..], 16) { | ||
59 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
60 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
61 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | UnicodeEscape => { | ||
66 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); | ||
67 | |||
68 | if text.len() == 2 { | ||
69 | // No starting `{` | ||
70 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
71 | return; | ||
72 | } | ||
73 | |||
74 | if text.len() == 3 { | ||
75 | // Only starting `{` | ||
76 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
77 | return; | ||
78 | } | ||
79 | |||
80 | let mut code = ArrayString::<[_; 6]>::new(); | ||
81 | let mut closed = false; | ||
82 | for c in text[3..].chars() { | ||
83 | assert!(!closed, "no characters after escape is closed"); | ||
84 | |||
85 | if c.is_digit(16) { | ||
86 | if code.len() == 6 { | ||
87 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
88 | return; | ||
89 | } | ||
90 | |||
91 | code.push(c); | ||
92 | } else if c == '_' { | ||
93 | // Reject leading _ | ||
94 | if code.len() == 0 { | ||
95 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
96 | return; | ||
97 | } | ||
98 | } else if c == '}' { | ||
99 | closed = true; | ||
100 | } else { | ||
101 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
102 | return; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | if !closed { | ||
107 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | ||
108 | } | ||
109 | |||
110 | if code.len() == 0 { | ||
111 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | match u32::from_str_radix(&code, 16) { | ||
116 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
117 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
118 | } | ||
119 | Ok(_) => { | ||
120 | // Valid escape code | ||
121 | } | ||
122 | Err(_) => { | ||
123 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
124 | } | ||
125 | } | ||
126 | } | ||
127 | CodePoint => { | ||
128 | // These code points must always be escaped | ||
129 | if text == "\t" || text == "\r" { | ||
130 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
131 | } | ||
132 | } | ||
133 | } | ||
134 | } | ||
135 | |||
136 | if !components.has_closing_quote { | ||
137 | errors.push(SyntaxError::new(UnclosedChar, node.syntax().range())); | ||
138 | } | ||
139 | |||
140 | if len == 0 { | ||
141 | errors.push(SyntaxError::new(EmptyChar, node.syntax().range())); | ||
142 | } | ||
143 | |||
144 | if len > 1 { | ||
145 | errors.push(SyntaxError::new(LongChar, node.syntax().range())); | ||
146 | } | ||
147 | } | ||
148 | |||
149 | fn is_ascii_escape(code: char) -> bool { | ||
150 | match code { | ||
151 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | ||
152 | _ => false, | ||
153 | } | ||
154 | } | ||
155 | |||
156 | #[cfg(test)] | ||
157 | mod test { | ||
158 | use crate::SourceFileNode; | ||
159 | |||
160 | fn build_file(literal: &str) -> SourceFileNode { | ||
161 | let src = format!("const C: char = '{}';", literal); | ||
162 | SourceFileNode::parse(&src) | ||
163 | } | ||
164 | |||
165 | fn assert_valid_char(literal: &str) { | ||
166 | let file = build_file(literal); | ||
167 | assert!( | ||
168 | file.errors().len() == 0, | ||
169 | "Errors for literal '{}': {:?}", | ||
170 | literal, | ||
171 | file.errors() | ||
172 | ); | ||
173 | } | ||
174 | |||
175 | fn assert_invalid_char(literal: &str) { | ||
176 | let file = build_file(literal); | ||
177 | assert!(file.errors().len() > 0); | ||
178 | } | ||
179 | |||
180 | #[test] | ||
181 | fn test_ansi_codepoints() { | ||
182 | for byte in 0..=255u8 { | ||
183 | match byte { | ||
184 | b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()), | ||
185 | b'\'' | b'\\' => { /* Ignore character close and backslash */ } | ||
186 | _ => assert_valid_char(&(byte as char).to_string()), | ||
187 | } | ||
188 | } | ||
189 | } | ||
190 | |||
191 | #[test] | ||
192 | fn test_unicode_codepoints() { | ||
193 | let valid = ["Ƒ", "バ", "メ", "﷽"]; | ||
194 | for c in &valid { | ||
195 | assert_valid_char(c); | ||
196 | } | ||
197 | } | ||
198 | |||
199 | #[test] | ||
200 | fn test_unicode_multiple_codepoints() { | ||
201 | let invalid = ["नी", "👨👨"]; | ||
202 | for c in &invalid { | ||
203 | assert_invalid_char(c); | ||
204 | } | ||
205 | } | ||
206 | |||
207 | #[test] | ||
208 | fn test_valid_ascii_escape() { | ||
209 | let valid = [ | ||
210 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", | ||
211 | ]; | ||
212 | for c in &valid { | ||
213 | assert_valid_char(c); | ||
214 | } | ||
215 | } | ||
216 | |||
217 | #[test] | ||
218 | fn test_invalid_ascii_escape() { | ||
219 | let invalid = [r"\a", r"\?", r"\"]; | ||
220 | for c in &invalid { | ||
221 | assert_invalid_char(c); | ||
222 | } | ||
223 | } | ||
224 | |||
225 | #[test] | ||
226 | fn test_valid_ascii_code_escape() { | ||
227 | let valid = [r"\x00", r"\x7F", r"\x55"]; | ||
228 | for c in &valid { | ||
229 | assert_valid_char(c); | ||
230 | } | ||
231 | } | ||
232 | |||
233 | #[test] | ||
234 | fn test_invalid_ascii_code_escape() { | ||
235 | let invalid = [r"\x", r"\x7", r"\xF0"]; | ||
236 | for c in &invalid { | ||
237 | assert_invalid_char(c); | ||
238 | } | ||
239 | } | ||
240 | |||
241 | #[test] | ||
242 | fn test_valid_unicode_escape() { | ||
243 | let valid = [ | ||
244 | r"\u{FF}", | ||
245 | r"\u{0}", | ||
246 | r"\u{F}", | ||
247 | r"\u{10FFFF}", | ||
248 | r"\u{1_0__FF___FF_____}", | ||
249 | ]; | ||
250 | for c in &valid { | ||
251 | assert_valid_char(c); | ||
252 | } | ||
253 | } | ||
254 | |||
255 | #[test] | ||
256 | fn test_invalid_unicode_escape() { | ||
257 | let invalid = [ | ||
258 | r"\u", | ||
259 | r"\u{}", | ||
260 | r"\u{", | ||
261 | r"\u{FF", | ||
262 | r"\u{FFFFFF}", | ||
263 | r"\u{_F}", | ||
264 | r"\u{00FFFFF}", | ||
265 | r"\u{110000}", | ||
266 | ]; | ||
267 | for c in &invalid { | ||
268 | assert_invalid_char(c); | ||
269 | } | ||
270 | } | ||
271 | } | ||
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs new file mode 100644 index 000000000..63f9bad24 --- /dev/null +++ b/crates/ra_syntax/src/validation/char.rs | |||
@@ -0,0 +1,270 @@ | |||
1 | use std::u32; | ||
2 | |||
3 | use arrayvec::ArrayString; | ||
4 | |||
5 | use crate::{ | ||
6 | ast::{self, AstNode}, | ||
7 | string_lexing::{self, CharComponentKind}, | ||
8 | TextRange, | ||
9 | yellow::{ | ||
10 | SyntaxError, | ||
11 | SyntaxErrorKind::*, | ||
12 | }, | ||
13 | }; | ||
14 | |||
15 | pub(crate) fn validate_char_node(node: ast::Char, errors: &mut Vec<SyntaxError>) { | ||
16 | let literal_text = node.text(); | ||
17 | let literal_range = node.syntax().range(); | ||
18 | let mut components = string_lexing::parse_char_literal(literal_text); | ||
19 | let mut len = 0; | ||
20 | for component in &mut components { | ||
21 | len += 1; | ||
22 | let text = &literal_text[component.range]; | ||
23 | let range = component.range + literal_range.start(); | ||
24 | validate_char_component(text, component.kind, range, errors); | ||
25 | } | ||
26 | |||
27 | if !components.has_closing_quote { | ||
28 | errors.push(SyntaxError::new(UnclosedChar, literal_range)); | ||
29 | } | ||
30 | |||
31 | if len == 0 { | ||
32 | errors.push(SyntaxError::new(EmptyChar, literal_range)); | ||
33 | } | ||
34 | |||
35 | if len > 1 { | ||
36 | errors.push(SyntaxError::new(OverlongChar, literal_range)); | ||
37 | } | ||
38 | } | ||
39 | |||
40 | pub(crate) fn validate_char_component( | ||
41 | text: &str, | ||
42 | kind: CharComponentKind, | ||
43 | range: TextRange, | ||
44 | errors: &mut Vec<SyntaxError>, | ||
45 | ) { | ||
46 | // Validate escapes | ||
47 | use self::CharComponentKind::*; | ||
48 | match kind { | ||
49 | AsciiEscape => { | ||
50 | if text.len() == 1 { | ||
51 | // Escape sequence consists only of leading `\` | ||
52 | errors.push(SyntaxError::new(EmptyAsciiEscape, range)); | ||
53 | } else { | ||
54 | let escape_code = text.chars().skip(1).next().unwrap(); | ||
55 | if !is_ascii_escape(escape_code) { | ||
56 | errors.push(SyntaxError::new(InvalidAsciiEscape, range)); | ||
57 | } | ||
58 | } | ||
59 | } | ||
60 | AsciiCodeEscape => { | ||
61 | // An AsciiCodeEscape has 4 chars, example: `\xDD` | ||
62 | if text.len() < 4 { | ||
63 | errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range)); | ||
64 | } else { | ||
65 | assert!( | ||
66 | text.chars().count() == 4, | ||
67 | "AsciiCodeEscape cannot be longer than 4 chars" | ||
68 | ); | ||
69 | |||
70 | match u8::from_str_radix(&text[2..], 16) { | ||
71 | Ok(code) if code < 128 => { /* Escape code is valid */ } | ||
72 | Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)), | ||
73 | Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)), | ||
74 | } | ||
75 | } | ||
76 | } | ||
77 | UnicodeEscape => { | ||
78 | assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u"); | ||
79 | |||
80 | if text.len() == 2 { | ||
81 | // No starting `{` | ||
82 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
83 | return; | ||
84 | } | ||
85 | |||
86 | if text.len() == 3 { | ||
87 | // Only starting `{` | ||
88 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)); | ||
89 | return; | ||
90 | } | ||
91 | |||
92 | let mut code = ArrayString::<[_; 6]>::new(); | ||
93 | let mut closed = false; | ||
94 | for c in text[3..].chars() { | ||
95 | assert!(!closed, "no characters after escape is closed"); | ||
96 | |||
97 | if c.is_digit(16) { | ||
98 | if code.len() == 6 { | ||
99 | errors.push(SyntaxError::new(OverlongUnicodeEscape, range)); | ||
100 | return; | ||
101 | } | ||
102 | |||
103 | code.push(c); | ||
104 | } else if c == '_' { | ||
105 | // Reject leading _ | ||
106 | if code.len() == 0 { | ||
107 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
108 | return; | ||
109 | } | ||
110 | } else if c == '}' { | ||
111 | closed = true; | ||
112 | } else { | ||
113 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
114 | return; | ||
115 | } | ||
116 | } | ||
117 | |||
118 | if !closed { | ||
119 | errors.push(SyntaxError::new(UnclosedUnicodeEscape, range)) | ||
120 | } | ||
121 | |||
122 | if code.len() == 0 { | ||
123 | errors.push(SyntaxError::new(EmptyUnicodeEcape, range)); | ||
124 | return; | ||
125 | } | ||
126 | |||
127 | match u32::from_str_radix(&code, 16) { | ||
128 | Ok(code_u32) if code_u32 > 0x10FFFF => { | ||
129 | errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range)); | ||
130 | } | ||
131 | Ok(_) => { | ||
132 | // Valid escape code | ||
133 | } | ||
134 | Err(_) => { | ||
135 | errors.push(SyntaxError::new(MalformedUnicodeEscape, range)); | ||
136 | } | ||
137 | } | ||
138 | } | ||
139 | CodePoint => { | ||
140 | // These code points must always be escaped | ||
141 | if text == "\t" || text == "\r" { | ||
142 | errors.push(SyntaxError::new(UnescapedCodepoint, range)); | ||
143 | } | ||
144 | } | ||
145 | } | ||
146 | } | ||
147 | |||
148 | fn is_ascii_escape(code: char) -> bool { | ||
149 | match code { | ||
150 | '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true, | ||
151 | _ => false, | ||
152 | } | ||
153 | } | ||
154 | |||
155 | #[cfg(test)] | ||
156 | mod test { | ||
157 | use crate::SourceFileNode; | ||
158 | |||
159 | fn build_file(literal: &str) -> SourceFileNode { | ||
160 | let src = format!("const C: char = '{}';", literal); | ||
161 | SourceFileNode::parse(&src) | ||
162 | } | ||
163 | |||
164 | fn assert_valid_char(literal: &str) { | ||
165 | let file = build_file(literal); | ||
166 | assert!( | ||
167 | file.errors().len() == 0, | ||
168 | "Errors for literal '{}': {:?}", | ||
169 | literal, | ||
170 | file.errors() | ||
171 | ); | ||
172 | } | ||
173 | |||
174 | fn assert_invalid_char(literal: &str) { | ||
175 | let file = build_file(literal); | ||
176 | assert!(file.errors().len() > 0); | ||
177 | } | ||
178 | |||
179 | #[test] | ||
180 | fn test_ansi_codepoints() { | ||
181 | for byte in 0..=255u8 { | ||
182 | match byte { | ||
183 | b'\n' | b'\r' | b'\t' => assert_invalid_char(&(byte as char).to_string()), | ||
184 | b'\'' | b'\\' => { /* Ignore character close and backslash */ } | ||
185 | _ => assert_valid_char(&(byte as char).to_string()), | ||
186 | } | ||
187 | } | ||
188 | } | ||
189 | |||
190 | #[test] | ||
191 | fn test_unicode_codepoints() { | ||
192 | let valid = ["Ƒ", "バ", "メ", "﷽"]; | ||
193 | for c in &valid { | ||
194 | assert_valid_char(c); | ||
195 | } | ||
196 | } | ||
197 | |||
198 | #[test] | ||
199 | fn test_unicode_multiple_codepoints() { | ||
200 | let invalid = ["नी", "👨👨"]; | ||
201 | for c in &invalid { | ||
202 | assert_invalid_char(c); | ||
203 | } | ||
204 | } | ||
205 | |||
206 | #[test] | ||
207 | fn test_valid_ascii_escape() { | ||
208 | let valid = [ | ||
209 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", | ||
210 | ]; | ||
211 | for c in &valid { | ||
212 | assert_valid_char(c); | ||
213 | } | ||
214 | } | ||
215 | |||
216 | #[test] | ||
217 | fn test_invalid_ascii_escape() { | ||
218 | let invalid = [r"\a", r"\?", r"\"]; | ||
219 | for c in &invalid { | ||
220 | assert_invalid_char(c); | ||
221 | } | ||
222 | } | ||
223 | |||
224 | #[test] | ||
225 | fn test_valid_ascii_code_escape() { | ||
226 | let valid = [r"\x00", r"\x7F", r"\x55"]; | ||
227 | for c in &valid { | ||
228 | assert_valid_char(c); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | #[test] | ||
233 | fn test_invalid_ascii_code_escape() { | ||
234 | let invalid = [r"\x", r"\x7", r"\xF0"]; | ||
235 | for c in &invalid { | ||
236 | assert_invalid_char(c); | ||
237 | } | ||
238 | } | ||
239 | |||
240 | #[test] | ||
241 | fn test_valid_unicode_escape() { | ||
242 | let valid = [ | ||
243 | r"\u{FF}", | ||
244 | r"\u{0}", | ||
245 | r"\u{F}", | ||
246 | r"\u{10FFFF}", | ||
247 | r"\u{1_0__FF___FF_____}", | ||
248 | ]; | ||
249 | for c in &valid { | ||
250 | assert_valid_char(c); | ||
251 | } | ||
252 | } | ||
253 | |||
254 | #[test] | ||
255 | fn test_invalid_unicode_escape() { | ||
256 | let invalid = [ | ||
257 | r"\u", | ||
258 | r"\u{}", | ||
259 | r"\u{", | ||
260 | r"\u{FF", | ||
261 | r"\u{FFFFFF}", | ||
262 | r"\u{_F}", | ||
263 | r"\u{00FFFFF}", | ||
264 | r"\u{110000}", | ||
265 | ]; | ||
266 | for c in &invalid { | ||
267 | assert_invalid_char(c); | ||
268 | } | ||
269 | } | ||
270 | } | ||
diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs new file mode 100644 index 000000000..2ff0bc26d --- /dev/null +++ b/crates/ra_syntax/src/validation/mod.rs | |||
@@ -0,0 +1,20 @@ | |||
1 | use crate::{ | ||
2 | algo::visit::{visitor_ctx, VisitorCtx}, | ||
3 | ast, | ||
4 | SourceFileNode, | ||
5 | yellow::SyntaxError, | ||
6 | }; | ||
7 | |||
8 | mod char; | ||
9 | mod string; | ||
10 | |||
11 | pub(crate) fn validate(file: &SourceFileNode) -> Vec<SyntaxError> { | ||
12 | let mut errors = Vec::new(); | ||
13 | for node in file.syntax().descendants() { | ||
14 | let _ = visitor_ctx(&mut errors) | ||
15 | .visit::<ast::Char, _>(self::char::validate_char_node) | ||
16 | .visit::<ast::String, _>(self::string::validate_string_node) | ||
17 | .accept(node); | ||
18 | } | ||
19 | errors | ||
20 | } | ||
diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs new file mode 100644 index 000000000..089879d15 --- /dev/null +++ b/crates/ra_syntax/src/validation/string.rs | |||
@@ -0,0 +1,168 @@ | |||
1 | use crate::{ | ||
2 | ast::{self, AstNode}, | ||
3 | string_lexing::{self, StringComponentKind}, | ||
4 | yellow::{ | ||
5 | SyntaxError, | ||
6 | SyntaxErrorKind::*, | ||
7 | }, | ||
8 | }; | ||
9 | |||
10 | use super::char; | ||
11 | |||
12 | pub(crate) fn validate_string_node(node: ast::String, errors: &mut Vec<SyntaxError>) { | ||
13 | let literal_text = node.text(); | ||
14 | let literal_range = node.syntax().range(); | ||
15 | let mut components = string_lexing::parse_string_literal(literal_text); | ||
16 | for component in &mut components { | ||
17 | let range = component.range + literal_range.start(); | ||
18 | |||
19 | match component.kind { | ||
20 | StringComponentKind::Char(kind) => { | ||
21 | // Chars must escape \t, \n and \r codepoints, but strings don't | ||
22 | let text = &literal_text[component.range]; | ||
23 | match text { | ||
24 | "\t" | "\n" | "\r" => { /* always valid */ } | ||
25 | _ => char::validate_char_component(text, kind, range, errors), | ||
26 | } | ||
27 | } | ||
28 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
29 | } | ||
30 | } | ||
31 | |||
32 | if !components.has_closing_quote { | ||
33 | errors.push(SyntaxError::new(UnclosedString, literal_range)); | ||
34 | } | ||
35 | } | ||
36 | |||
37 | #[cfg(test)] | ||
38 | mod test { | ||
39 | use crate::SourceFileNode; | ||
40 | |||
41 | fn build_file(literal: &str) -> SourceFileNode { | ||
42 | let src = format!(r#"const S: &'static str = "{}";"#, literal); | ||
43 | println!("Source: {}", src); | ||
44 | SourceFileNode::parse(&src) | ||
45 | } | ||
46 | |||
47 | fn assert_valid_str(literal: &str) { | ||
48 | let file = build_file(literal); | ||
49 | assert!( | ||
50 | file.errors().len() == 0, | ||
51 | "Errors for literal '{}': {:?}", | ||
52 | literal, | ||
53 | file.errors() | ||
54 | ); | ||
55 | } | ||
56 | |||
57 | fn assert_invalid_str(literal: &str) { | ||
58 | let file = build_file(literal); | ||
59 | assert!(file.errors().len() > 0); | ||
60 | } | ||
61 | |||
62 | #[test] | ||
63 | fn test_ansi_codepoints() { | ||
64 | for byte in 0..=255u8 { | ||
65 | match byte { | ||
66 | b'\"' | b'\\' => { /* Ignore string close and backslash */ } | ||
67 | _ => assert_valid_str(&(byte as char).to_string()), | ||
68 | } | ||
69 | } | ||
70 | } | ||
71 | |||
72 | #[test] | ||
73 | fn test_unicode_codepoints() { | ||
74 | let valid = ["Ƒ", "バ", "メ", "﷽"]; | ||
75 | for c in &valid { | ||
76 | assert_valid_str(c); | ||
77 | } | ||
78 | } | ||
79 | |||
80 | #[test] | ||
81 | fn test_unicode_multiple_codepoints() { | ||
82 | let valid = ["नी", "👨👨"]; | ||
83 | for c in &valid { | ||
84 | assert_valid_str(c); | ||
85 | } | ||
86 | } | ||
87 | |||
88 | #[test] | ||
89 | fn test_valid_ascii_escape() { | ||
90 | let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; | ||
91 | for c in &valid { | ||
92 | assert_valid_str(c); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | #[test] | ||
97 | fn test_invalid_ascii_escape() { | ||
98 | let invalid = [r"\a", r"\?", r"\"]; | ||
99 | for c in &invalid { | ||
100 | assert_invalid_str(c); | ||
101 | } | ||
102 | } | ||
103 | |||
104 | #[test] | ||
105 | fn test_valid_ascii_code_escape() { | ||
106 | let valid = [r"\x00", r"\x7F", r"\x55"]; | ||
107 | for c in &valid { | ||
108 | assert_valid_str(c); | ||
109 | } | ||
110 | } | ||
111 | |||
112 | #[test] | ||
113 | fn test_invalid_ascii_code_escape() { | ||
114 | let invalid = [r"\x", r"\x7", r"\xF0"]; | ||
115 | for c in &invalid { | ||
116 | assert_invalid_str(c); | ||
117 | } | ||
118 | } | ||
119 | |||
120 | #[test] | ||
121 | fn test_valid_unicode_escape() { | ||
122 | let valid = [ | ||
123 | r"\u{FF}", | ||
124 | r"\u{0}", | ||
125 | r"\u{F}", | ||
126 | r"\u{10FFFF}", | ||
127 | r"\u{1_0__FF___FF_____}", | ||
128 | ]; | ||
129 | for c in &valid { | ||
130 | assert_valid_str(c); | ||
131 | } | ||
132 | } | ||
133 | |||
134 | #[test] | ||
135 | fn test_invalid_unicode_escape() { | ||
136 | let invalid = [ | ||
137 | r"\u", | ||
138 | r"\u{}", | ||
139 | r"\u{", | ||
140 | r"\u{FF", | ||
141 | r"\u{FFFFFF}", | ||
142 | r"\u{_F}", | ||
143 | r"\u{00FFFFF}", | ||
144 | r"\u{110000}", | ||
145 | ]; | ||
146 | for c in &invalid { | ||
147 | assert_invalid_str(c); | ||
148 | } | ||
149 | } | ||
150 | |||
151 | #[test] | ||
152 | fn test_mixed() { | ||
153 | assert_valid_str( | ||
154 | r"This is the tale of a string | ||
155 | with a newline in between, some emoji (👨👨) here and there, | ||
156 | unicode escapes like this: \u{1FFBB} and weird stuff like | ||
157 | this ﷽", | ||
158 | ); | ||
159 | } | ||
160 | |||
161 | #[test] | ||
162 | fn test_ignore_newline() { | ||
163 | assert_valid_str( | ||
164 | "Hello \ | ||
165 | World", | ||
166 | ); | ||
167 | } | ||
168 | } | ||
diff --git a/crates/ra_syntax/src/yellow/syntax_error.rs b/crates/ra_syntax/src/yellow/syntax_error.rs index c524adf39..cf7b1d495 100644 --- a/crates/ra_syntax/src/yellow/syntax_error.rs +++ b/crates/ra_syntax/src/yellow/syntax_error.rs | |||
@@ -71,7 +71,7 @@ pub enum SyntaxErrorKind { | |||
71 | UnescapedCodepoint, | 71 | UnescapedCodepoint, |
72 | EmptyChar, | 72 | EmptyChar, |
73 | UnclosedChar, | 73 | UnclosedChar, |
74 | LongChar, | 74 | OverlongChar, |
75 | EmptyAsciiEscape, | 75 | EmptyAsciiEscape, |
76 | InvalidAsciiEscape, | 76 | InvalidAsciiEscape, |
77 | TooShortAsciiCodeEscape, | 77 | TooShortAsciiCodeEscape, |
@@ -82,6 +82,7 @@ pub enum SyntaxErrorKind { | |||
82 | EmptyUnicodeEcape, | 82 | EmptyUnicodeEcape, |
83 | OverlongUnicodeEscape, | 83 | OverlongUnicodeEscape, |
84 | UnicodeEscapeOutOfRange, | 84 | UnicodeEscapeOutOfRange, |
85 | UnclosedString, | ||
85 | } | 86 | } |
86 | 87 | ||
87 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] | 88 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] |
@@ -96,7 +97,7 @@ impl fmt::Display for SyntaxErrorKind { | |||
96 | InvalidAsciiEscape => write!(f, "Invalid escape sequence"), | 97 | InvalidAsciiEscape => write!(f, "Invalid escape sequence"), |
97 | EmptyChar => write!(f, "Empty char literal"), | 98 | EmptyChar => write!(f, "Empty char literal"), |
98 | UnclosedChar => write!(f, "Unclosed char literal"), | 99 | UnclosedChar => write!(f, "Unclosed char literal"), |
99 | LongChar => write!(f, "Char literal should be one character long"), | 100 | OverlongChar => write!(f, "Char literal should be one character long"), |
100 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), | 101 | TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"), |
101 | AsciiCodeEscapeOutOfRange => { | 102 | AsciiCodeEscapeOutOfRange => { |
102 | write!(f, "Escape sequence should be between \\x00 and \\x7F") | 103 | write!(f, "Escape sequence should be between \\x00 and \\x7F") |
@@ -109,6 +110,7 @@ impl fmt::Display for SyntaxErrorKind { | |||
109 | write!(f, "Unicode escape sequence should have at most 6 digits") | 110 | write!(f, "Unicode escape sequence should have at most 6 digits") |
110 | } | 111 | } |
111 | UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"), | 112 | UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"), |
113 | UnclosedString => write!(f, "Unclosed string literal"), | ||
112 | ParseError(msg) => write!(f, "{}", msg.0), | 114 | ParseError(msg) => write!(f, "{}", msg.0), |
113 | } | 115 | } |
114 | } | 116 | } |