diff options
author | Adolfo Ochagavía <[email protected]> | 2018-11-11 19:41:43 +0000 |
---|---|---|
committer | Adolfo Ochagavía <[email protected]> | 2018-11-11 19:41:43 +0000 |
commit | 30cd4d5acb7dfd40cea264a926d1c89f0c3522c3 (patch) | |
tree | d71024919de98d74bf1a6bedc8a755c8810f1723 /crates/ra_syntax | |
parent | c258b4fdb0e421813330c2428985c4537c787582 (diff) |
Validate byte string literals
Diffstat (limited to 'crates/ra_syntax')
-rw-r--r-- | crates/ra_syntax/src/ast/generated.rs | 37 | ||||
-rw-r--r-- | crates/ra_syntax/src/ast/mod.rs | 6 | ||||
-rw-r--r-- | crates/ra_syntax/src/grammar.ron | 1 | ||||
-rw-r--r-- | crates/ra_syntax/src/string_lexing.rs | 53 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte.rs | 50 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/byte_string.rs | 178 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/char.rs | 2 | ||||
-rw-r--r-- | crates/ra_syntax/src/validation/mod.rs | 2 |
8 files changed, 305 insertions, 24 deletions
diff --git a/crates/ra_syntax/src/ast/generated.rs b/crates/ra_syntax/src/ast/generated.rs index 75236153d..bf056131e 100644 --- a/crates/ra_syntax/src/ast/generated.rs +++ b/crates/ra_syntax/src/ast/generated.rs | |||
@@ -409,6 +409,43 @@ impl<R: TreeRoot<RaTypes>> ByteNode<R> { | |||
409 | 409 | ||
410 | impl<'a> Byte<'a> {} | 410 | impl<'a> Byte<'a> {} |
411 | 411 | ||
412 | // ByteString | ||
413 | #[derive(Debug, Clone, Copy,)] | ||
414 | pub struct ByteStringNode<R: TreeRoot<RaTypes> = OwnedRoot> { | ||
415 | pub(crate) syntax: SyntaxNode<R>, | ||
416 | } | ||
417 | pub type ByteString<'a> = ByteStringNode<RefRoot<'a>>; | ||
418 | |||
419 | impl<R1: TreeRoot<RaTypes>, R2: TreeRoot<RaTypes>> PartialEq<ByteStringNode<R1>> for ByteStringNode<R2> { | ||
420 | fn eq(&self, other: &ByteStringNode<R1>) -> bool { self.syntax == other.syntax } | ||
421 | } | ||
422 | impl<R: TreeRoot<RaTypes>> Eq for ByteStringNode<R> {} | ||
423 | impl<R: TreeRoot<RaTypes>> Hash for ByteStringNode<R> { | ||
424 | fn hash<H: Hasher>(&self, state: &mut H) { self.syntax.hash(state) } | ||
425 | } | ||
426 | |||
427 | impl<'a> AstNode<'a> for ByteString<'a> { | ||
428 | fn cast(syntax: SyntaxNodeRef<'a>) -> Option<Self> { | ||
429 | match syntax.kind() { | ||
430 | BYTE_STRING => Some(ByteString { syntax }), | ||
431 | _ => None, | ||
432 | } | ||
433 | } | ||
434 | fn syntax(self) -> SyntaxNodeRef<'a> { self.syntax } | ||
435 | } | ||
436 | |||
437 | impl<R: TreeRoot<RaTypes>> ByteStringNode<R> { | ||
438 | pub fn borrowed(&self) -> ByteString { | ||
439 | ByteStringNode { syntax: self.syntax.borrowed() } | ||
440 | } | ||
441 | pub fn owned(&self) -> ByteStringNode { | ||
442 | ByteStringNode { syntax: self.syntax.owned() } | ||
443 | } | ||
444 | } | ||
445 | |||
446 | |||
447 | impl<'a> ByteString<'a> {} | ||
448 | |||
412 | // CallExpr | 449 | // CallExpr |
413 | #[derive(Debug, Clone, Copy,)] | 450 | #[derive(Debug, Clone, Copy,)] |
414 | pub struct CallExprNode<R: TreeRoot<RaTypes> = OwnedRoot> { | 451 | pub struct CallExprNode<R: TreeRoot<RaTypes> = OwnedRoot> { |
diff --git a/crates/ra_syntax/src/ast/mod.rs b/crates/ra_syntax/src/ast/mod.rs index 686b5cf04..7077e3492 100644 --- a/crates/ra_syntax/src/ast/mod.rs +++ b/crates/ra_syntax/src/ast/mod.rs | |||
@@ -140,6 +140,12 @@ impl<'a> Byte<'a> { | |||
140 | } | 140 | } |
141 | } | 141 | } |
142 | 142 | ||
143 | impl<'a> ByteString<'a> { | ||
144 | pub fn text(&self) -> &SmolStr { | ||
145 | &self.syntax().leaf_text().unwrap() | ||
146 | } | ||
147 | } | ||
148 | |||
143 | impl<'a> String<'a> { | 149 | impl<'a> String<'a> { |
144 | pub fn text(&self) -> &SmolStr { | 150 | pub fn text(&self) -> &SmolStr { |
145 | &self.syntax().leaf_text().unwrap() | 151 | &self.syntax().leaf_text().unwrap() |
diff --git a/crates/ra_syntax/src/grammar.ron b/crates/ra_syntax/src/grammar.ron index 2c2ed1aeb..53cd2118f 100644 --- a/crates/ra_syntax/src/grammar.ron +++ b/crates/ra_syntax/src/grammar.ron | |||
@@ -413,6 +413,7 @@ Grammar( | |||
413 | "BinExpr": (), | 413 | "BinExpr": (), |
414 | "String": (), | 414 | "String": (), |
415 | "Byte": (), | 415 | "Byte": (), |
416 | "ByteString": (), | ||
416 | "Char": (), | 417 | "Char": (), |
417 | "Literal": (), | 418 | "Literal": (), |
418 | 419 | ||
diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs index 4e8c3a91c..d253c97e7 100644 --- a/crates/ra_syntax/src/string_lexing.rs +++ b/crates/ra_syntax/src/string_lexing.rs | |||
@@ -1,6 +1,55 @@ | |||
1 | use self::CharComponentKind::*; | 1 | use self::CharComponentKind::*; |
2 | use rowan::{TextRange, TextUnit}; | 2 | use rowan::{TextRange, TextUnit}; |
3 | 3 | ||
4 | pub fn parse_byte_string_literal(src: &str) -> ByteStringComponentIterator { | ||
5 | ByteStringComponentIterator { | ||
6 | parser: Parser::new(src), | ||
7 | has_closing_quote: false, | ||
8 | } | ||
9 | } | ||
10 | |||
11 | pub struct ByteStringComponentIterator<'a> { | ||
12 | parser: Parser<'a>, | ||
13 | pub has_closing_quote: bool, | ||
14 | } | ||
15 | |||
16 | impl<'a> Iterator for ByteStringComponentIterator<'a> { | ||
17 | type Item = StringComponent; | ||
18 | fn next(&mut self) -> Option<StringComponent> { | ||
19 | if self.parser.pos == 0 { | ||
20 | assert!( | ||
21 | self.parser.advance() == 'b', | ||
22 | "byte string literal should start with a `b`" | ||
23 | ); | ||
24 | |||
25 | assert!( | ||
26 | self.parser.advance() == '"', | ||
27 | "byte string literal should start with a `b`, followed by double quotes" | ||
28 | ); | ||
29 | } | ||
30 | |||
31 | if let Some(component) = self.parser.parse_string_component() { | ||
32 | return Some(component); | ||
33 | } | ||
34 | |||
35 | // We get here when there are no char components left to parse | ||
36 | if self.parser.peek() == Some('"') { | ||
37 | self.parser.advance(); | ||
38 | self.has_closing_quote = true; | ||
39 | } | ||
40 | |||
41 | assert!( | ||
42 | self.parser.peek() == None, | ||
43 | "byte string literal should leave no unparsed input: src = {}, pos = {}, length = {}", | ||
44 | self.parser.src, | ||
45 | self.parser.pos, | ||
46 | self.parser.src.len() | ||
47 | ); | ||
48 | |||
49 | None | ||
50 | } | ||
51 | } | ||
52 | |||
4 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { | 53 | pub fn parse_string_literal(src: &str) -> StringComponentIterator { |
5 | StringComponentIterator { | 54 | StringComponentIterator { |
6 | parser: Parser::new(src), | 55 | parser: Parser::new(src), |
@@ -81,12 +130,12 @@ impl<'a> Iterator for ByteComponentIterator<'a> { | |||
81 | if self.parser.pos == 0 { | 130 | if self.parser.pos == 0 { |
82 | assert!( | 131 | assert!( |
83 | self.parser.advance() == 'b', | 132 | self.parser.advance() == 'b', |
84 | "Byte literal should start with a b" | 133 | "Byte literal should start with a `b`" |
85 | ); | 134 | ); |
86 | 135 | ||
87 | assert!( | 136 | assert!( |
88 | self.parser.advance() == '\'', | 137 | self.parser.advance() == '\'', |
89 | "Byte literal should start with a b, followed by a quote" | 138 | "Byte literal should start with a `b`, followed by a quote" |
90 | ); | 139 | ); |
91 | } | 140 | } |
92 | 141 | ||
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs index 3d2806c4e..7baf3c1d7 100644 --- a/crates/ra_syntax/src/validation/byte.rs +++ b/crates/ra_syntax/src/validation/byte.rs | |||
@@ -20,26 +20,7 @@ pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec<SyntaxError>) | |||
20 | len += 1; | 20 | len += 1; |
21 | let text = &literal_text[component.range]; | 21 | let text = &literal_text[component.range]; |
22 | let range = component.range + literal_range.start(); | 22 | let range = component.range + literal_range.start(); |
23 | 23 | validate_byte_component(text, component.kind, range, errors); | |
24 | use self::CharComponentKind::*; | ||
25 | match component.kind { | ||
26 | AsciiEscape => validate_byte_escape(text, range, errors), | ||
27 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), | ||
28 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), | ||
29 | CodePoint => { | ||
30 | let c = text.chars().next().expect("Code points should be one character long"); | ||
31 | |||
32 | // These bytes must always be escaped | ||
33 | if c == '\t' || c == '\r' || c == '\n' { | ||
34 | errors.push(SyntaxError::new(UnescapedByte, range)); | ||
35 | } | ||
36 | |||
37 | // Only ASCII bytes are allowed | ||
38 | if c > 0x7F as char { | ||
39 | errors.push(SyntaxError::new(ByteOutOfRange, range)); | ||
40 | } | ||
41 | } | ||
42 | } | ||
43 | } | 24 | } |
44 | 25 | ||
45 | if !components.has_closing_quote { | 26 | if !components.has_closing_quote { |
@@ -55,6 +36,33 @@ pub(super) fn validate_byte_node(node: ast::Byte, errors: &mut Vec<SyntaxError>) | |||
55 | } | 36 | } |
56 | } | 37 | } |
57 | 38 | ||
39 | pub(super) fn validate_byte_component( | ||
40 | text: &str, | ||
41 | kind: CharComponentKind, | ||
42 | range: TextRange, | ||
43 | errors: &mut Vec<SyntaxError>, | ||
44 | ) { | ||
45 | use self::CharComponentKind::*; | ||
46 | match kind { | ||
47 | AsciiEscape => validate_byte_escape(text, range, errors), | ||
48 | AsciiCodeEscape => validate_byte_code_escape(text, range, errors), | ||
49 | UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)), | ||
50 | CodePoint => { | ||
51 | let c = text.chars().next().expect("Code points should be one character long"); | ||
52 | |||
53 | // These bytes must always be escaped | ||
54 | if c == '\t' || c == '\r' || c == '\n' { | ||
55 | errors.push(SyntaxError::new(UnescapedByte, range)); | ||
56 | } | ||
57 | |||
58 | // Only ASCII bytes are allowed | ||
59 | if c > 0x7F as char { | ||
60 | errors.push(SyntaxError::new(ByteOutOfRange, range)); | ||
61 | } | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | |||
58 | fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { | 66 | fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) { |
59 | if text.len() == 1 { | 67 | if text.len() == 1 { |
60 | // Escape sequence consists only of leading `\` | 68 | // Escape sequence consists only of leading `\` |
@@ -141,7 +149,7 @@ mod test { | |||
141 | #[test] | 149 | #[test] |
142 | fn test_valid_byte_escape() { | 150 | fn test_valid_byte_escape() { |
143 | let valid = [ | 151 | let valid = [ |
144 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", | 152 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", |
145 | ]; | 153 | ]; |
146 | for c in &valid { | 154 | for c in &valid { |
147 | assert_valid_byte(c); | 155 | assert_valid_byte(c); |
diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs new file mode 100644 index 000000000..7b830e97c --- /dev/null +++ b/crates/ra_syntax/src/validation/byte_string.rs | |||
@@ -0,0 +1,178 @@ | |||
1 | use crate::{ | ||
2 | ast::{self, AstNode}, | ||
3 | string_lexing::{self, StringComponentKind}, | ||
4 | yellow::{ | ||
5 | SyntaxError, | ||
6 | SyntaxErrorKind::*, | ||
7 | }, | ||
8 | }; | ||
9 | |||
10 | use super::byte; | ||
11 | |||
12 | pub(crate) fn validate_byte_string_node(node: ast::ByteString, errors: &mut Vec<SyntaxError>) { | ||
13 | let literal_text = node.text(); | ||
14 | let literal_range = node.syntax().range(); | ||
15 | let mut components = string_lexing::parse_byte_string_literal(literal_text); | ||
16 | for component in &mut components { | ||
17 | let range = component.range + literal_range.start(); | ||
18 | |||
19 | match component.kind { | ||
20 | StringComponentKind::Char(kind) => { | ||
21 | // Byte literals must escape \t, \n and \r codepoints, but byte strings don't | ||
22 | let text = &literal_text[component.range]; | ||
23 | match text { | ||
24 | "\t" | "\n" | "\r" => { /* always valid */ } | ||
25 | _ => byte::validate_byte_component(text, kind, range, errors), | ||
26 | } | ||
27 | } | ||
28 | StringComponentKind::IgnoreNewline => { /* always valid */ } | ||
29 | } | ||
30 | } | ||
31 | |||
32 | if !components.has_closing_quote { | ||
33 | errors.push(SyntaxError::new(UnclosedString, literal_range)); | ||
34 | } | ||
35 | } | ||
36 | |||
37 | #[cfg(test)] | ||
38 | mod test { | ||
39 | use crate::SourceFileNode; | ||
40 | |||
41 | fn build_file(literal: &str) -> SourceFileNode { | ||
42 | let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal); | ||
43 | println!("Source: {}", src); | ||
44 | SourceFileNode::parse(&src) | ||
45 | } | ||
46 | |||
47 | fn assert_valid_str(literal: &str) { | ||
48 | let file = build_file(literal); | ||
49 | assert!( | ||
50 | file.errors().len() == 0, | ||
51 | "Errors for literal '{}': {:?}", | ||
52 | literal, | ||
53 | file.errors() | ||
54 | ); | ||
55 | } | ||
56 | |||
57 | fn assert_invalid_str(literal: &str) { | ||
58 | let file = build_file(literal); | ||
59 | assert!(file.errors().len() > 0); | ||
60 | } | ||
61 | |||
62 | #[test] | ||
63 | fn test_ansi_codepoints() { | ||
64 | for byte in 0..128 { | ||
65 | match byte { | ||
66 | b'\"' | b'\\' => { /* Ignore string close and backslash */ } | ||
67 | _ => assert_valid_str(&(byte as char).to_string()), | ||
68 | } | ||
69 | } | ||
70 | |||
71 | for byte in 128..=255u8 { | ||
72 | assert_invalid_str(&(byte as char).to_string()); | ||
73 | } | ||
74 | } | ||
75 | |||
76 | #[test] | ||
77 | fn test_unicode_codepoints() { | ||
78 | let invalid = ["Ƒ", "バ", "メ", "﷽"]; | ||
79 | for c in &invalid { | ||
80 | assert_invalid_str(c); | ||
81 | } | ||
82 | } | ||
83 | |||
84 | #[test] | ||
85 | fn test_unicode_multiple_codepoints() { | ||
86 | let invalid = ["नी", "👨👨"]; | ||
87 | for c in &invalid { | ||
88 | assert_invalid_str(c); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | #[test] | ||
93 | fn test_valid_ascii_escape() { | ||
94 | let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]; | ||
95 | for c in &valid { | ||
96 | assert_valid_str(c); | ||
97 | } | ||
98 | } | ||
99 | |||
100 | #[test] | ||
101 | fn test_invalid_ascii_escape() { | ||
102 | let invalid = [r"\a", r"\?", r"\"]; | ||
103 | for c in &invalid { | ||
104 | assert_invalid_str(c); | ||
105 | } | ||
106 | } | ||
107 | |||
108 | #[test] | ||
109 | fn test_valid_ascii_code_escape() { | ||
110 | let valid = [r"\x00", r"\x7F", r"\x55", r"\xF0"]; | ||
111 | for c in &valid { | ||
112 | assert_valid_str(c); | ||
113 | } | ||
114 | } | ||
115 | |||
116 | #[test] | ||
117 | fn test_invalid_ascii_code_escape() { | ||
118 | let invalid = [r"\x", r"\x7"]; | ||
119 | for c in &invalid { | ||
120 | assert_invalid_str(c); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | #[test] | ||
125 | fn test_invalid_unicode_escape() { | ||
126 | let well_formed = [ | ||
127 | r"\u{FF}", | ||
128 | r"\u{0}", | ||
129 | r"\u{F}", | ||
130 | r"\u{10FFFF}", | ||
131 | r"\u{1_0__FF___FF_____}", | ||
132 | ]; | ||
133 | for c in &well_formed { | ||
134 | assert_invalid_str(c); | ||
135 | } | ||
136 | |||
137 | let invalid = [ | ||
138 | r"\u", | ||
139 | r"\u{}", | ||
140 | r"\u{", | ||
141 | r"\u{FF", | ||
142 | r"\u{FFFFFF}", | ||
143 | r"\u{_F}", | ||
144 | r"\u{00FFFFF}", | ||
145 | r"\u{110000}", | ||
146 | ]; | ||
147 | for c in &invalid { | ||
148 | assert_invalid_str(c); | ||
149 | } | ||
150 | } | ||
151 | |||
152 | #[test] | ||
153 | fn test_mixed_invalid() { | ||
154 | assert_invalid_str( | ||
155 | r"This is the tale of a string | ||
156 | with a newline in between, some emoji (👨👨) here and there, | ||
157 | unicode escapes like this: \u{1FFBB} and weird stuff like | ||
158 | this ﷽", | ||
159 | ); | ||
160 | } | ||
161 | |||
162 | #[test] | ||
163 | fn test_mixed_valid() { | ||
164 | assert_valid_str( | ||
165 | r"This is the tale of a string | ||
166 | with a newline in between, no emoji at all, | ||
167 | nor unicode escapes or weird stuff", | ||
168 | ); | ||
169 | } | ||
170 | |||
171 | #[test] | ||
172 | fn test_ignore_newline() { | ||
173 | assert_valid_str( | ||
174 | "Hello \ | ||
175 | World", | ||
176 | ); | ||
177 | } | ||
178 | } | ||
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs index 793539b3a..622b2efdc 100644 --- a/crates/ra_syntax/src/validation/char.rs +++ b/crates/ra_syntax/src/validation/char.rs | |||
@@ -214,7 +214,7 @@ mod test { | |||
214 | #[test] | 214 | #[test] |
215 | fn test_valid_ascii_escape() { | 215 | fn test_valid_ascii_escape() { |
216 | let valid = [ | 216 | let valid = [ |
217 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", "a", "b", | 217 | r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0", |
218 | ]; | 218 | ]; |
219 | for c in &valid { | 219 | for c in &valid { |
220 | assert_valid_char(c); | 220 | assert_valid_char(c); |
diff --git a/crates/ra_syntax/src/validation/mod.rs b/crates/ra_syntax/src/validation/mod.rs index acad7cb7f..bdee8120c 100644 --- a/crates/ra_syntax/src/validation/mod.rs +++ b/crates/ra_syntax/src/validation/mod.rs | |||
@@ -6,6 +6,7 @@ use crate::{ | |||
6 | }; | 6 | }; |
7 | 7 | ||
8 | mod byte; | 8 | mod byte; |
9 | mod byte_string; | ||
9 | mod char; | 10 | mod char; |
10 | mod string; | 11 | mod string; |
11 | 12 | ||
@@ -14,6 +15,7 @@ pub(crate) fn validate(file: &SourceFileNode) -> Vec<SyntaxError> { | |||
14 | for node in file.syntax().descendants() { | 15 | for node in file.syntax().descendants() { |
15 | let _ = visitor_ctx(&mut errors) | 16 | let _ = visitor_ctx(&mut errors) |
16 | .visit::<ast::Byte, _>(self::byte::validate_byte_node) | 17 | .visit::<ast::Byte, _>(self::byte::validate_byte_node) |
18 | .visit::<ast::ByteString, _>(self::byte_string::validate_byte_string_node) | ||
17 | .visit::<ast::Char, _>(self::char::validate_char_node) | 19 | .visit::<ast::Char, _>(self::char::validate_char_node) |
18 | .visit::<ast::String, _>(self::string::validate_string_node) | 20 | .visit::<ast::String, _>(self::string::validate_string_node) |
19 | .accept(node); | 21 | .accept(node); |