aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbors[bot] <bors[bot]@users.noreply.github.com>2019-05-07 17:43:10 +0100
committerbors[bot] <bors[bot]@users.noreply.github.com>2019-05-07 17:43:10 +0100
commitd3efedb752bb2198796603d8a479a5e3ee472a97 (patch)
treeca6a4aee6ad4077a869a932a18c6c8d134406f8c
parentef782adc293deb287128f005dbab2038ba3ccdc1 (diff)
parent313314e14b629ebf50389dbd2d440bda922f6ae7 (diff)
Merge #1253
1253: Share literal validation logic with compiler r=matklad a=matklad This is neat: the unescape module is literally what the compiler is using right now: https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs So, yeah, code sharing via copy-paste! Co-authored-by: Aleksey Kladov <[email protected]>
-rw-r--r--crates/ra_syntax/src/lib.rs1
-rw-r--r--crates/ra_syntax/src/string_lexing.rs333
-rw-r--r--crates/ra_syntax/src/syntax_error.rs104
-rw-r--r--crates/ra_syntax/src/validation.rs64
-rw-r--r--crates/ra_syntax/src/validation/byte.rs199
-rw-r--r--crates/ra_syntax/src/validation/byte_string.rs169
-rw-r--r--crates/ra_syntax/src/validation/char.rs273
-rw-r--r--crates/ra_syntax/src/validation/string.rs154
-rw-r--r--crates/ra_syntax/src/validation/unescape.rs521
-rw-r--r--crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt3
10 files changed, 620 insertions, 1201 deletions
diff --git a/crates/ra_syntax/src/lib.rs b/crates/ra_syntax/src/lib.rs
index 9cb66b76b..39c25dbdc 100644
--- a/crates/ra_syntax/src/lib.rs
+++ b/crates/ra_syntax/src/lib.rs
@@ -23,7 +23,6 @@ mod syntax_node;
23mod syntax_text; 23mod syntax_text;
24mod syntax_error; 24mod syntax_error;
25mod parsing; 25mod parsing;
26mod string_lexing;
27mod validation; 26mod validation;
28mod ptr; 27mod ptr;
29 28
diff --git a/crates/ra_syntax/src/string_lexing.rs b/crates/ra_syntax/src/string_lexing.rs
deleted file mode 100644
index 4c3eea3d2..000000000
--- a/crates/ra_syntax/src/string_lexing.rs
+++ /dev/null
@@ -1,333 +0,0 @@
1use crate::{TextRange, TextUnit};
2use self::StringComponentKind::*;
3
4#[derive(Debug, Eq, PartialEq, Clone)]
5pub(crate) struct StringComponent {
6 pub(crate) range: TextRange,
7 pub(crate) kind: StringComponentKind,
8}
9
/// Classification of the pieces a quoted literal is split into.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) enum StringComponentKind {
    /// A `\` directly followed by a line break, together with the whitespace after it.
    IgnoreNewline,
    /// Any single character that does not start an escape sequence.
    CodePoint,
    /// A `\` followed by one character other than `x` or `u` (or by nothing at all).
    AsciiEscape,
    /// A `\x` escape together with the (at most two) characters following it.
    AsciiCodeEscape,
    /// A `\u` escape, with its `{...}` part when one is present.
    UnicodeEscape,
}
18
19pub(crate) fn parse_quoted_literal(
20 prefix: Option<char>,
21 quote: char,
22 src: &str,
23) -> StringComponentIter {
24 let prefix = prefix.map(|p| match p {
25 'b' => b'b',
26 _ => panic!("invalid prefix"),
27 });
28 let quote = match quote {
29 '\'' => b'\'',
30 '"' => b'"',
31 _ => panic!("invalid quote"),
32 };
33 StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None }
34}
35
36pub(crate) struct StringComponentIter<'a> {
37 src: &'a str,
38 prefix: Option<u8>,
39 quote: u8,
40 pos: usize,
41 pub(crate) has_closing_quote: bool,
42 pub(crate) suffix: Option<TextRange>,
43}
44
45impl<'a> Iterator for StringComponentIter<'a> {
46 type Item = StringComponent;
47 fn next(&mut self) -> Option<StringComponent> {
48 if self.pos == 0 {
49 if let Some(prefix) = self.prefix {
50 assert!(
51 self.advance() == prefix as char,
52 "literal should start with a {:?}",
53 prefix as char,
54 );
55 }
56 assert!(
57 self.advance() == self.quote as char,
58 "literal should start with a {:?}",
59 self.quote as char,
60 );
61 }
62
63 if let Some(component) = self.parse_component() {
64 return Some(component);
65 }
66
67 // We get here when there are no char components left to parse
68 if self.peek() == Some(self.quote as char) {
69 self.advance();
70 self.has_closing_quote = true;
71 if let Some(range) = self.parse_suffix() {
72 self.suffix = Some(range);
73 }
74 }
75
76 assert!(
77 self.peek() == None,
78 "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}",
79 self.src,
80 self.pos,
81 self.src.len()
82 );
83
84 None
85 }
86}
87
88impl<'a> StringComponentIter<'a> {
89 fn peek(&self) -> Option<char> {
90 if self.pos == self.src.len() {
91 return None;
92 }
93
94 self.src[self.pos..].chars().next()
95 }
96
97 fn advance(&mut self) -> char {
98 let next = self.peek().expect("cannot advance if end of input is reached");
99 self.pos += next.len_utf8();
100 next
101 }
102
103 fn parse_component(&mut self) -> Option<StringComponent> {
104 let next = self.peek()?;
105
106 // Ignore string close
107 if next == self.quote as char {
108 return None;
109 }
110
111 let start = self.start_range();
112 self.advance();
113
114 if next == '\\' {
115 // Strings can use `\` to ignore newlines, so we first try to parse one of those
116 // before falling back to parsing char escapes
117 if self.quote == b'"' {
118 if let Some(component) = self.parse_ignore_newline(start) {
119 return Some(component);
120 }
121 }
122
123 Some(self.parse_escape(start))
124 } else {
125 Some(self.finish_component(start, CodePoint))
126 }
127 }
128
129 fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {
130 // In string literals, when a `\` occurs immediately before the newline, the `\`,
131 // the newline, and all whitespace at the beginning of the next line are ignored
132 match self.peek() {
133 Some('\n') | Some('\r') => {
134 self.skip_whitespace();
135 Some(self.finish_component(start, IgnoreNewline))
136 }
137 _ => None,
138 }
139 }
140
141 fn skip_whitespace(&mut self) {
142 while self.peek().map(|c| c.is_whitespace()) == Some(true) {
143 self.advance();
144 }
145 }
146
147 fn parse_escape(&mut self, start: TextUnit) -> StringComponent {
148 if self.peek().is_none() {
149 return self.finish_component(start, AsciiEscape);
150 }
151
152 let next = self.advance();
153 match next {
154 'x' => self.parse_ascii_code_escape(start),
155 'u' => self.parse_unicode_escape(start),
156 _ => self.finish_component(start, AsciiEscape),
157 }
158 }
159
160 fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent {
161 match self.peek() {
162 Some('{') => {
163 self.advance();
164
165 // Parse anything until we reach `}`
166 while let Some(next) = self.peek() {
167 self.advance();
168 if next == '}' {
169 break;
170 }
171 }
172
173 self.finish_component(start, UnicodeEscape)
174 }
175 Some(_) | None => self.finish_component(start, UnicodeEscape),
176 }
177 }
178
179 fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent {
180 let code_start = self.pos;
181 while let Some(next) = self.peek() {
182 if next == '\'' || (self.pos - code_start == 2) {
183 break;
184 }
185
186 self.advance();
187 }
188 self.finish_component(start, AsciiCodeEscape)
189 }
190
191 fn parse_suffix(&mut self) -> Option<TextRange> {
192 let start = self.start_range();
193 let _ = self.peek()?;
194 while let Some(_) = self.peek() {
195 self.advance();
196 }
197 Some(self.finish_range(start))
198 }
199
200 fn start_range(&self) -> TextUnit {
201 TextUnit::from_usize(self.pos)
202 }
203
204 fn finish_range(&self, start: TextUnit) -> TextRange {
205 TextRange::from_to(start, TextUnit::from_usize(self.pos))
206 }
207
208 fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent {
209 let range = self.finish_range(start);
210 StringComponent { range, kind }
211 }
212}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` as a char literal and returns whether a closing quote was
    /// seen, together with all components.
    fn parse(src: &str) -> (bool, Vec<StringComponent>) {
        let mut iter = parse_quoted_literal(None, '\'', src);
        let components = iter.by_ref().collect::<Vec<_>>();
        (iter.has_closing_quote, components)
    }

    /// The single component of a literal that must lack its closing quote.
    fn unclosed_char_component(src: &str) -> StringComponent {
        let (closed, mut components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert!(components.len() == 1);
        components.remove(0)
    }

    /// The single component of a properly closed literal.
    fn closed_char_component(src: &str) -> StringComponent {
        let (closed, mut components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components);
        components.remove(0)
    }

    /// All components of a properly closed literal.
    fn closed_char_components(src: &str) -> Vec<StringComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range between the two quotes of `src`.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range from just after the opening quote to the end of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        for escape in &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""] {
            let literal = format!(r"'\u{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        for escape in &["{DEAD", "{BEEF", "{FF"] {
            let literal = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&literal);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&literal));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        for digits in &[r"", r"5", r"55"] {
            let literal = format!(r"'\x{}'", digits);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let escapes = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for escape in escapes {
            let literal = format!("'{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, AsciiEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_no_escapes() {
        for &c in &['"', 'n', 'r', 't', '0', 'x', 'u'] {
            let literal = format!("'{}'", c);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CodePoint);
            assert_eq!(component.range, range_closed(&literal));
        }
    }
}
diff --git a/crates/ra_syntax/src/syntax_error.rs b/crates/ra_syntax/src/syntax_error.rs
index 4198eefdb..27e12293b 100644
--- a/crates/ra_syntax/src/syntax_error.rs
+++ b/crates/ra_syntax/src/syntax_error.rs
@@ -2,7 +2,10 @@ use std::fmt;
2 2
3use ra_parser::ParseError; 3use ra_parser::ParseError;
4 4
5use crate::{TextRange, TextUnit}; 5use crate::{
6 TextRange, TextUnit,
7 validation::EscapeError,
8};
6 9
7#[derive(Debug, Clone, PartialEq, Eq, Hash)] 10#[derive(Debug, Clone, PartialEq, Eq, Hash)]
8pub struct SyntaxError { 11pub struct SyntaxError {
@@ -67,32 +70,7 @@ impl fmt::Display for SyntaxError {
67#[derive(Debug, Clone, PartialEq, Eq, Hash)] 70#[derive(Debug, Clone, PartialEq, Eq, Hash)]
68pub enum SyntaxErrorKind { 71pub enum SyntaxErrorKind {
69 ParseError(ParseError), 72 ParseError(ParseError),
70 UnescapedCodepoint, 73 EscapeError(EscapeError),
71 EmptyChar,
72 UnclosedChar,
73 OverlongChar,
74 EmptyByte,
75 UnclosedByte,
76 OverlongByte,
77 ByteOutOfRange,
78 UnescapedByte,
79 EmptyByteEscape,
80 InvalidByteEscape,
81 TooShortByteCodeEscape,
82 MalformedByteCodeEscape,
83 UnicodeEscapeForbidden,
84 EmptyAsciiEscape,
85 InvalidAsciiEscape,
86 TooShortAsciiCodeEscape,
87 AsciiCodeEscapeOutOfRange,
88 MalformedAsciiCodeEscape,
89 UnclosedUnicodeEscape,
90 MalformedUnicodeEscape,
91 EmptyUnicodeEcape,
92 OverlongUnicodeEscape,
93 UnicodeEscapeOutOfRange,
94 UnclosedString,
95 InvalidSuffix,
96 InvalidBlockAttr, 74 InvalidBlockAttr,
97 InvalidMatchInnerAttr, 75 InvalidMatchInnerAttr,
98 InvalidTupleIndexFormat, 76 InvalidTupleIndexFormat,
@@ -102,38 +80,6 @@ impl fmt::Display for SyntaxErrorKind {
102 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 80 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
103 use self::SyntaxErrorKind::*; 81 use self::SyntaxErrorKind::*;
104 match self { 82 match self {
105 UnescapedCodepoint => write!(f, "This codepoint should always be escaped"),
106 EmptyAsciiEscape => write!(f, "Empty escape sequence"),
107 InvalidAsciiEscape => write!(f, "Invalid escape sequence"),
108 EmptyChar => write!(f, "Empty char literal"),
109 UnclosedChar => write!(f, "Unclosed char literal"),
110 OverlongChar => write!(f, "Char literal should be one character long"),
111 EmptyByte => write!(f, "Empty byte literal"),
112 UnclosedByte => write!(f, "Unclosed byte literal"),
113 OverlongByte => write!(f, "Byte literal should be one character long"),
114 ByteOutOfRange => write!(f, "Byte should be a valid ASCII character"),
115 UnescapedByte => write!(f, "This byte should always be escaped"),
116 EmptyByteEscape => write!(f, "Empty escape sequence"),
117 InvalidByteEscape => write!(f, "Invalid escape sequence"),
118 TooShortByteCodeEscape => write!(f, "Escape sequence should have two digits"),
119 MalformedByteCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
120 UnicodeEscapeForbidden => {
121 write!(f, "Unicode escapes are not allowed in byte literals or byte strings")
122 }
123 TooShortAsciiCodeEscape => write!(f, "Escape sequence should have two digits"),
124 AsciiCodeEscapeOutOfRange => {
125 write!(f, "Escape sequence should be between \\x00 and \\x7F")
126 }
127 MalformedAsciiCodeEscape => write!(f, "Escape sequence should be a hexadecimal number"),
128 UnclosedUnicodeEscape => write!(f, "Missing `}}`"),
129 MalformedUnicodeEscape => write!(f, "Malformed unicode escape sequence"),
130 EmptyUnicodeEcape => write!(f, "Empty unicode escape sequence"),
131 OverlongUnicodeEscape => {
132 write!(f, "Unicode escape sequence should have at most 6 digits")
133 }
134 UnicodeEscapeOutOfRange => write!(f, "Unicode escape code should be at most 0x10FFFF"),
135 UnclosedString => write!(f, "Unclosed string literal"),
136 InvalidSuffix => write!(f, "Invalid literal suffix"),
137 InvalidBlockAttr => { 83 InvalidBlockAttr => {
138 write!(f, "A block in this position cannot accept inner attributes") 84 write!(f, "A block in this position cannot accept inner attributes")
139 } 85 }
@@ -144,6 +90,46 @@ impl fmt::Display for SyntaxErrorKind {
144 write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix") 90 write!(f, "Tuple (struct) field access is only allowed through decimal integers with no underscores or suffix")
145 } 91 }
146 ParseError(msg) => write!(f, "{}", msg.0), 92 ParseError(msg) => write!(f, "{}", msg.0),
93 EscapeError(err) => write!(f, "{}", err),
147 } 94 }
148 } 95 }
149} 96}
97
98impl fmt::Display for EscapeError {
99 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
100 let msg = match self {
101 EscapeError::ZeroChars => "Empty literal",
102 EscapeError::MoreThanOneChar => "Literal should be one character long",
103 EscapeError::LoneSlash => "Character must be escaped: '\\'",
104 EscapeError::InvalidEscape => "Invalid escape sequence",
105 EscapeError::BareCarriageReturn => "Character must be escaped: '\r'",
106 EscapeError::EscapeOnlyChar => "Character must be escaped",
107 EscapeError::TooShortHexEscape => "Escape sequence should have two digits",
108 EscapeError::InvalidCharInHexEscape => "Escape sequence should be a hexadecimal number",
109 EscapeError::OutOfRangeHexEscape => "Escape sequence should be ASCII",
110 EscapeError::NoBraceInUnicodeEscape => "Invalid escape sequence",
111 EscapeError::InvalidCharInUnicodeEscape => "Invalid escape sequence",
112 EscapeError::EmptyUnicodeEscape => "Invalid escape sequence",
113 EscapeError::UnclosedUnicodeEscape => "Missing '}'",
114 EscapeError::LeadingUnderscoreUnicodeEscape => "Invalid escape sequence",
115 EscapeError::OverlongUnicodeEscape => {
116 "Unicode escape sequence should have at most 6 digits"
117 }
118 EscapeError::LoneSurrogateUnicodeEscape => {
119 "Unicode escape code should not be a surrogate"
120 }
121 EscapeError::OutOfRangeUnicodeEscape => {
122 "Unicode escape code should be at most 0x10FFFF"
123 }
124 EscapeError::UnicodeEscapeInByte => "Unicode escapes are not allowed in bytes",
125 EscapeError::NonAsciiCharInByte => "Non ASCII characters are not allowed in bytes",
126 };
127 write!(f, "{}", msg)
128 }
129}
130
131impl From<EscapeError> for SyntaxErrorKind {
132 fn from(err: EscapeError) -> Self {
133 SyntaxErrorKind::EscapeError(err)
134 }
135}
diff --git a/crates/ra_syntax/src/validation.rs b/crates/ra_syntax/src/validation.rs
index c2f545173..11a1fb4a7 100644
--- a/crates/ra_syntax/src/validation.rs
+++ b/crates/ra_syntax/src/validation.rs
@@ -1,17 +1,17 @@
1mod byte; 1mod unescape;
2mod byte_string; 2
3mod char;
4mod string;
5mod block; 3mod block;
6mod field_expr; 4mod field_expr;
7 5
8use crate::{ 6use crate::{
9 SourceFile, SyntaxError, AstNode, SyntaxNode, 7 SourceFile, SyntaxError, AstNode, SyntaxNode, TextUnit,
10 SyntaxKind::{L_CURLY, R_CURLY, BYTE, BYTE_STRING, STRING, CHAR}, 8 SyntaxKind::{L_CURLY, R_CURLY, BYTE, BYTE_STRING, STRING, CHAR},
11 ast, 9 ast,
12 algo::visit::{visitor_ctx, VisitorCtx}, 10 algo::visit::{visitor_ctx, VisitorCtx},
13}; 11};
14 12
13pub(crate) use unescape::EscapeError;
14
15pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> { 15pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> {
16 let mut errors = Vec::new(); 16 let mut errors = Vec::new();
17 for node in file.syntax().descendants() { 17 for node in file.syntax().descendants() {
@@ -26,11 +26,55 @@ pub(crate) fn validate(file: &SourceFile) -> Vec<SyntaxError> {
26 26
27// FIXME: kill duplication 27// FIXME: kill duplication
28fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) { 28fn validate_literal(literal: &ast::Literal, acc: &mut Vec<SyntaxError>) {
29 match literal.token().kind() { 29 let token = literal.token();
30 BYTE => byte::validate_byte_node(literal.token(), acc), 30 let text = token.text().as_str();
31 BYTE_STRING => byte_string::validate_byte_string_node(literal.token(), acc), 31 match token.kind() {
32 STRING => string::validate_string_node(literal.token(), acc), 32 BYTE => {
33 CHAR => char::validate_char_node(literal.token(), acc), 33 if let Some(end) = text.rfind('\'') {
34 if let Some(without_quotes) = text.get(2..end) {
35 if let Err((off, err)) = unescape::unescape_byte(without_quotes) {
36 let off = token.range().start() + TextUnit::from_usize(off + 2);
37 acc.push(SyntaxError::new(err.into(), off))
38 }
39 }
40 }
41 }
42 CHAR => {
43 if let Some(end) = text.rfind('\'') {
44 if let Some(without_quotes) = text.get(1..end) {
45 if let Err((off, err)) = unescape::unescape_char(without_quotes) {
46 let off = token.range().start() + TextUnit::from_usize(off + 1);
47 acc.push(SyntaxError::new(err.into(), off))
48 }
49 }
50 }
51 }
52 BYTE_STRING => {
53 if let Some(end) = text.rfind('\"') {
54 if let Some(without_quotes) = text.get(2..end) {
55 unescape::unescape_byte_str(without_quotes, &mut |range, char| {
56 if let Err(err) = char {
57 let off = range.start;
58 let off = token.range().start() + TextUnit::from_usize(off + 2);
59 acc.push(SyntaxError::new(err.into(), off))
60 }
61 })
62 }
63 }
64 }
65 STRING => {
66 if let Some(end) = text.rfind('\"') {
67 if let Some(without_quotes) = text.get(1..end) {
68 unescape::unescape_str(without_quotes, &mut |range, char| {
69 if let Err(err) = char {
70 let off = range.start;
71 let off = token.range().start() + TextUnit::from_usize(off + 1);
72 acc.push(SyntaxError::new(err.into(), off))
73 }
74 })
75 }
76 }
77 }
34 _ => (), 78 _ => (),
35 } 79 }
36} 80}
diff --git a/crates/ra_syntax/src/validation/byte.rs b/crates/ra_syntax/src/validation/byte.rs
deleted file mode 100644
index f653e65d0..000000000
--- a/crates/ra_syntax/src/validation/byte.rs
+++ /dev/null
@@ -1,199 +0,0 @@
1//! Validation of byte literals
2
3use crate::{
4 string_lexing::{self, StringComponentKind},
5 TextRange,
6 validation::char,
7 SyntaxError,
8 SyntaxErrorKind::*,
9 SyntaxToken,
10};
11
12pub(super) fn validate_byte_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
13 let literal_text = node.text();
14 let literal_range = node.range();
15 let mut components = string_lexing::parse_quoted_literal(Some('b'), '\'', literal_text);
16 let mut len = 0;
17 for component in &mut components {
18 len += 1;
19 let text = &literal_text[component.range];
20 let range = component.range + literal_range.start();
21 validate_byte_component(text, component.kind, range, errors);
22 }
23
24 if !components.has_closing_quote {
25 errors.push(SyntaxError::new(UnclosedByte, literal_range));
26 }
27
28 if let Some(range) = components.suffix {
29 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
30 }
31
32 if len == 0 {
33 errors.push(SyntaxError::new(EmptyByte, literal_range));
34 }
35
36 if len > 1 {
37 errors.push(SyntaxError::new(OverlongByte, literal_range));
38 }
39}
40
41pub(super) fn validate_byte_component(
42 text: &str,
43 kind: StringComponentKind,
44 range: TextRange,
45 errors: &mut Vec<SyntaxError>,
46) {
47 use self::StringComponentKind::*;
48 match kind {
49 AsciiEscape => validate_byte_escape(text, range, errors),
50 AsciiCodeEscape => validate_byte_code_escape(text, range, errors),
51 UnicodeEscape => errors.push(SyntaxError::new(UnicodeEscapeForbidden, range)),
52 CodePoint => {
53 let c = text.chars().next().expect("Code points should be one character long");
54
55 // These bytes must always be escaped
56 if c == '\t' || c == '\r' || c == '\n' {
57 errors.push(SyntaxError::new(UnescapedByte, range));
58 }
59
60 // Only ASCII bytes are allowed
61 if c > 0x7F as char {
62 errors.push(SyntaxError::new(ByteOutOfRange, range));
63 }
64 }
65 IgnoreNewline => { /* always valid */ }
66 }
67}
68
69fn validate_byte_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
70 if text.len() == 1 {
71 // Escape sequence consists only of leading `\`
72 errors.push(SyntaxError::new(EmptyByteEscape, range));
73 } else {
74 let escape_code = text.chars().skip(1).next().unwrap();
75 if !char::is_ascii_escape(escape_code) {
76 errors.push(SyntaxError::new(InvalidByteEscape, range));
77 }
78 }
79}
80
81fn validate_byte_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
82 // A ByteCodeEscape has 4 chars, example: `\xDD`
83 if !text.is_ascii() {
84 errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
85 } else if text.chars().count() < 4 {
86 errors.push(SyntaxError::new(TooShortByteCodeEscape, range));
87 } else {
88 assert!(text.chars().count() == 4, "ByteCodeEscape cannot be longer than 4 chars");
89
90 if u8::from_str_radix(&text[2..], 16).is_err() {
91 errors.push(SyntaxError::new(MalformedByteCodeEscape, range));
92 }
93 }
94}
95
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a tiny file whose only interesting part is the byte literal
    /// `b'<literal>'`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        SourceFile::parse(&format!("const C: u8 = b'{}';", literal))
    }

    fn assert_valid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    fn assert_invalid_byte(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    /// Runs `assert_valid_byte` on every literal.
    fn assert_all_valid(literals: &[&str]) {
        for literal in literals {
            assert_valid_byte(literal);
        }
    }

    /// Runs `assert_invalid_byte` on every literal.
    fn assert_all_invalid(literals: &[&str]) {
        for literal in literals {
            assert_invalid_byte(literal);
        }
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0u8..128 {
            let literal = (byte as char).to_string();
            match byte {
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                b'\n' | b'\r' | b'\t' => assert_invalid_byte(&literal),
                _ => assert_valid_byte(&literal),
            }
        }

        for byte in 128..=255u8 {
            assert_invalid_byte(&(byte as char).to_string());
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        assert_all_invalid(&["Ƒ", "バ", "メ", "﷽"]);
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        assert_all_invalid(&["नी", "👨‍👨‍"]);
    }

    #[test]
    fn test_valid_byte_escape() {
        assert_all_valid(&[r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"]);
    }

    #[test]
    fn test_invalid_byte_escape() {
        assert_all_invalid(&[r"\a", r"\?", r"\"]);
    }

    #[test]
    fn test_valid_byte_code_escape() {
        assert_all_valid(&[r"\x00", r"\x7F", r"\x55", r"\xF0"]);
    }

    #[test]
    fn test_invalid_byte_code_escape() {
        assert_all_invalid(&[r"\x", r"\x7"]);
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Even well-formed unicode escapes are not allowed in byte literals.
        assert_all_invalid(&[
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ]);

        assert_all_invalid(&[
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ]);
    }
}
diff --git a/crates/ra_syntax/src/validation/byte_string.rs b/crates/ra_syntax/src/validation/byte_string.rs
deleted file mode 100644
index 1d48c2d9b..000000000
--- a/crates/ra_syntax/src/validation/byte_string.rs
+++ /dev/null
@@ -1,169 +0,0 @@
1use crate::{
2 string_lexing::{self, StringComponentKind},
3 SyntaxError,
4 SyntaxErrorKind::*,
5 SyntaxToken,
6};
7
8use super::byte;
9
10pub(crate) fn validate_byte_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
11 let literal_text = node.text();
12 let literal_range = node.range();
13 let mut components = string_lexing::parse_quoted_literal(Some('b'), '"', literal_text);
14 for component in &mut components {
15 let range = component.range + literal_range.start();
16
17 match component.kind {
18 StringComponentKind::IgnoreNewline => { /* always valid */ }
19 _ => {
20 // Chars must escape \t, \n and \r codepoints, but strings don't
21 let text = &literal_text[component.range];
22 match text {
23 "\t" | "\n" | "\r" => { /* always valid */ }
24 _ => byte::validate_byte_component(text, component.kind, range, errors),
25 }
26 }
27 }
28 }
29
30 if !components.has_closing_quote {
31 errors.push(SyntaxError::new(UnclosedString, literal_range));
32 }
33
34 if let Some(range) = components.suffix {
35 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
36 }
37}
38
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a tiny file whose only interesting part is the byte string
    /// `b"<literal>"`; the generated source is printed to ease debugging.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        let src = format!(r#"const S: &'static [u8] = b"{}";"#, literal);
        println!("Source: {}", src);
        SourceFile::parse(&src)
    }

    fn assert_valid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    fn assert_invalid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    /// Runs `assert_valid_str` on every literal.
    fn assert_all_valid(literals: &[&str]) {
        for literal in literals {
            assert_valid_str(literal);
        }
    }

    /// Runs `assert_invalid_str` on every literal.
    fn assert_all_invalid(literals: &[&str]) {
        for literal in literals {
            assert_invalid_str(literal);
        }
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0u8..128 {
            // Ignore string close and backslash
            if byte == b'\"' || byte == b'\\' {
                continue;
            }
            assert_valid_str(&(byte as char).to_string());
        }

        for byte in 128..=255u8 {
            assert_invalid_str(&(byte as char).to_string());
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        assert_all_invalid(&["Ƒ", "バ", "メ", "﷽"]);
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        assert_all_invalid(&["नी", "👨‍👨‍"]);
    }

    #[test]
    fn test_valid_ascii_escape() {
        assert_all_valid(&[r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"]);
    }

    #[test]
    fn test_invalid_ascii_escape() {
        assert_all_invalid(&[r"\a", r"\?", r"\"]);
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        assert_all_valid(&[r"\x00", r"\x7F", r"\x55", r"\xF0"]);
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        assert_all_invalid(&[r"\x", r"\x7"]);
    }

    #[test]
    fn test_invalid_unicode_escape() {
        // Even well-formed unicode escapes are not allowed in byte strings.
        assert_all_invalid(&[
            r"\u{FF}",
            r"\u{0}",
            r"\u{F}",
            r"\u{10FFFF}",
            r"\u{1_0__FF___FF_____}",
        ]);

        assert_all_invalid(&[
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ]);
    }

    #[test]
    fn test_mixed_invalid() {
        let literal = r"This is the tale of a string
with a newline in between, some emoji (👨‍👨‍) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ﷽";
        assert_invalid_str(literal);
    }

    #[test]
    fn test_mixed_valid() {
        let literal = r"This is the tale of a string
with a newline in between, no emoji at all,
nor unicode escapes or weird stuff";
        assert_valid_str(literal);
    }

    #[test]
    fn test_ignore_newline() {
        assert_valid_str(
            "Hello \
             World",
        );
    }
}
diff --git a/crates/ra_syntax/src/validation/char.rs b/crates/ra_syntax/src/validation/char.rs
deleted file mode 100644
index 0f1885873..000000000
--- a/crates/ra_syntax/src/validation/char.rs
+++ /dev/null
@@ -1,273 +0,0 @@
1//! Validation of char literals
2
3use std::u32;
4
5use arrayvec::ArrayString;
6
7use crate::{
8 string_lexing::{self, StringComponentKind},
9 TextRange,
10 SyntaxError,
11 SyntaxErrorKind::*,
12 SyntaxToken,
13};
14
15pub(super) fn validate_char_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
16 let literal_text = node.text();
17 let literal_range = node.range();
18 let mut components = string_lexing::parse_quoted_literal(None, '\'', literal_text);
19 let mut len = 0;
20 for component in &mut components {
21 len += 1;
22 let text = &literal_text[component.range];
23 let range = component.range + literal_range.start();
24 validate_char_component(text, component.kind, range, errors);
25 }
26
27 if !components.has_closing_quote {
28 errors.push(SyntaxError::new(UnclosedChar, literal_range));
29 }
30
31 if let Some(range) = components.suffix {
32 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
33 }
34
35 if len == 0 {
36 errors.push(SyntaxError::new(EmptyChar, literal_range));
37 }
38
39 if len > 1 {
40 errors.push(SyntaxError::new(OverlongChar, literal_range));
41 }
42}
43
44pub(super) fn validate_char_component(
45 text: &str,
46 kind: StringComponentKind,
47 range: TextRange,
48 errors: &mut Vec<SyntaxError>,
49) {
50 // Validate escapes
51 use self::StringComponentKind::*;
52 match kind {
53 AsciiEscape => validate_ascii_escape(text, range, errors),
54 AsciiCodeEscape => validate_ascii_code_escape(text, range, errors),
55 UnicodeEscape => validate_unicode_escape(text, range, errors),
56 CodePoint => {
57 // These code points must always be escaped
58 if text == "\t" || text == "\r" || text == "\n" {
59 errors.push(SyntaxError::new(UnescapedCodepoint, range));
60 }
61 }
62 StringComponentKind::IgnoreNewline => { /* always valid */ }
63 }
64}
65
66fn validate_ascii_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
67 if text.len() == 1 {
68 // Escape sequence consists only of leading `\` (only occurs at EOF, otherwise e.g. '\' is treated as an unclosed char containing a single quote `'`)
69 errors.push(SyntaxError::new(EmptyAsciiEscape, range));
70 } else {
71 let escape_code = text.chars().skip(1).next().unwrap();
72 if !is_ascii_escape(escape_code) {
73 errors.push(SyntaxError::new(InvalidAsciiEscape, range));
74 }
75 }
76}
77
78pub(super) fn is_ascii_escape(code: char) -> bool {
79 match code {
80 '\\' | '\'' | '"' | 'n' | 'r' | 't' | '0' => true,
81 _ => false,
82 }
83}
84
85fn validate_ascii_code_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
86 // An AsciiCodeEscape has 4 chars, example: `\xDD`
87 if !text.is_ascii() {
88 // FIXME: Give a more precise error message (say what the invalid character was)
89 errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range));
90 } else if text.chars().count() < 4 {
91 errors.push(SyntaxError::new(TooShortAsciiCodeEscape, range));
92 } else {
93 assert_eq!(
94 text.chars().count(),
95 4,
96 "AsciiCodeEscape cannot be longer than 4 chars, but text '{}' is",
97 text,
98 );
99
100 match u8::from_str_radix(&text[2..], 16) {
101 Ok(code) if code < 128 => { /* Escape code is valid */ }
102 Ok(_) => errors.push(SyntaxError::new(AsciiCodeEscapeOutOfRange, range)),
103 Err(_) => errors.push(SyntaxError::new(MalformedAsciiCodeEscape, range)),
104 }
105 }
106}
107
108fn validate_unicode_escape(text: &str, range: TextRange, errors: &mut Vec<SyntaxError>) {
109 assert!(&text[..2] == "\\u", "UnicodeEscape always starts with \\u");
110
111 if text.len() == 2 {
112 // No starting `{`
113 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
114 return;
115 }
116
117 if text.len() == 3 {
118 // Only starting `{`
119 errors.push(SyntaxError::new(UnclosedUnicodeEscape, range));
120 return;
121 }
122
123 let mut code = ArrayString::<[_; 6]>::new();
124 let mut closed = false;
125 for c in text[3..].chars() {
126 assert!(!closed, "no characters after escape is closed");
127
128 if c.is_digit(16) {
129 if code.len() == 6 {
130 errors.push(SyntaxError::new(OverlongUnicodeEscape, range));
131 return;
132 }
133
134 code.push(c);
135 } else if c == '_' {
136 // Reject leading _
137 if code.len() == 0 {
138 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
139 return;
140 }
141 } else if c == '}' {
142 closed = true;
143 } else {
144 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
145 return;
146 }
147 }
148
149 if !closed {
150 errors.push(SyntaxError::new(UnclosedUnicodeEscape, range))
151 }
152
153 if code.len() == 0 {
154 errors.push(SyntaxError::new(EmptyUnicodeEcape, range));
155 return;
156 }
157
158 match u32::from_str_radix(&code, 16) {
159 Ok(code_u32) if code_u32 > 0x10FFFF => {
160 errors.push(SyntaxError::new(UnicodeEscapeOutOfRange, range));
161 }
162 Ok(_) => {
163 // Valid escape code
164 }
165 Err(_) => {
166 errors.push(SyntaxError::new(MalformedUnicodeEscape, range));
167 }
168 }
169}
170
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a source file declaring a `char` constant whose literal body is `literal`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        SourceFile::parse(&format!("const C: char = '{}';", literal))
    }

    /// Asserts that a char literal with the given body parses without errors.
    fn assert_valid_char(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    /// Asserts that a char literal with the given body yields at least one error.
    fn assert_invalid_char(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            let literal = (byte as char).to_string();
            match byte {
                b'\'' | b'\\' => { /* Ignore character close and backslash */ }
                b'\n' | b'\r' | b'\t' => assert_invalid_char(&literal),
                _ => assert_valid_char(&literal),
            }
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        let accepted = ["Ƒ", "バ", "メ", "﷽"];
        accepted.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        let rejected = ["नी", "👨‍👨‍"];
        rejected.iter().for_each(|literal| assert_invalid_char(literal));
    }

    #[test]
    fn test_valid_ascii_escape() {
        let accepted = [r"\'", "\"", "\\\\", "\\\"", r"\n", r"\r", r"\t", r"\0"];
        accepted.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_invalid_ascii_escape() {
        let rejected = [r"\a", r"\?", r"\"];
        rejected.iter().for_each(|literal| assert_invalid_char(literal));
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        let accepted = [r"\x00", r"\x7F", r"\x55"];
        accepted.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        let rejected = [r"\x", r"\x7", r"\xF0"];
        rejected.iter().for_each(|literal| assert_invalid_char(literal));
    }

    #[test]
    fn test_valid_unicode_escape() {
        let accepted = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
        accepted.iter().for_each(|literal| assert_valid_char(literal));
    }

    #[test]
    fn test_invalid_unicode_escape() {
        let rejected = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        rejected.iter().for_each(|literal| assert_invalid_char(literal));
    }
}
diff --git a/crates/ra_syntax/src/validation/string.rs b/crates/ra_syntax/src/validation/string.rs
deleted file mode 100644
index fc2f1b992..000000000
--- a/crates/ra_syntax/src/validation/string.rs
+++ /dev/null
@@ -1,154 +0,0 @@
1use crate::{
2 string_lexing,
3 SyntaxError,
4 SyntaxErrorKind::*,
5 SyntaxToken,
6};
7
8use super::char;
9
10pub(crate) fn validate_string_node(node: SyntaxToken, errors: &mut Vec<SyntaxError>) {
11 let literal_text = node.text();
12 let literal_range = node.range();
13 let mut components = string_lexing::parse_quoted_literal(None, '"', literal_text);
14 for component in &mut components {
15 let range = component.range + literal_range.start();
16
17 // Chars must escape \t, \n and \r codepoints, but strings don't
18 let text = &literal_text[component.range];
19 match text {
20 "\t" | "\n" | "\r" => { /* always valid */ }
21 _ => char::validate_char_component(text, component.kind, range, errors),
22 }
23 }
24
25 if !components.has_closing_quote {
26 errors.push(SyntaxError::new(UnclosedString, literal_range));
27 }
28
29 if let Some(range) = components.suffix {
30 errors.push(SyntaxError::new(InvalidSuffix, range + literal_range.start()));
31 }
32}
33
#[cfg(test)]
mod test {
    use crate::{SourceFile, TreeArc};

    /// Parses a source file declaring a string constant whose literal body is `literal`.
    fn build_file(literal: &str) -> TreeArc<SourceFile> {
        let src = format!(r#"const S: &'static str = "{}";"#, literal);
        SourceFile::parse(&src)
    }

    /// Asserts that a string literal with the given body parses without errors.
    fn assert_valid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() == 0, "Errors for literal '{}': {:?}", literal, file.errors());
    }

    /// Asserts that a string literal with the given body yields at least one error.
    fn assert_invalid_str(literal: &str) {
        let file = build_file(literal);
        assert!(file.errors().len() > 0);
    }

    #[test]
    fn test_ansi_codepoints() {
        for byte in 0..=255u8 {
            match byte {
                b'\"' | b'\\' => { /* Ignore string close and backslash */ }
                _ => assert_valid_str(&(byte as char).to_string()),
            }
        }
    }

    #[test]
    fn test_unicode_codepoints() {
        let valid = ["Ƒ", "バ", "メ", "﷽"];
        for c in &valid {
            assert_valid_str(c);
        }
    }

    #[test]
    fn test_unicode_multiple_codepoints() {
        let valid = ["नी", "👨‍👨‍"];
        for c in &valid {
            assert_valid_str(c);
        }
    }

    #[test]
    fn test_valid_ascii_escape() {
        let valid = [r"\'", r#"\""#, r"\\", r"\n", r"\r", r"\t", r"\0", "a", "b"];
        for c in &valid {
            assert_valid_str(c);
        }
    }

    #[test]
    fn test_invalid_ascii_escape() {
        let invalid = [r"\a", r"\?", r"\"];
        for c in &invalid {
            assert_invalid_str(c);
        }
    }

    #[test]
    fn test_valid_ascii_code_escape() {
        let valid = [r"\x00", r"\x7F", r"\x55"];
        for c in &valid {
            assert_valid_str(c);
        }
    }

    #[test]
    fn test_invalid_ascii_code_escape() {
        let invalid = [r"\x", r"\x7", r"\xF0"];
        for c in &invalid {
            assert_invalid_str(c);
        }
    }

    #[test]
    fn test_valid_unicode_escape() {
        let valid = [r"\u{FF}", r"\u{0}", r"\u{F}", r"\u{10FFFF}", r"\u{1_0__FF___FF_____}"];
        for c in &valid {
            assert_valid_str(c);
        }
    }

    #[test]
    fn test_invalid_unicode_escape() {
        let invalid = [
            r"\u",
            r"\u{}",
            r"\u{",
            r"\u{FF",
            r"\u{FFFFFF}",
            r"\u{_F}",
            r"\u{00FFFFF}",
            r"\u{110000}",
        ];
        for c in &invalid {
            assert_invalid_str(c);
        }
    }

    #[test]
    fn test_mixed() {
        assert_valid_str(
            r"This is the tale of a string
with a newline in between, some emoji (👨‍👨‍) here and there,
unicode escapes like this: \u{1FFBB} and weird stuff like
this ﷽",
        );
    }

    #[test]
    fn test_ignore_newline() {
        // The backslash must be escaped here. With a bare `\` followed by a newline
        // the compiler applies the line continuation to the test's own string, so
        // the literal handed to the validator would be just "Hello World" and the
        // continuation handling would never be exercised.
        assert_valid_str("Hello \\\n            World");
    }
}
diff --git a/crates/ra_syntax/src/validation/unescape.rs b/crates/ra_syntax/src/validation/unescape.rs
new file mode 100644
index 000000000..2086046b6
--- /dev/null
+++ b/crates/ra_syntax/src/validation/unescape.rs
@@ -0,0 +1,521 @@
1//! Utilities for validating string and char literals and turning them into
2//! values they represent.
3//!
4//! This file is copy-pasted from the compiler
5//!
6//! https://github.com/rust-lang/rust/blob/c6ac57564852cb6e2d0db60f7b46d9eb98d4b449/src/libsyntax/parse/unescape.rs
7//!
8//! Hopefully, we'll share this code in a proper way some day
9
10use std::str::Chars;
11use std::ops::Range;
12
/// Errors that can occur while unescaping char, byte, string and byte string
/// literals.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub enum EscapeError {
    /// A single character was expected but the literal is empty.
    ZeroChars,
    /// A single character was expected but more input follows it.
    MoreThanOneChar,

    /// A backslash with nothing after it.
    LoneSlash,
    /// A backslash followed by a character that starts no known escape.
    InvalidEscape,
    /// A raw `\r` that is not followed by `\n`.
    BareCarriageReturn,
    /// A raw character that may only appear escaped: tab, newline, `\r` before
    /// `\n`, or the quote that delimits the literal.
    EscapeOnlyChar,

    /// `\x` followed by fewer than two characters.
    TooShortHexEscape,
    /// A non-hex-digit character inside a `\x` escape.
    InvalidCharInHexEscape,
    /// A `\x` escape above 0x7F in a char or string literal.
    OutOfRangeHexEscape,

    /// `\u` that is not followed by `{`.
    NoBraceInUnicodeEscape,
    /// A character other than a hex digit, `_` or `}` inside `\u{...}`.
    InvalidCharInUnicodeEscape,
    /// `\u{}` with no digits.
    EmptyUnicodeEscape,
    /// `\u{` that is never closed by `}`.
    UnclosedUnicodeEscape,
    /// `\u{_...}`: the first character after `{` is an underscore.
    LeadingUnderscoreUnicodeEscape,
    /// A `\u{...}` escape with more than six hex digits.
    OverlongUnicodeEscape,
    /// A `\u{...}` value of at most 0x10FFFF that is not a valid `char`.
    LoneSurrogateUnicodeEscape,
    /// A `\u{...}` value above 0x10FFFF.
    OutOfRangeUnicodeEscape,

    /// A `\u{...}` escape inside a byte or byte string literal.
    UnicodeEscapeInByte,
    /// A raw non-ASCII character inside a byte or byte string literal.
    NonAsciiCharInByte,
}
39
40/// Takes a contents of a char literal (without quotes), and returns an
41/// unescaped char or an error
42pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
43 let mut chars = literal_text.chars();
44 unescape_char_or_byte(&mut chars, Mode::Char)
45 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
46}
47
48/// Takes a contents of a string literal (without quotes) and produces a
49/// sequence of escaped characters or errors.
50pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
51where
52 F: FnMut(Range<usize>, Result<char, EscapeError>),
53{
54 unescape_str_or_byte_str(literal_text, Mode::Str, callback)
55}
56
57pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
58 let mut chars = literal_text.chars();
59 unescape_char_or_byte(&mut chars, Mode::Byte)
60 .map(byte_from_char)
61 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
62}
63
64/// Takes a contents of a string literal (without quotes) and produces a
65/// sequence of escaped characters or errors.
66pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
67where
68 F: FnMut(Range<usize>, Result<u8, EscapeError>),
69{
70 unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
71 callback(range, char.map(byte_from_char))
72 })
73}
74
/// The kind of literal being unescaped.
#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
    Char,
    Str,
    Byte,
    ByteStr,
}

impl Mode {
    /// `true` for `Char` and `Byte`.
    fn in_single_quotes(self) -> bool {
        !self.in_double_quotes()
    }

    /// `true` for `Str` and `ByteStr`.
    pub(crate) fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// `true` for `Byte` and `ByteStr`.
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
102
103fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
104 if first_char != '\\' {
105 return match first_char {
106 '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
107 '\r' => Err(if chars.clone().next() == Some('\n') {
108 EscapeError::EscapeOnlyChar
109 } else {
110 EscapeError::BareCarriageReturn
111 }),
112 '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
113 '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
114 _ => {
115 if mode.is_bytes() && !first_char.is_ascii() {
116 return Err(EscapeError::NonAsciiCharInByte);
117 }
118 Ok(first_char)
119 }
120 };
121 }
122
123 let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
124
125 let res = match second_char {
126 '"' => '"',
127 'n' => '\n',
128 'r' => '\r',
129 't' => '\t',
130 '\\' => '\\',
131 '\'' => '\'',
132 '0' => '\0',
133
134 'x' => {
135 let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
136 let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
137
138 let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
139 let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
140
141 let value = hi * 16 + lo;
142
143 if !mode.is_bytes() && !is_ascii(value) {
144 return Err(EscapeError::OutOfRangeHexEscape);
145 }
146 let value = value as u8;
147
148 value as char
149 }
150
151 'u' => {
152 if chars.next() != Some('{') {
153 return Err(EscapeError::NoBraceInUnicodeEscape);
154 }
155
156 let mut n_digits = 1;
157 let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
158 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
159 '}' => return Err(EscapeError::EmptyUnicodeEscape),
160 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
161 };
162
163 loop {
164 match chars.next() {
165 None => return Err(EscapeError::UnclosedUnicodeEscape),
166 Some('_') => continue,
167 Some('}') => {
168 if n_digits > 6 {
169 return Err(EscapeError::OverlongUnicodeEscape);
170 }
171 if mode.is_bytes() {
172 return Err(EscapeError::UnicodeEscapeInByte);
173 }
174
175 break std::char::from_u32(value).ok_or_else(|| {
176 if value > 0x10FFFF {
177 EscapeError::OutOfRangeUnicodeEscape
178 } else {
179 EscapeError::LoneSurrogateUnicodeEscape
180 }
181 })?;
182 }
183 Some(c) => {
184 let digit =
185 c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
186 n_digits += 1;
187 if n_digits > 6 {
188 continue;
189 }
190 let digit = digit as u32;
191 value = value * 16 + digit;
192 }
193 };
194 }
195 }
196 _ => return Err(EscapeError::InvalidEscape),
197 };
198 Ok(res)
199}
200
201fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
202 let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
203 let res = scan_escape(first_char, chars, mode)?;
204 if chars.next().is_some() {
205 return Err(EscapeError::MoreThanOneChar);
206 }
207 Ok(res)
208}
209
210/// Takes a contents of a string literal (without quotes) and produces a
211/// sequence of escaped characters or errors.
212fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
213where
214 F: FnMut(Range<usize>, Result<char, EscapeError>),
215{
216 assert!(mode.in_double_quotes());
217 let initial_len = src.len();
218 let mut chars = src.chars();
219 while let Some(first_char) = chars.next() {
220 let start = initial_len - chars.as_str().len() - first_char.len_utf8();
221
222 let unescaped_char = match first_char {
223 '\\' => {
224 let (second_char, third_char) = {
225 let mut chars = chars.clone();
226 (chars.next(), chars.next())
227 };
228 match (second_char, third_char) {
229 (Some('\n'), _) | (Some('\r'), Some('\n')) => {
230 skip_ascii_whitespace(&mut chars);
231 continue;
232 }
233 _ => scan_escape(first_char, &mut chars, mode),
234 }
235 }
236 '\r' => {
237 let second_char = chars.clone().next();
238 if second_char == Some('\n') {
239 chars.next();
240 Ok('\n')
241 } else {
242 scan_escape(first_char, &mut chars, mode)
243 }
244 }
245 '\n' => Ok('\n'),
246 '\t' => Ok('\t'),
247 _ => scan_escape(first_char, &mut chars, mode),
248 };
249 let end = initial_len - chars.as_str().len();
250 callback(start..end, unescaped_char);
251 }
252
253 fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
254 let str = chars.as_str();
255 let first_non_space = str
256 .bytes()
257 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
258 .unwrap_or(str.len());
259 *chars = str[first_non_space..].chars()
260 }
261}
262
/// Converts a char to the byte with the same numeric value.
///
/// # Panics
///
/// Panics if the char's code point does not fit in a `u8`.
fn byte_from_char(c: char) -> u8 {
    let code_point = u32::from(c);
    assert!(code_point <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
    code_point as u8
}
268
/// Returns `true` when `x` lies in the ASCII range `0..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}
272
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_unescape_char_bad() {
        // Each group pairs an expected error with the literals that must produce it.
        let groups: &[(EscapeError, &[&str])] = &[
            (EscapeError::ZeroChars, &[""]),
            (EscapeError::LoneSlash, &[r"\"]),
            (EscapeError::EscapeOnlyChar, &["\n", "\r\n", "\t", "'"]),
            (EscapeError::BareCarriageReturn, &["\r"]),
            (
                EscapeError::MoreThanOneChar,
                &[
                    "spam",
                    r"\x0ff",
                    r#"\"a"#,
                    r"\na",
                    r"\ra",
                    r"\ta",
                    r"\\a",
                    r"\'a",
                    r"\0a",
                    r"\u{0}x",
                    r"\u{1F63b}}",
                ],
            ),
            (EscapeError::InvalidEscape, &[r"\v", r"\💩", r"\●"]),
            (EscapeError::TooShortHexEscape, &[r"\x", r"\x0", r"\xf", r"\xa"]),
            (EscapeError::InvalidCharInHexEscape, &[r"\xx", r"\xы", r"\x🦀", r"\xtt"]),
            (EscapeError::OutOfRangeHexEscape, &[r"\xff", r"\xFF", r"\x80"]),
            (EscapeError::NoBraceInUnicodeEscape, &[r"\u", r"\u[0123]"]),
            (EscapeError::InvalidCharInUnicodeEscape, &[r"\u{0x}"]),
            (EscapeError::UnclosedUnicodeEscape, &[r"\u{", r"\u{0000"]),
            (EscapeError::EmptyUnicodeEscape, &[r"\u{}"]),
            (EscapeError::LeadingUnderscoreUnicodeEscape, &[r"\u{_0000}"]),
            (EscapeError::OverlongUnicodeEscape, &[r"\u{0000000}"]),
            (
                EscapeError::OutOfRangeUnicodeEscape,
                &[r"\u{FFFFFF}", r"\u{ffffff}", r"\u{ffffff}"],
            ),
            (
                EscapeError::LoneSurrogateUnicodeEscape,
                &[
                    r"\u{DC00}",
                    r"\u{DDDD}",
                    r"\u{DFFF}",
                    r"\u{D800}",
                    r"\u{DAAA}",
                    r"\u{DBFF}",
                ],
            ),
        ];

        for (expected_error, literals) in groups {
            for literal_text in literals.iter() {
                let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
                assert_eq!(actual_result, Err(expected_error.clone()));
            }
        }
    }

    #[test]
    fn test_unescape_char_good() {
        let cases: &[(&str, char)] = &[
            ("a", 'a'),
            ("ы", 'ы'),
            ("🦀", '🦀'),
            (r#"\""#, '"'),
            (r"\n", '\n'),
            (r"\r", '\r'),
            (r"\t", '\t'),
            (r"\\", '\\'),
            (r"\'", '\''),
            (r"\0", '\0'),
            (r"\x00", '\0'),
            (r"\x5a", 'Z'),
            (r"\x5A", 'Z'),
            (r"\x7f", '\x7f'),
            (r"\u{0}", '\0'),
            (r"\u{000000}", '\0'),
            (r"\u{41}", 'A'),
            (r"\u{0041}", 'A'),
            (r"\u{00_41}", 'A'),
            (r"\u{4__1__}", 'A'),
            (r"\u{1F63b}", '😻'),
        ];

        for &(literal_text, expected_char) in cases {
            assert_eq!(unescape_char(literal_text), Ok(expected_char));
        }
    }

    #[test]
    fn test_unescape_str_good() {
        /// Unescapes `literal_text`, keeping only the first error, and compares
        /// the outcome with `expected`.
        fn check(literal_text: &str, expected: &str) {
            let mut unescaped = String::with_capacity(literal_text.len());
            let mut first_error = None;
            unescape_str(literal_text, &mut |range, c| {
                if first_error.is_none() {
                    match c {
                        Ok(c) => unescaped.push(c),
                        Err(e) => first_error = Some((range, e)),
                    }
                }
            });
            let actual = match first_error {
                None => Ok(unescaped.as_str()),
                Some(err) => Err(err),
            };
            assert_eq!(actual, Ok(expected))
        }

        let cases: &[(&str, &str)] = &[
            ("foo", "foo"),
            ("", ""),
            (" \t\n\r\n", " \t\n\n"),
            ("hello \\\n     world", "hello world"),
            ("hello \\\r\n     world", "hello world"),
            ("thread's", "thread's"),
        ];
        for &(literal_text, expected) in cases {
            check(literal_text, expected);
        }
    }

    #[test]
    fn test_unescape_byte_bad() {
        // Each group pairs an expected error with the literals that must produce it.
        let groups: &[(EscapeError, &[&str])] = &[
            (EscapeError::ZeroChars, &[""]),
            (EscapeError::LoneSlash, &[r"\"]),
            (EscapeError::EscapeOnlyChar, &["\n", "\r\n", "\t", "'"]),
            (EscapeError::BareCarriageReturn, &["\r"]),
            (
                EscapeError::MoreThanOneChar,
                &[
                    "spam",
                    r"\x0ff",
                    r#"\"a"#,
                    r"\na",
                    r"\ra",
                    r"\ta",
                    r"\\a",
                    r"\'a",
                    r"\0a",
                ],
            ),
            (EscapeError::InvalidEscape, &[r"\v", r"\💩", r"\●"]),
            (EscapeError::TooShortHexEscape, &[r"\x", r"\x0", r"\xa", r"\xf"]),
            (EscapeError::InvalidCharInHexEscape, &[r"\xx", r"\xы", r"\x🦀", r"\xtt"]),
            (EscapeError::NoBraceInUnicodeEscape, &[r"\u", r"\u[0123]"]),
            (EscapeError::InvalidCharInUnicodeEscape, &[r"\u{0x}"]),
            (EscapeError::UnclosedUnicodeEscape, &[r"\u{", r"\u{0000"]),
            (EscapeError::EmptyUnicodeEscape, &[r"\u{}"]),
            (EscapeError::LeadingUnderscoreUnicodeEscape, &[r"\u{_0000}"]),
            (EscapeError::OverlongUnicodeEscape, &[r"\u{0000000}"]),
            (EscapeError::NonAsciiCharInByte, &["ы", "🦀"]),
            (
                EscapeError::UnicodeEscapeInByte,
                &[
                    r"\u{0}",
                    r"\u{000000}",
                    r"\u{41}",
                    r"\u{0041}",
                    r"\u{00_41}",
                    r"\u{4__1__}",
                    r"\u{1F63b}",
                    r"\u{0}x",
                    r"\u{1F63b}}",
                    r"\u{FFFFFF}",
                    r"\u{ffffff}",
                    r"\u{ffffff}",
                    r"\u{DC00}",
                    r"\u{DDDD}",
                    r"\u{DFFF}",
                    r"\u{D800}",
                    r"\u{DAAA}",
                    r"\u{DBFF}",
                ],
            ),
        ];

        for (expected_error, literals) in groups {
            for literal_text in literals.iter() {
                let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
                assert_eq!(actual_result, Err(expected_error.clone()));
            }
        }
    }

    #[test]
    fn test_unescape_byte_good() {
        let cases: &[(&str, u8)] = &[
            ("a", b'a'),
            (r#"\""#, b'"'),
            (r"\n", b'\n'),
            (r"\r", b'\r'),
            (r"\t", b'\t'),
            (r"\\", b'\\'),
            (r"\'", b'\''),
            (r"\0", b'\0'),
            (r"\x00", b'\0'),
            (r"\x5a", b'Z'),
            (r"\x5A", b'Z'),
            (r"\x7f", 127),
            (r"\x80", 128),
            (r"\xff", 255),
            (r"\xFF", 255),
        ];

        for &(literal_text, expected_byte) in cases {
            assert_eq!(unescape_byte(literal_text), Ok(expected_byte));
        }
    }

    #[test]
    fn test_unescape_byte_str_good() {
        /// Unescapes `literal_text`, keeping only the first error, and compares
        /// the outcome with `expected`.
        fn check(literal_text: &str, expected: &[u8]) {
            let mut unescaped = Vec::with_capacity(literal_text.len());
            let mut first_error = None;
            unescape_byte_str(literal_text, &mut |range, c| {
                if first_error.is_none() {
                    match c {
                        Ok(c) => unescaped.push(c),
                        Err(e) => first_error = Some((range, e)),
                    }
                }
            });
            let actual = match first_error {
                None => Ok(unescaped.as_slice()),
                Some(err) => Err(err),
            };
            assert_eq!(actual, Ok(expected))
        }

        let cases: &[(&str, &[u8])] = &[
            ("foo", b"foo"),
            ("", b""),
            (" \t\n\r\n", b" \t\n\n"),
            ("hello \\\n     world", b"hello world"),
            ("hello \\\r\n     world", b"hello world"),
            ("thread's", b"thread's"),
        ];
        for &(literal_text, expected) in cases {
            check(literal_text, expected);
        }
    }
}
diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt
index 61a28134a..e0e38d37d 100644
--- a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt
+++ b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt
@@ -40,7 +40,6 @@ SOURCE_FILE@[0; 112)
40 WHITESPACE@[43; 44) " " 40 WHITESPACE@[43; 44) " "
41 LITERAL@[44; 59) 41 LITERAL@[44; 59)
42 STRING@[44; 59) "\"string\"invalid" 42 STRING@[44; 59) "\"string\"invalid"
43 err: `Invalid literal suffix`
44 SEMI@[59; 60) ";" 43 SEMI@[59; 60) ";"
45 WHITESPACE@[60; 65) "\n " 44 WHITESPACE@[60; 65) "\n "
46 LET_STMT@[65; 83) 45 LET_STMT@[65; 83)
@@ -53,7 +52,6 @@ SOURCE_FILE@[0; 112)
53 WHITESPACE@[72; 73) " " 52 WHITESPACE@[72; 73) " "
54 LITERAL@[73; 82) 53 LITERAL@[73; 82)
55 BYTE@[73; 82) "b\'b\'_suff" 54 BYTE@[73; 82) "b\'b\'_suff"
56 err: `Invalid literal suffix`
57 SEMI@[82; 83) ";" 55 SEMI@[82; 83) ";"
58 WHITESPACE@[83; 88) "\n " 56 WHITESPACE@[83; 88) "\n "
59 LET_STMT@[88; 109) 57 LET_STMT@[88; 109)
@@ -66,7 +64,6 @@ SOURCE_FILE@[0; 112)
66 WHITESPACE@[95; 96) " " 64 WHITESPACE@[95; 96) " "
67 LITERAL@[96; 108) 65 LITERAL@[96; 108)
68 BYTE_STRING@[96; 108) "b\"bs\"invalid" 66 BYTE_STRING@[96; 108) "b\"bs\"invalid"
69 err: `Invalid literal suffix`
70 SEMI@[108; 109) ";" 67 SEMI@[108; 109) ";"
71 WHITESPACE@[109; 110) "\n" 68 WHITESPACE@[109; 110) "\n"
72 R_CURLY@[110; 111) "}" 69 R_CURLY@[110; 111) "}"