aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/string_lexing/mod.rs
diff options
context:
space:
mode:
authorAdolfo Ochagavía <[email protected]>2018-11-04 14:06:38 +0000
committerAdolfo Ochagavía <[email protected]>2018-11-04 14:17:24 +0000
commit9b5bbab104d8ba445143f6f3a9e4149b40c29ae5 (patch)
treea1ef0fa5dbfd431e8a58afc6542c32c9ecefed04 /crates/ra_syntax/src/string_lexing/mod.rs
parent19c6cbd9540ef87850161cad7e108b380eceea24 (diff)
Add character literal parsing and validation
Diffstat (limited to 'crates/ra_syntax/src/string_lexing/mod.rs')
-rw-r--r--crates/ra_syntax/src/string_lexing/mod.rs311
1 files changed, 311 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/string_lexing/mod.rs b/crates/ra_syntax/src/string_lexing/mod.rs
new file mode 100644
index 000000000..6b52c62c3
--- /dev/null
+++ b/crates/ra_syntax/src/string_lexing/mod.rs
@@ -0,0 +1,311 @@
1use self::CharComponentKind::*;
2use rowan::{TextRange, TextUnit};
3
4pub fn parse_char_literal(src: &str) -> CharComponentIterator {
5 CharComponentIterator {
6 parser: Parser::new(src),
7 has_closing_quote: false,
8 }
9}
10
11#[derive(Debug, Eq, PartialEq, Clone)]
12pub struct CharComponent {
13 pub range: TextRange,
14 pub kind: CharComponentKind,
15}
16
17impl CharComponent {
18 fn new(range: TextRange, kind: CharComponentKind) -> CharComponent {
19 CharComponent { range, kind }
20 }
21}
22
/// Classification of a single component found inside a char literal.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CharComponentKind {
    /// A plain character that is not introduced by a backslash.
    CodePoint,
    /// A backslash escape that is neither `\x` nor `\u`.
    AsciiEscape,
    /// An escape introduced by `\x`.
    AsciiCodeEscape,
    /// An escape introduced by `\u`.
    UnicodeEscape,
}
30
31pub struct CharComponentIterator<'a> {
32 parser: Parser<'a>,
33 pub has_closing_quote: bool,
34}
35
36impl<'a> Iterator for CharComponentIterator<'a> {
37 type Item = CharComponent;
38 fn next(&mut self) -> Option<CharComponent> {
39 if self.parser.pos == 0 {
40 assert!(
41 self.parser.advance() == '\'',
42 "char literal should start with a quote"
43 );
44 }
45
46 if let Some(component) = self.parser.parse_char_component() {
47 return Some(component);
48 }
49
50 // We get here when there are no char components left to parse
51 if self.parser.peek() == Some('\'') {
52 self.parser.advance();
53 self.has_closing_quote = true;
54 }
55
56 assert!(
57 self.parser.peek() == None,
58 "char literal should leave no unparsed input: src = {}, pos = {}, length = {}",
59 self.parser.src,
60 self.parser.pos,
61 self.parser.src.len()
62 );
63
64 None
65 }
66}
67
68pub struct Parser<'a> {
69 src: &'a str,
70 pos: usize,
71}
72
73impl<'a> Parser<'a> {
74 pub fn new(src: &'a str) -> Parser<'a> {
75 Parser { src, pos: 0 }
76 }
77
78 // Utility methods
79
80 pub fn peek(&self) -> Option<char> {
81 if self.pos == self.src.len() {
82 return None;
83 }
84
85 self.src[self.pos..].chars().next()
86 }
87
88 pub fn advance(&mut self) -> char {
89 let next = self
90 .peek()
91 .expect("cannot advance if end of input is reached");
92 self.pos += next.len_utf8();
93 next
94 }
95
96 pub fn get_pos(&self) -> TextUnit {
97 (self.pos as u32).into()
98 }
99
100 // Char parsing methods
101
102 fn parse_unicode_escape(&mut self, start: TextUnit) -> CharComponent {
103 // Note: validation of UnicodeEscape will be done elsewhere:
104 // * Only hex digits or underscores allowed
105 // * Max 6 chars
106 // * Within allowed range (must be at most 10FFFF)
107 match self.peek() {
108 Some('{') => {
109 self.advance();
110
111 // Parse anything until we reach `}`
112 while let Some(next) = self.peek() {
113 self.advance();
114 if next == '}' {
115 break;
116 }
117 }
118
119 let end = self.get_pos();
120 CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
121 }
122 Some(_) | None => {
123 let end = self.get_pos();
124 CharComponent::new(TextRange::from_to(start, end), UnicodeEscape)
125 }
126 }
127 }
128
129 fn parse_ascii_code_escape(&mut self, start: TextUnit) -> CharComponent {
130 // Note: validation of AsciiCodeEscape will be done elsewhere:
131 // * First digit is octal
132 // * Second digit is hex
133 let code_start = self.get_pos();
134 while let Some(next) = self.peek() {
135 if next == '\'' || (self.get_pos() - code_start == 2.into()) {
136 break;
137 }
138
139 self.advance();
140 }
141
142 let end = self.get_pos();
143 CharComponent::new(TextRange::from_to(start, end), AsciiCodeEscape)
144 }
145
146 fn parse_escape(&mut self, start: TextUnit) -> CharComponent {
147 // Note: validation of AsciiEscape will be done elsewhere:
148 // * The escape sequence is non-empty
149 // * The escape sequence is valid
150 if self.peek().is_none() {
151 return CharComponent::new(TextRange::from_to(start, start), AsciiEscape);
152 }
153
154 let next = self.advance();
155 let end = self.get_pos();
156 let range = TextRange::from_to(start, end);
157 match next {
158 'x' => self.parse_ascii_code_escape(start),
159 'u' => self.parse_unicode_escape(start),
160 _ => CharComponent::new(range, AsciiEscape),
161 }
162 }
163
164 pub fn parse_char_component(&mut self) -> Option<CharComponent> {
165 let next = self.peek()?;
166
167 // Ignore character close
168 if next == '\'' {
169 return None;
170 }
171
172 let start = self.get_pos();
173 self.advance();
174
175 if next == '\\' {
176 Some(self.parse_escape(start))
177 } else {
178 let end = self.get_pos();
179 Some(CharComponent::new(
180 TextRange::from_to(start, end),
181 CodePoint,
182 ))
183 }
184 }
185}
186
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the component iterator to completion and reports whether a
    /// closing quote was seen, along with every component produced.
    fn parse(src: &str) -> (bool, Vec<CharComponent>) {
        let mut iter = super::parse_char_literal(src);
        let components: Vec<_> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Parses a literal expected to hold exactly one component and no closing quote.
    fn unclosed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert!(components.len() == 1);
        components.remove(0)
    }

    /// Parses a literal expected to hold exactly one component and a closing quote.
    fn closed_char_component(src: &str) -> CharComponent {
        let (closed, mut components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert!(
            components.len() == 1,
            "Literal: {}\nComponents: {:#?}",
            src,
            components
        );
        components.remove(0)
    }

    /// Parses a literal expected to have a closing quote and returns all its components.
    fn closed_char_components(src: &str) -> Vec<CharComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and the closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        let len = src.len() as u32;
        TextRange::from_to(1.into(), (len - 1).into())
    }

    /// Range of everything after the opening quote of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        let len = src.len() as u32;
        TextRange::from_to(1.into(), len.into())
    }

    #[test]
    fn test_unicode_escapes() {
        for escape in &[r"{DEAD}", "{BEEF}", "{FF}", ""] {
            let literal = format!(r"'\u{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        for escape in &["{DEAD", "{BEEF", "{FF"] {
            let literal = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&literal));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        for digits in &[r"", r"5", r"55"] {
            let literal = format!(r"'\x{}'", digits);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CharComponentKind::CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let escapes = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for escape in escapes {
            let literal = format!("'{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::AsciiEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_no_escapes() {
        // These characters only form an escape when preceded by a backslash
        let plain_chars = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &ch in plain_chars {
            let literal = format!("'{}'", ch);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CharComponentKind::CodePoint);
            assert_eq!(component.range, range_closed(&literal));
        }
    }
}