aboutsummaryrefslogtreecommitdiff
path: root/crates/ra_syntax/src/string_lexing
diff options
context:
space:
mode:
Diffstat (limited to 'crates/ra_syntax/src/string_lexing')
-rw-r--r-- crates/ra_syntax/src/string_lexing/parser.rs 168
-rw-r--r-- crates/ra_syntax/src/string_lexing/string.rs 222
2 files changed, 0 insertions, 390 deletions
diff --git a/crates/ra_syntax/src/string_lexing/parser.rs b/crates/ra_syntax/src/string_lexing/parser.rs
deleted file mode 100644
index 7469eb903..000000000
--- a/crates/ra_syntax/src/string_lexing/parser.rs
+++ /dev/null
@@ -1,168 +0,0 @@
1use rowan::{TextRange, TextUnit};
2
3use self::StringComponentKind::*;
4
5pub struct Parser<'a> {
6 pub(super) quote: u8,
7 pub(super) src: &'a str,
8 pub(super) pos: usize,
9}
10
11impl<'a> Parser<'a> {
12 pub fn new(src: &'a str, quote: u8) -> Parser<'a> {
13 Parser { quote, src, pos: 0 }
14 }
15
16 // Utility methods
17
18 pub fn peek(&self) -> Option<char> {
19 if self.pos == self.src.len() {
20 return None;
21 }
22
23 self.src[self.pos..].chars().next()
24 }
25
26 pub fn advance(&mut self) -> char {
27 let next = self.peek().expect("cannot advance if end of input is reached");
28 self.pos += next.len_utf8();
29 next
30 }
31
32 pub fn skip_whitespace(&mut self) {
33 while self.peek().map(|c| c.is_whitespace()) == Some(true) {
34 self.advance();
35 }
36 }
37
38 pub fn get_pos(&self) -> TextUnit {
39 (self.pos as u32).into()
40 }
41
42 // Char parsing methods
43
44 fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent {
45 match self.peek() {
46 Some('{') => {
47 self.advance();
48
49 // Parse anything until we reach `}`
50 while let Some(next) = self.peek() {
51 self.advance();
52 if next == '}' {
53 break;
54 }
55 }
56
57 let end = self.get_pos();
58 StringComponent::new(TextRange::from_to(start, end), UnicodeEscape)
59 }
60 Some(_) | None => {
61 let end = self.get_pos();
62 StringComponent::new(TextRange::from_to(start, end), UnicodeEscape)
63 }
64 }
65 }
66
67 fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent {
68 let code_start = self.get_pos();
69 while let Some(next) = self.peek() {
70 if next == '\'' || (self.get_pos() - code_start == 2.into()) {
71 break;
72 }
73
74 self.advance();
75 }
76
77 let end = self.get_pos();
78 StringComponent::new(TextRange::from_to(start, end), AsciiCodeEscape)
79 }
80
81 fn parse_escape(&mut self, start: TextUnit) -> StringComponent {
82 if self.peek().is_none() {
83 return StringComponent::new(TextRange::from_to(start, self.get_pos()), AsciiEscape);
84 }
85
86 let next = self.advance();
87 let end = self.get_pos();
88 let range = TextRange::from_to(start, end);
89 match next {
90 'x' => self.parse_ascii_code_escape(start),
91 'u' => self.parse_unicode_escape(start),
92 _ => StringComponent::new(range, AsciiEscape),
93 }
94 }
95
96 pub fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {
97 // In string literals, when a `\` occurs immediately before the newline, the `\`,
98 // the newline, and all whitespace at the beginning of the next line are ignored
99 match self.peek() {
100 Some('\n') | Some('\r') => {
101 self.skip_whitespace();
102 Some(StringComponent::new(
103 TextRange::from_to(start, self.get_pos()),
104 StringComponentKind::IgnoreNewline,
105 ))
106 }
107 _ => None,
108 }
109 }
110
111 pub fn parse_component(&mut self) -> Option<StringComponent> {
112 let next = self.peek()?;
113
114 // Ignore string close
115 if next == self.quote as char {
116 return None;
117 }
118
119 let start = self.get_pos();
120 self.advance();
121
122 if next == '\\' {
123 // Strings can use `\` to ignore newlines, so we first try to parse one of those
124 // before falling back to parsing char escapes
125 if self.quote == b'"' {
126 if let Some(component) = self.parse_ignore_newline(start) {
127 return Some(component);
128 }
129 }
130
131 Some(self.parse_escape(start))
132 } else {
133 let end = self.get_pos();
134 Some(StringComponent::new(TextRange::from_to(start, end), CodePoint))
135 }
136 }
137
138 pub fn parse_suffix(&mut self) -> Option<TextRange> {
139 let start = self.get_pos();
140 let _ = self.peek()?;
141 while let Some(_) = self.peek() {
142 self.advance();
143 }
144 let end = self.get_pos();
145 Some(TextRange::from_to(start, end))
146 }
147}
148
149#[derive(Debug, Eq, PartialEq, Clone)]
150pub struct StringComponent {
151 pub range: TextRange,
152 pub kind: StringComponentKind,
153}
154
155impl StringComponent {
156 fn new(range: TextRange, kind: StringComponentKind) -> StringComponent {
157 StringComponent { range, kind }
158 }
159}
160
/// Classification of a single component inside a char or string literal.
///
/// The enum is fieldless, so it is `Copy` and `Hash` in addition to the
/// comparison traits.
#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
pub enum StringComponentKind {
    /// A `\` followed by a line break, plus the whitespace after it.
    IgnoreNewline,
    /// A single character that is not part of an escape.
    CodePoint,
    /// A `\` followed by any character other than `x` or `u`, or a trailing `\`.
    AsciiEscape,
    /// An escape introduced by `\x`.
    AsciiCodeEscape,
    /// An escape introduced by `\u`.
    UnicodeEscape,
}
diff --git a/crates/ra_syntax/src/string_lexing/string.rs b/crates/ra_syntax/src/string_lexing/string.rs
deleted file mode 100644
index a4742a0d1..000000000
--- a/crates/ra_syntax/src/string_lexing/string.rs
+++ /dev/null
@@ -1,222 +0,0 @@
1use crate::{
2 TextRange,
3 string_lexing::{
4 parser::Parser,
5 StringComponent,
6}};
7
8pub fn parse_string_literal(src: &str) -> StringComponentIterator {
9 StringComponentIterator {
10 parser: Parser::new(src, b'"'),
11 has_closing_quote: false,
12 suffix: None,
13 prefix: None,
14 quote: b'"',
15 }
16}
17
18pub fn parse_byte_string_literal(src: &str) -> StringComponentIterator {
19 StringComponentIterator {
20 parser: Parser::new(src, b'"'),
21 has_closing_quote: false,
22 suffix: None,
23 prefix: Some(b'b'),
24 quote: b'"',
25 }
26}
27
28pub fn parse_char_literal(src: &str) -> StringComponentIterator {
29 StringComponentIterator {
30 parser: Parser::new(src, b'\''),
31 has_closing_quote: false,
32 suffix: None,
33 prefix: None,
34 quote: b'\'',
35 }
36}
37
38pub fn parse_byte_literal(src: &str) -> StringComponentIterator {
39 StringComponentIterator {
40 parser: Parser::new(src, b'\''),
41 has_closing_quote: false,
42 suffix: None,
43 prefix: Some(b'b'),
44 quote: b'\'',
45 }
46}
47
48pub struct StringComponentIterator<'a> {
49 parser: Parser<'a>,
50 pub has_closing_quote: bool,
51 pub suffix: Option<TextRange>,
52 prefix: Option<u8>,
53 quote: u8,
54}
55
56impl<'a> Iterator for StringComponentIterator<'a> {
57 type Item = StringComponent;
58 fn next(&mut self) -> Option<StringComponent> {
59 if self.parser.pos == 0 {
60 if let Some(prefix) = self.prefix {
61 assert!(
62 self.parser.advance() == prefix as char,
63 "literal should start with a {:?}",
64 prefix as char,
65 );
66 }
67 assert!(
68 self.parser.advance() == self.quote as char,
69 "literal should start with a {:?}",
70 self.quote as char,
71 );
72 }
73
74 if let Some(component) = self.parser.parse_component() {
75 return Some(component);
76 }
77
78 // We get here when there are no char components left to parse
79 if self.parser.peek() == Some(self.quote as char) {
80 self.parser.advance();
81 self.has_closing_quote = true;
82 if let Some(range) = self.parser.parse_suffix() {
83 self.suffix = Some(range);
84 }
85 }
86
87 assert!(
88 self.parser.peek() == None,
89 "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}",
90 self.parser.src,
91 self.parser.pos,
92 self.parser.src.len()
93 );
94
95 None
96 }
97}
98
#[cfg(test)]
mod tests {
    use rowan::TextRange;
    use crate::string_lexing::{
        StringComponent,
        StringComponentKind::*,
    };

    /// Lexes `src` as a char literal and returns whether a closing quote was
    /// seen together with every component produced.
    fn parse(src: &str) -> (bool, Vec<StringComponent>) {
        let mut component_iterator = super::parse_char_literal(src);
        // `by_ref` keeps the iterator alive so the flag can be read afterwards.
        let components: Vec<_> = component_iterator.by_ref().collect();
        (component_iterator.has_closing_quote, components)
    }

    /// Returns the single component of a literal that must lack its closing quote.
    fn unclosed_char_component(src: &str) -> StringComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(!has_closing_quote, "char should not have closing quote");
        assert_eq!(components.len(), 1, "Literal: {}\nComponents: {:#?}", src, components);
        components[0].clone()
    }

    /// Returns the single component of a literal that must have its closing quote.
    fn closed_char_component(src: &str) -> StringComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        assert_eq!(components.len(), 1, "Literal: {}\nComponents: {:#?}", src, components);
        components[0].clone()
    }

    /// Returns every component of a literal that must have its closing quote.
    fn closed_char_components(src: &str) -> Vec<StringComponent> {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and closing quote of `src`.
    fn range_closed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32 - 1).into())
    }

    /// Range of everything after the opening quote of `src`.
    fn range_unclosed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32).into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = closed_char_component(&escape_sequence);
            let expected_range = range_closed(&escape_sequence);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            // Without a `}` the escape runs through the final quote, so the
            // literal counts as unclosed.
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&escape_sequence);
            let expected_range = range_unclosed(&escape_sequence);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_empty_char() {
        let (has_closing_quote, components) = parse("''");
        assert!(has_closing_quote, "char should have closing quote");
        assert!(components.is_empty(), "Components: {:#?}", components);
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let lit_text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, AsciiEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }

    #[test]
    fn test_no_escapes() {
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CodePoint);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }
}