diff options
| author | Adolfo Ochagavía <[email protected]> | 2018-11-15 16:34:05 +0000 |
|---|---|---|
| committer | Adolfo Ochagavía <[email protected]> | 2018-11-16 11:15:50 +0000 |
| commit | 136d1864bcb5046e7f334ac347a8a94946d1ba90 (patch) | |
| tree | 94be94eff9265d0e41cb847be2190e35416eb0a3 /crates/ra_editor | |
| parent | 9aebd9e6caf49467ca20caf2583c47cf5092c788 (diff) | |
Support UTF-16 chars in LineIndex
Diffstat (limited to 'crates/ra_editor')
| -rw-r--r-- | crates/ra_editor/src/line_index.rs | 303 |
1 file changed, 197 insertions, 106 deletions
diff --git a/crates/ra_editor/src/line_index.rs b/crates/ra_editor/src/line_index.rs index 9abbb0d09..0b3a28cd4 100644 --- a/crates/ra_editor/src/line_index.rs +++ b/crates/ra_editor/src/line_index.rs | |||
| @@ -1,43 +1,124 @@ | |||
| 1 | use crate::TextUnit; | 1 | use crate::TextUnit; |
| 2 | use rustc_hash::FxHashMap; | ||
| 2 | use superslice::Ext; | 3 | use superslice::Ext; |
| 3 | 4 | ||
| 4 | #[derive(Clone, Debug, Hash, PartialEq, Eq)] | 5 | #[derive(Clone, Debug, PartialEq, Eq)] |
| 5 | pub struct LineIndex { | 6 | pub struct LineIndex { |
| 6 | newlines: Vec<TextUnit>, | 7 | newlines: Vec<TextUnit>, |
| 8 | utf16_lines: FxHashMap<u32, Vec<Utf16Char>>, | ||
| 7 | } | 9 | } |
| 8 | 10 | ||
| 9 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] | 11 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] |
| 10 | pub struct LineCol { | 12 | pub struct LineCol { |
| 11 | pub line: u32, | 13 | pub line: u32, |
| 12 | pub col: TextUnit, | 14 | pub col: u32, |
| 15 | } | ||
| 16 | |||
| 17 | #[derive(Clone, Debug, Hash, PartialEq, Eq)] | ||
| 18 | struct Utf16Char { | ||
| 19 | start: TextUnit, | ||
| 20 | end: TextUnit, | ||
| 21 | } | ||
| 22 | |||
| 23 | impl Utf16Char { | ||
| 24 | fn len(&self) -> TextUnit { | ||
| 25 | self.end - self.start | ||
| 26 | } | ||
| 13 | } | 27 | } |
| 14 | 28 | ||
| 15 | impl LineIndex { | 29 | impl LineIndex { |
| 16 | pub fn new(text: &str) -> LineIndex { | 30 | pub fn new(text: &str) -> LineIndex { |
| 31 | let mut utf16_lines = FxHashMap::default(); | ||
| 32 | let mut utf16_chars = Vec::new(); | ||
| 33 | |||
| 17 | let mut newlines = vec![0.into()]; | 34 | let mut newlines = vec![0.into()]; |
| 18 | let mut curr = 0.into(); | 35 | let mut curr_row = 0.into(); |
| 36 | let mut curr_col = 0.into(); | ||
| 37 | let mut line = 0; | ||
| 19 | for c in text.chars() { | 38 | for c in text.chars() { |
| 20 | curr += TextUnit::of_char(c); | 39 | curr_row += TextUnit::of_char(c); |
| 21 | if c == '\n' { | 40 | if c == '\n' { |
| 22 | newlines.push(curr); | 41 | newlines.push(curr_row); |
| 42 | |||
| 43 | // Save any utf-16 characters seen in the previous line | ||
| 44 | if utf16_chars.len() > 0 { | ||
| 45 | utf16_lines.insert(line, utf16_chars); | ||
| 46 | utf16_chars = Vec::new(); | ||
| 47 | } | ||
| 48 | |||
| 49 | // Prepare for processing the next line | ||
| 50 | curr_col = 0.into(); | ||
| 51 | line += 1; | ||
| 52 | continue; | ||
| 23 | } | 53 | } |
| 54 | |||
| 55 | let char_len = TextUnit::of_char(c); | ||
| 56 | if char_len.to_usize() > 1 { | ||
| 57 | utf16_chars.push(Utf16Char { | ||
| 58 | start: curr_col, | ||
| 59 | end: curr_col + char_len, | ||
| 60 | }); | ||
| 61 | } | ||
| 62 | |||
| 63 | curr_col += char_len; | ||
| 64 | } | ||
| 65 | LineIndex { | ||
| 66 | newlines, | ||
| 67 | utf16_lines, | ||
| 24 | } | 68 | } |
| 25 | LineIndex { newlines } | ||
| 26 | } | 69 | } |
| 27 | 70 | ||
| 28 | pub fn line_col(&self, offset: TextUnit) -> LineCol { | 71 | pub fn line_col(&self, offset: TextUnit) -> LineCol { |
| 29 | let line = self.newlines.upper_bound(&offset) - 1; | 72 | let line = self.newlines.upper_bound(&offset) - 1; |
| 30 | let line_start_offset = self.newlines[line]; | 73 | let line_start_offset = self.newlines[line]; |
| 31 | let col = offset - line_start_offset; | 74 | let col = offset - line_start_offset; |
| 75 | |||
| 32 | LineCol { | 76 | LineCol { |
| 33 | line: line as u32, | 77 | line: line as u32, |
| 34 | col, | 78 | col: self.utf8_to_utf16_col(line as u32, col) as u32, |
| 35 | } | 79 | } |
| 36 | } | 80 | } |
| 37 | 81 | ||
| 38 | pub fn offset(&self, line_col: LineCol) -> TextUnit { | 82 | pub fn offset(&self, line_col: LineCol) -> TextUnit { |
| 39 | //TODO: return Result | 83 | //TODO: return Result |
| 40 | self.newlines[line_col.line as usize] + line_col.col | 84 | let col = self.utf16_to_utf8_col(line_col.line, line_col.col); |
| 85 | self.newlines[line_col.line as usize] + col | ||
| 86 | } | ||
| 87 | |||
| 88 | fn utf8_to_utf16_col(&self, line: u32, mut col: TextUnit) -> usize { | ||
| 89 | if let Some(utf16_chars) = self.utf16_lines.get(&line) { | ||
| 90 | let mut correction = TextUnit::from_usize(0); | ||
| 91 | for c in utf16_chars { | ||
| 92 | if col >= c.end { | ||
| 93 | correction += c.len() - TextUnit::from_usize(1); | ||
| 94 | } else { | ||
| 95 | // From here on, all utf16 characters come *after* the character we are mapping, | ||
| 96 | // so we don't need to take them into account | ||
| 97 | break; | ||
| 98 | } | ||
| 99 | } | ||
| 100 | |||
| 101 | col -= correction; | ||
| 102 | } | ||
| 103 | |||
| 104 | col.to_usize() | ||
| 105 | } | ||
| 106 | |||
| 107 | fn utf16_to_utf8_col(&self, line: u32, col: u32) -> TextUnit { | ||
| 108 | let mut col: TextUnit = col.into(); | ||
| 109 | if let Some(utf16_chars) = self.utf16_lines.get(&line) { | ||
| 110 | for c in utf16_chars { | ||
| 111 | if col >= c.start { | ||
| 112 | col += c.len() - TextUnit::from_usize(1); | ||
| 113 | } else { | ||
| 114 | // From here on, all utf16 characters come *after* the character we are mapping, | ||
| 115 | // so we don't need to take them into account | ||
| 116 | break; | ||
| 117 | } | ||
| 118 | } | ||
| 119 | } | ||
| 120 | |||
| 121 | col | ||
| 41 | } | 122 | } |
| 42 | } | 123 | } |
| 43 | 124 | ||
| @@ -45,105 +126,115 @@ impl LineIndex { | |||
| 45 | fn test_line_index() { | 126 | fn test_line_index() { |
| 46 | let text = "hello\nworld"; | 127 | let text = "hello\nworld"; |
| 47 | let index = LineIndex::new(text); | 128 | let index = LineIndex::new(text); |
| 48 | assert_eq!( | 129 | assert_eq!(index.line_col(0.into()), LineCol { line: 0, col: 0 }); |
| 49 | index.line_col(0.into()), | 130 | assert_eq!(index.line_col(1.into()), LineCol { line: 0, col: 1 }); |
| 50 | LineCol { | 131 | assert_eq!(index.line_col(5.into()), LineCol { line: 0, col: 5 }); |
| 51 | line: 0, | 132 | assert_eq!(index.line_col(6.into()), LineCol { line: 1, col: 0 }); |
| 52 | col: 0.into() | 133 | assert_eq!(index.line_col(7.into()), LineCol { line: 1, col: 1 }); |
| 53 | } | 134 | assert_eq!(index.line_col(8.into()), LineCol { line: 1, col: 2 }); |
| 54 | ); | 135 | assert_eq!(index.line_col(10.into()), LineCol { line: 1, col: 4 }); |
| 55 | assert_eq!( | 136 | assert_eq!(index.line_col(11.into()), LineCol { line: 1, col: 5 }); |
| 56 | index.line_col(1.into()), | 137 | assert_eq!(index.line_col(12.into()), LineCol { line: 1, col: 6 }); |
| 57 | LineCol { | ||
| 58 | line: 0, | ||
| 59 | col: 1.into() | ||
| 60 | } | ||
| 61 | ); | ||
| 62 | assert_eq!( | ||
| 63 | index.line_col(5.into()), | ||
| 64 | LineCol { | ||
| 65 | line: 0, | ||
| 66 | col: 5.into() | ||
| 67 | } | ||
| 68 | ); | ||
| 69 | assert_eq!( | ||
| 70 | index.line_col(6.into()), | ||
| 71 | LineCol { | ||
| 72 | line: 1, | ||
| 73 | col: 0.into() | ||
| 74 | } | ||
| 75 | ); | ||
| 76 | assert_eq!( | ||
| 77 | index.line_col(7.into()), | ||
| 78 | LineCol { | ||
| 79 | line: 1, | ||
| 80 | col: 1.into() | ||
| 81 | } | ||
| 82 | ); | ||
| 83 | assert_eq!( | ||
| 84 | index.line_col(8.into()), | ||
| 85 | LineCol { | ||
| 86 | line: 1, | ||
| 87 | col: 2.into() | ||
| 88 | } | ||
| 89 | ); | ||
| 90 | assert_eq!( | ||
| 91 | index.line_col(10.into()), | ||
| 92 | LineCol { | ||
| 93 | line: 1, | ||
| 94 | col: 4.into() | ||
| 95 | } | ||
| 96 | ); | ||
| 97 | assert_eq!( | ||
| 98 | index.line_col(11.into()), | ||
| 99 | LineCol { | ||
| 100 | line: 1, | ||
| 101 | col: 5.into() | ||
| 102 | } | ||
| 103 | ); | ||
| 104 | assert_eq!( | ||
| 105 | index.line_col(12.into()), | ||
| 106 | LineCol { | ||
| 107 | line: 1, | ||
| 108 | col: 6.into() | ||
| 109 | } | ||
| 110 | ); | ||
| 111 | 138 | ||
| 112 | let text = "\nhello\nworld"; | 139 | let text = "\nhello\nworld"; |
| 113 | let index = LineIndex::new(text); | 140 | let index = LineIndex::new(text); |
| 114 | assert_eq!( | 141 | assert_eq!(index.line_col(0.into()), LineCol { line: 0, col: 0 }); |
| 115 | index.line_col(0.into()), | 142 | assert_eq!(index.line_col(1.into()), LineCol { line: 1, col: 0 }); |
| 116 | LineCol { | 143 | assert_eq!(index.line_col(2.into()), LineCol { line: 1, col: 1 }); |
| 117 | line: 0, | 144 | assert_eq!(index.line_col(6.into()), LineCol { line: 1, col: 5 }); |
| 118 | col: 0.into() | 145 | assert_eq!(index.line_col(7.into()), LineCol { line: 2, col: 0 }); |
| 119 | } | 146 | } |
| 120 | ); | 147 | |
| 121 | assert_eq!( | 148 | #[cfg(test)] |
| 122 | index.line_col(1.into()), | 149 | mod test_utf8_utf16_conv { |
| 123 | LineCol { | 150 | use super::*; |
| 124 | line: 1, | 151 | |
| 125 | col: 0.into() | 152 | #[test] |
| 126 | } | 153 | fn test_char_len() { |
| 127 | ); | 154 | assert_eq!('メ'.len_utf8(), 3); |
| 128 | assert_eq!( | 155 | assert_eq!('メ'.len_utf16(), 1); |
| 129 | index.line_col(2.into()), | 156 | } |
| 130 | LineCol { | 157 | |
| 131 | line: 1, | 158 | #[test] |
| 132 | col: 1.into() | 159 | fn test_empty_index() { |
| 133 | } | 160 | let col_index = LineIndex::new( |
| 134 | ); | 161 | " |
| 135 | assert_eq!( | 162 | const C: char = 'x'; |
| 136 | index.line_col(6.into()), | 163 | ", |
| 137 | LineCol { | 164 | ); |
| 138 | line: 1, | 165 | assert_eq!(col_index.utf16_lines.len(), 0); |
| 139 | col: 5.into() | 166 | } |
| 140 | } | 167 | |
| 141 | ); | 168 | #[test] |
| 142 | assert_eq!( | 169 | fn test_single_char() { |
| 143 | index.line_col(7.into()), | 170 | let col_index = LineIndex::new( |
| 144 | LineCol { | 171 | " |
| 145 | line: 2, | 172 | const C: char = 'メ'; |
| 146 | col: 0.into() | 173 | ", |
| 147 | } | 174 | ); |
| 148 | ); | 175 | |
| 176 | assert_eq!(col_index.utf16_lines.len(), 1); | ||
| 177 | assert_eq!(col_index.utf16_lines[&1].len(), 1); | ||
| 178 | assert_eq!( | ||
| 179 | col_index.utf16_lines[&1][0], | ||
| 180 | Utf16Char { | ||
| 181 | start: 17.into(), | ||
| 182 | end: 20.into() | ||
| 183 | } | ||
| 184 | ); | ||
| 185 | |||
| 186 | // UTF-8 to UTF-16, no changes | ||
| 187 | assert_eq!(col_index.utf8_to_utf16_col(1, 15.into()), 15); | ||
| 188 | |||
| 189 | // UTF-8 to UTF-16 | ||
| 190 | assert_eq!(col_index.utf8_to_utf16_col(1, 22.into()), 20); | ||
| 191 | |||
| 192 | // UTF-16 to UTF-8, no changes | ||
| 193 | assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextUnit::from(15)); | ||
| 194 | |||
| 195 | // UTF-16 to UTF-8 | ||
| 196 | assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextUnit::from(21)); | ||
| 197 | } | ||
| 198 | |||
| 199 | #[test] | ||
| 200 | fn test_string() { | ||
| 201 | let col_index = LineIndex::new( | ||
| 202 | " | ||
| 203 | const C: char = \"メ メ\"; | ||
| 204 | ", | ||
| 205 | ); | ||
| 206 | |||
| 207 | assert_eq!(col_index.utf16_lines.len(), 1); | ||
| 208 | assert_eq!(col_index.utf16_lines[&1].len(), 2); | ||
| 209 | assert_eq!( | ||
| 210 | col_index.utf16_lines[&1][0], | ||
| 211 | Utf16Char { | ||
| 212 | start: 17.into(), | ||
| 213 | end: 20.into() | ||
| 214 | } | ||
| 215 | ); | ||
| 216 | assert_eq!( | ||
| 217 | col_index.utf16_lines[&1][1], | ||
| 218 | Utf16Char { | ||
| 219 | start: 21.into(), | ||
| 220 | end: 24.into() | ||
| 221 | } | ||
| 222 | ); | ||
| 223 | |||
| 224 | // UTF-8 to UTF-16 | ||
| 225 | assert_eq!(col_index.utf8_to_utf16_col(1, 15.into()), 15); | ||
| 226 | |||
| 227 | assert_eq!(col_index.utf8_to_utf16_col(1, 21.into()), 19); | ||
| 228 | assert_eq!(col_index.utf8_to_utf16_col(1, 25.into()), 21); | ||
| 229 | |||
| 230 | assert!(col_index.utf8_to_utf16_col(2, 15.into()) == 15); | ||
| 231 | |||
| 232 | // UTF-16 to UTF-8 | ||
| 233 | assert_eq!(col_index.utf16_to_utf8_col(1, 15), TextUnit::from_usize(15)); | ||
| 234 | |||
| 235 | assert_eq!(col_index.utf16_to_utf8_col(1, 18), TextUnit::from_usize(20)); | ||
| 236 | assert_eq!(col_index.utf16_to_utf8_col(1, 19), TextUnit::from_usize(23)); | ||
| 237 | |||
| 238 | assert_eq!(col_index.utf16_to_utf8_col(2, 15), TextUnit::from_usize(15)); | ||
| 239 | } | ||
| 149 | } | 240 | } |
