From 69b79e3a73f9a1b820cf6d5ebc9968d8b08d4e68 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Tue, 7 Jul 2020 22:53:12 +0200 Subject: Replace ad hocery with science --- crates/vfs/Cargo.toml | 1 + crates/vfs/src/file_set.rs | 105 +++++++++++++++++++++------------------------ crates/vfs/src/vfs_path.rs | 31 +++++++++++++ 3 files changed, 81 insertions(+), 56 deletions(-) (limited to 'crates/vfs') diff --git a/crates/vfs/Cargo.toml b/crates/vfs/Cargo.toml index 263069002..b985c4c10 100644 --- a/crates/vfs/Cargo.toml +++ b/crates/vfs/Cargo.toml @@ -6,5 +6,6 @@ edition = "2018" [dependencies] rustc-hash = "1.0" +fst = "0.4" paths = { path = "../paths" } diff --git a/crates/vfs/src/file_set.rs b/crates/vfs/src/file_set.rs index b0130017e..37c479306 100644 --- a/crates/vfs/src/file_set.rs +++ b/crates/vfs/src/file_set.rs @@ -2,8 +2,9 @@ //! //! Files which do not belong to any explicitly configured `FileSet` belong to //! the default `FileSet`. -use std::{fmt, mem}; +use std::fmt; +use fst::{IntoStreamer, Streamer}; use rustc_hash::FxHashMap; use crate::{FileId, Vfs, VfsPath}; @@ -40,15 +41,10 @@ impl fmt::Debug for FileSet { } } -// Invariant: if k1 is a prefix of k2, then they are in different buckets (k2 -// is closer to 0th bucket). -// FIXME: replace with an actual trie some day. -type BadTrie = Vec>; - #[derive(Debug)] pub struct FileSetConfig { n_file_sets: usize, - trie: BadTrie, + map: fst::Map>, } impl Default for FileSetConfig { @@ -62,9 +58,10 @@ impl FileSetConfig { FileSetConfigBuilder::default() } pub fn partition(&self, vfs: &Vfs) -> Vec { + let mut scratch_space = Vec::new(); let mut res = vec![FileSet::default(); self.len()]; for (file_id, path) in vfs.iter() { - let root = self.classify(&path); + let root = self.classify(&path, &mut scratch_space); res[root].insert(file_id, path) } res @@ -72,8 +69,16 @@ impl FileSetConfig { fn len(&self) -> usize { self.n_file_sets } - fn classify(&self, path: &VfsPath) -> usize { - find_ancestor(&self.trie, path, is_prefix).copied().unwrap_or(self.len() - 1) + fn classify(&self, path: &VfsPath, scratch_space: &mut Vec) -> usize { + scratch_space.clear(); + path.encode(scratch_space); + let automaton = PrefixOf::new(scratch_space.as_slice()); + let mut longest_prefix = self.len() - 1; + let mut stream = self.map.search(automaton).into_stream(); + while let Some((_, v)) = stream.next() { + longest_prefix = v as usize; + } + longest_prefix } } @@ -96,63 +101,51 @@ impl FileSetConfigBuilder { } pub fn build(self) -> FileSetConfig { let n_file_sets = self.roots.len() + 1; - - let mut trie = BadTrie::new(); - - for (i, paths) in self.roots.into_iter().enumerate() { - for p in paths { - insert(&mut trie, p, i, is_prefix); + let map = { + let mut entries = Vec::new(); + for (i, paths) in self.roots.into_iter().enumerate() { + for p in paths { + let mut buf = Vec::new(); + p.encode(&mut buf); + entries.push((buf, i as u64)); + } } - } - trie.iter_mut().for_each(|it| it.sort()); - FileSetConfig { n_file_sets, trie } + entries.sort(); + entries.dedup_by(|(a, _), (b, _)| a == b); + fst::Map::from_iter(entries).unwrap() + }; + FileSetConfig { n_file_sets, map } } } -fn is_prefix(short: &VfsPath, long: &VfsPath) -> bool { - long.starts_with(short) +struct PrefixOf<'a> { + prefix_of: &'a [u8], } -fn insert bool>( - trie: &mut BadTrie, - mut key: K, - mut value: V, - is_prefix: P, -) { - 'outer: for level in 0.. { - if trie.len() == level { - trie.push(Vec::new()) - } - for (k, v) in trie[level].iter_mut() { - if is_prefix(&key, k) { - continue 'outer; - } - if is_prefix(k, &key) { - mem::swap(k, &mut key); - mem::swap(v, &mut value); - continue 'outer; - } - } - trie[level].push((key, value)); - return; +impl<'a> PrefixOf<'a> { + fn new(prefix_of: &'a [u8]) -> Self { + Self { prefix_of } } } -fn find_ancestor<'t, K: Ord, V, P: Fn(&K, &K) -> bool>( - trie: &'t BadTrie, - key: &K, - is_prefix: P, -) -> Option<&'t V> { - for bucket in trie { - let idx = match bucket.binary_search_by(|(k, _)| k.cmp(key)) { - Ok(it) => it, - Err(it) => it.saturating_sub(1), - }; - if !bucket.is_empty() && is_prefix(&bucket[idx].0, key) { - return Some(&bucket[idx].1); +impl fst::Automaton for PrefixOf<'_> { + type State = usize; + fn start(&self) -> usize { + 0 + } + fn is_match(&self, &state: &usize) -> bool { + state != !0 + } + fn can_match(&self, &state: &usize) -> bool { + state != !0 + } + fn accept(&self, &state: &usize, byte: u8) -> usize { + if self.prefix_of.get(state) == Some(&byte) { + state + 1 + } else { + !0 } } - None } #[test] diff --git a/crates/vfs/src/vfs_path.rs b/crates/vfs/src/vfs_path.rs index dc3031ada..04a42264e 100644 --- a/crates/vfs/src/vfs_path.rs +++ b/crates/vfs/src/vfs_path.rs @@ -48,6 +48,37 @@ impl VfsPath { (VfsPathRepr::VirtualPath(_), _) => false, } } + + // Don't make this `pub` + pub(crate) fn encode(&self, buf: &mut Vec) { + let tag = match &self.0 { + VfsPathRepr::PathBuf(_) => 0, + VfsPathRepr::VirtualPath(_) => 1, + }; + buf.push(tag); + match &self.0 { + VfsPathRepr::PathBuf(it) => { + let path: &std::ffi::OsStr = it.as_os_str(); + #[cfg(windows)] + { + use std::os::windows::ffi::OsStrExt; + for wchar in path.encode_wide() { + buf.extend(wchar.to_le_bytes().iter().copied()); + } + } + #[cfg(unix)] + { + use std::os::unix::ffi::OsStrExt; + buf.extend(path.as_bytes()); + } + #[cfg(not(any(windows, unix)))] + { + buf.extend(path.to_string_lossy().as_bytes()); + } + } + VfsPathRepr::VirtualPath(VirtualPath(s)) => buf.extend(s.as_bytes()), + } + } } #[derive(Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] -- cgit v1.2.3