diff options
-rw-r--r-- | Cargo.lock | 16 | ||||
-rw-r--r-- | Cargo.toml | 1 | ||||
-rw-r--r-- | readme | 7 | ||||
-rw-r--r-- | src/lib.rs | 20 |
4 files changed, 37 insertions, 7 deletions
@@ -23,6 +23,7 @@ version = "0.1.0" | |||
23 | dependencies = [ | 23 | dependencies = [ |
24 | "itertools", | 24 | "itertools", |
25 | "rayon", | 25 | "rayon", |
26 | "simhash", | ||
26 | "thiserror", | 27 | "thiserror", |
27 | "tree-sitter", | 28 | "tree-sitter", |
28 | "tree-sitter-javascript", | 29 | "tree-sitter-javascript", |
@@ -212,6 +213,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | |||
212 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" | 213 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" |
213 | 214 | ||
214 | [[package]] | 215 | [[package]] |
216 | name = "simhash" | ||
217 | version = "0.2.0" | ||
218 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
219 | checksum = "cd8e19fb8912cfcdc26507d2e38f53c9f7462b58bf7356f6f91acfb632d3a224" | ||
220 | dependencies = [ | ||
221 | "siphasher", | ||
222 | ] | ||
223 | |||
224 | [[package]] | ||
225 | name = "siphasher" | ||
226 | version = "0.2.3" | ||
227 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
228 | checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" | ||
229 | |||
230 | [[package]] | ||
215 | name = "syn" | 231 | name = "syn" |
216 | version = "1.0.98" | 232 | version = "1.0.98" |
217 | source = "registry+https://github.com/rust-lang/crates.io-index" | 233 | source = "registry+https://github.com/rust-lang/crates.io-index" |
@@ -17,3 +17,4 @@ tree-sitter = "0.20" | |||
17 | itertools = "0.10" | 17 | itertools = "0.10" |
18 | rayon = "1.5" | 18 | rayon = "1.5" |
19 | thiserror = "1.0" | 19 | thiserror = "1.0" |
20 | simhash = "0.2" | ||
@@ -18,15 +18,14 @@ Internals: | |||
18 | 18 | ||
19 | The tool uses tree-sitter to produce ASTs for the given files. It then lazily | 19 | The tool uses tree-sitter to produce ASTs for the given files. It then lazily |
20 | traverses the trees of the two files to be compared and exits on encountering | 20 | traverses the trees of the two files to be compared and exits on encountering |
21 | the first structural difference in the ASTs. | 21 | the first structural difference in the ASTs. Additionally, it performs a |
22 | textual similarity check to eliminate outliers such as files that consist | ||
23 | entirely of trivia nodes. | ||
22 | 24 | ||
23 | 25 | ||
24 | Known issues: | 26 | Known issues: |
25 | ------------ | 27 | ------------ |
26 | 28 | ||
27 | - A fully commented-out file is equivalent to every other fully commented-out | ||
28 | file and to empty files | ||
29 | |||
30 | - Does not account for equivalence of unordered children: | 29 | - Does not account for equivalence of unordered children: |
31 | 30 | ||
32 | ==== file1.rs ==== ==== file2.rs ==== | 31 | ==== file1.rs ==== ==== file2.rs ==== |
@@ -17,6 +17,8 @@ use tree_sitter::{Node, Parser, Tree}; | |||
17 | 17 | ||
18 | type Result<T> = std::result::Result<T, SimError>; | 18 | type Result<T> = std::result::Result<T, SimError>; |
19 | 19 | ||
20 | const THRESHOLD: f64 = 0.6; | ||
21 | |||
20 | // Check if two tree-sitter objects (trees or nodes etc.) are "similar" | 22 | // Check if two tree-sitter objects (trees or nodes etc.) are "similar" |
21 | pub trait Similarity { | 23 | pub trait Similarity { |
22 | // This function accepts a predicate to filter out nodes, | 24 | // This function accepts a predicate to filter out nodes, |
@@ -32,6 +34,7 @@ pub struct ProgramFile { | |||
32 | pub path: PathBuf, | 34 | pub path: PathBuf, |
33 | pub src: String, | 35 | pub src: String, |
34 | pub tree: Tree, | 36 | pub tree: Tree, |
37 | pub simhash: u64, | ||
35 | } | 38 | } |
36 | 39 | ||
37 | impl Similarity for ProgramFile { | 40 | impl Similarity for ProgramFile { |
@@ -49,12 +52,15 @@ impl Similarity for ProgramFile { | |||
49 | // - one tree's traversal is longer than the other | 52 | // - one tree's traversal is longer than the other |
50 | // - the trees have identical traversal lengths but non-identical | 53 | // - the trees have identical traversal lengths but non-identical |
51 | // nodes along the traversal | 54 | // nodes along the traversal |
52 | mine.zip_longest(theirs) | 55 | let structurally_similar = mine |
56 | .zip_longest(theirs) | ||
53 | .any(|result| match result { | 57 | .any(|result| match result { |
54 | EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), | 58 | EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), |
55 | EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, | 59 | EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, |
56 | }) | 60 | }) |
57 | .not() | 61 | .not(); |
62 | let textually_similar = (simhash::hash_similarity(self.simhash, other.simhash)) > THRESHOLD; | ||
63 | structurally_similar && textually_similar | ||
58 | } | 64 | } |
59 | } | 65 | } |
60 | 66 | ||
@@ -72,7 +78,13 @@ impl ProgramFile { | |||
72 | let src = std::fs::read_to_string(&path)?; | 78 | let src = std::fs::read_to_string(&path)?; |
73 | let path = path.as_ref().to_owned(); | 79 | let path = path.as_ref().to_owned(); |
74 | let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; | 80 | let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; |
75 | Ok(Self { path, src, tree }) | 81 | let simhash = simhash::simhash(&src); |
82 | Ok(Self { | ||
83 | path, | ||
84 | src, | ||
85 | tree, | ||
86 | simhash, | ||
87 | }) | ||
76 | } | 88 | } |
77 | } | 89 | } |
78 | 90 | ||
@@ -92,12 +104,14 @@ mod test { | |||
92 | path: PathBuf::from("mine"), | 104 | path: PathBuf::from("mine"), |
93 | src: mine.to_owned(), | 105 | src: mine.to_owned(), |
94 | tree: parser.parse(&mine, None).unwrap(), | 106 | tree: parser.parse(&mine, None).unwrap(), |
107 | simhash: simhash::simhash(&mine), | ||
95 | }; | 108 | }; |
96 | 109 | ||
97 | let their_program_file = ProgramFile { | 110 | let their_program_file = ProgramFile { |
98 | path: PathBuf::from("their"), | 111 | path: PathBuf::from("their"), |
99 | src: their.to_owned(), | 112 | src: their.to_owned(), |
100 | tree: parser.parse(&their, None).unwrap(), | 113 | tree: parser.parse(&their, None).unwrap(), |
114 | simhash: simhash::simhash(&their), | ||
101 | }; | 115 | }; |
102 | 116 | ||
103 | my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) | 117 | my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) |