aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAkshay <[email protected]>2022-08-02 15:20:46 +0100
committerAkshay <[email protected]>2022-08-02 15:27:12 +0100
commitcfc70207996e202edbb577b2ad97a61ba9eb0eaa (patch)
tree97a3f25c3016766d6456efb748d48cbc6c525a47
parentefd96e8df6805a45aaf5822141dee11c642b51ae (diff)
add textual comparisonHEADmaster
structural comparison helps detect a vast majority of duplicates, but it has a few false positives when files contain only trivia. textual similarity can help detect and eliminate those false positives.
-rw-r--r--Cargo.lock16
-rw-r--r--Cargo.toml1
-rw-r--r--readme7
-rw-r--r--src/lib.rs20
4 files changed, 37 insertions, 7 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 6c0cbfd..ecfe571 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,6 +23,7 @@ version = "0.1.0"
23dependencies = [ 23dependencies = [
24 "itertools", 24 "itertools",
25 "rayon", 25 "rayon",
26 "simhash",
26 "thiserror", 27 "thiserror",
27 "tree-sitter", 28 "tree-sitter",
28 "tree-sitter-javascript", 29 "tree-sitter-javascript",
@@ -212,6 +213,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
212checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 213checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
213 214
214[[package]] 215[[package]]
216name = "simhash"
217version = "0.2.0"
218source = "registry+https://github.com/rust-lang/crates.io-index"
219checksum = "cd8e19fb8912cfcdc26507d2e38f53c9f7462b58bf7356f6f91acfb632d3a224"
220dependencies = [
221 "siphasher",
222]
223
224[[package]]
225name = "siphasher"
226version = "0.2.3"
227source = "registry+https://github.com/rust-lang/crates.io-index"
228checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac"
229
230[[package]]
215name = "syn" 231name = "syn"
216version = "1.0.98" 232version = "1.0.98"
217source = "registry+https://github.com/rust-lang/crates.io-index" 233source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 579c1c4..2803272 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,3 +17,4 @@ tree-sitter = "0.20"
17itertools = "0.10" 17itertools = "0.10"
18rayon = "1.5" 18rayon = "1.5"
19thiserror = "1.0" 19thiserror = "1.0"
20simhash = "0.2"
diff --git a/readme b/readme
index 869e43e..d92989f 100644
--- a/readme
+++ b/readme
@@ -18,15 +18,14 @@ Internals:
18 18
19The tool uses tree-sitter to produce ASTs for the given files. It then lazily 19The tool uses tree-sitter to produce ASTs for the given files. It then lazily
20traverses the trees of the two files to be compared and exits on encountering 20traverses the trees of the two files to be compared and exits on encountering
21the first structural difference in the ASTs. 21the first structural difference in the ASTs. Additionally, it performs a
22textual similarity check to eliminate outliers such as files that consist
23entirely of trivia nodes.
22 24
23 25
24Known issues: 26Known issues:
25------------ 27------------
26 28
27- A fully commented-out file is equivalent to every other fully commented-out
28 file and to empty files
29
30- Does not account for equivalence of unordered children: 29- Does not account for equivalence of unordered children:
31 30
32 ==== file1.rs ==== ==== file2.rs ==== 31 ==== file1.rs ==== ==== file2.rs ====
diff --git a/src/lib.rs b/src/lib.rs
index b708984..df67a9a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,8 @@ use tree_sitter::{Node, Parser, Tree};
17 17
18type Result<T> = std::result::Result<T, SimError>; 18type Result<T> = std::result::Result<T, SimError>;
19 19
20const THRESHOLD: f64 = 0.6;
21
20// Check if two tree-sitter objects (trees or nodes etc.) are "similar" 22// Check if two tree-sitter objects (trees or nodes etc.) are "similar"
21pub trait Similarity { 23pub trait Similarity {
22 // This function accepts a predicate to filter out nodes, 24 // This function accepts a predicate to filter out nodes,
@@ -32,6 +34,7 @@ pub struct ProgramFile {
32 pub path: PathBuf, 34 pub path: PathBuf,
33 pub src: String, 35 pub src: String,
34 pub tree: Tree, 36 pub tree: Tree,
37 pub simhash: u64,
35} 38}
36 39
37impl Similarity for ProgramFile { 40impl Similarity for ProgramFile {
@@ -49,12 +52,15 @@ impl Similarity for ProgramFile {
49 // - one tree's traversal is longer than the other 52 // - one tree's traversal is longer than the other
50 // - the trees have identical traversal lengths but non-identical 53 // - the trees have identical traversal lengths but non-identical
51 // nodes along the traversal 54 // nodes along the traversal
52 mine.zip_longest(theirs) 55 let structurally_similar = mine
56 .zip_longest(theirs)
53 .any(|result| match result { 57 .any(|result| match result {
54 EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), 58 EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(),
55 EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, 59 EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true,
56 }) 60 })
57 .not() 61 .not();
62 let textually_similar = (simhash::hash_similarity(self.simhash, other.simhash)) > THRESHOLD;
63 structurally_similar && textually_similar
58 } 64 }
59} 65}
60 66
@@ -72,7 +78,13 @@ impl ProgramFile {
72 let src = std::fs::read_to_string(&path)?; 78 let src = std::fs::read_to_string(&path)?;
73 let path = path.as_ref().to_owned(); 79 let path = path.as_ref().to_owned();
74 let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; 80 let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?;
75 Ok(Self { path, src, tree }) 81 let simhash = simhash::simhash(&src);
82 Ok(Self {
83 path,
84 src,
85 tree,
86 simhash,
87 })
76 } 88 }
77} 89}
78 90
@@ -92,12 +104,14 @@ mod test {
92 path: PathBuf::from("mine"), 104 path: PathBuf::from("mine"),
93 src: mine.to_owned(), 105 src: mine.to_owned(),
94 tree: parser.parse(&mine, None).unwrap(), 106 tree: parser.parse(&mine, None).unwrap(),
107 simhash: simhash::simhash(&mine),
95 }; 108 };
96 109
97 let their_program_file = ProgramFile { 110 let their_program_file = ProgramFile {
98 path: PathBuf::from("their"), 111 path: PathBuf::from("their"),
99 src: their.to_owned(), 112 src: their.to_owned(),
100 tree: parser.parse(&their, None).unwrap(), 113 tree: parser.parse(&their, None).unwrap(),
114 simhash: simhash::simhash(&their),
101 }; 115 };
102 116
103 my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) 117 my_program_file.is_similar(&their_program_file, |n| !n.is_extra())