From cfc70207996e202edbb577b2ad97a61ba9eb0eaa Mon Sep 17 00:00:00 2001 From: Akshay Date: Tue, 2 Aug 2022 19:50:46 +0530 Subject: add textual comparison structural comparison helps detect a vast majority of duplicates, but it has a few false positives when files contain only trivia. textual similarity can help detect and eliminate those false positives. --- Cargo.lock | 16 ++++++++++++++++ Cargo.toml | 1 + readme | 7 +++---- src/lib.rs | 20 +++++++++++++++++--- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6c0cbfd..ecfe571 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,6 +23,7 @@ version = "0.1.0" dependencies = [ "itertools", "rayon", + "simhash", "thiserror", "tree-sitter", "tree-sitter-javascript", @@ -211,6 +212,21 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "simhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8e19fb8912cfcdc26507d2e38f53c9f7462b58bf7356f6f91acfb632d3a224" +dependencies = [ + "siphasher", +] + +[[package]] +name = "siphasher" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" + [[package]] name = "syn" version = "1.0.98" diff --git a/Cargo.toml b/Cargo.toml index 579c1c4..2803272 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,3 +17,4 @@ tree-sitter = "0.20" itertools = "0.10" rayon = "1.5" thiserror = "1.0" +simhash = "0.2" diff --git a/readme b/readme index 869e43e..d92989f 100644 --- a/readme +++ b/readme @@ -18,15 +18,14 @@ Internals: The tool uses tree-sitter to produce ASTs for the given files. It then lazily traverses the trees of the two files to be compared and exits on encountering -the first structural difference in the ASTs. +the first structural difference in the ASTs. Additionally, it performs a +textual similarity check to eliminate outliers such as files that consist +entirely of trivia nodes. Known issues: ------------ -- A fully commented-out file is equivalent to every other fully commented-out - file and to empty files - - Does not account for equivalence of unordered children: ==== file1.rs ==== ==== file2.rs ==== diff --git a/src/lib.rs b/src/lib.rs index b708984..df67a9a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,8 @@ use tree_sitter::{Node, Parser, Tree}; type Result = std::result::Result; +const THRESHOLD: f64 = 0.6; + // Check if two tree-sitter objects (trees or nodes etc.) are "similar" pub trait Similarity { // This function accepts a predicate to filter out nodes, @@ -32,6 +34,7 @@ pub struct ProgramFile { pub path: PathBuf, pub src: String, pub tree: Tree, + pub simhash: u64, } impl Similarity for ProgramFile { @@ -49,12 +52,15 @@ impl Similarity for ProgramFile { // - one tree's traversal is longer than the other // - the trees have identical traversal lengths but non-identical // nodes along the traversal - mine.zip_longest(theirs) + let structurally_similar = mine + .zip_longest(theirs) .any(|result| match result { EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, }) - .not() + .not(); + let textually_similar = (simhash::hash_similarity(self.simhash, other.simhash)) > THRESHOLD; + structurally_similar && textually_similar } } @@ -72,7 +78,13 @@ impl ProgramFile { let src = std::fs::read_to_string(&path)?; let path = path.as_ref().to_owned(); let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; - Ok(Self { path, src, tree }) + let simhash = simhash::simhash(&src); + Ok(Self { + path, + src, + tree, + simhash, + }) } } @@ -92,12 +104,14 @@ mod test { path: PathBuf::from("mine"), src: mine.to_owned(), tree: parser.parse(&mine, None).unwrap(), + simhash: simhash::simhash(&mine), }; let their_program_file = ProgramFile { path: PathBuf::from("their"), src: their.to_owned(), tree: parser.parse(&their, None).unwrap(), + simhash: simhash::simhash(&their), }; my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) -- cgit v1.2.3