From cfc70207996e202edbb577b2ad97a61ba9eb0eaa Mon Sep 17 00:00:00 2001 From: Akshay Date: Tue, 2 Aug 2022 19:50:46 +0530 Subject: add textual comparison. Structural comparison helps detect the vast majority of duplicates, but it has a few false positives when files contain only trivia. Textual similarity can help detect and eliminate those false positives. --- src/lib.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'src/lib.rs') diff --git a/src/lib.rs b/src/lib.rs index b708984..df67a9a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,8 @@ use tree_sitter::{Node, Parser, Tree}; type Result = std::result::Result; +const THRESHOLD: f64 = 0.6; + // Check if two tree-sitter objects (trees or nodes etc.) are "similar" pub trait Similarity { // This function accepts a predicate to filter out nodes, @@ -32,6 +34,7 @@ pub struct ProgramFile { pub path: PathBuf, pub src: String, pub tree: Tree, + pub simhash: u64, } impl Similarity for ProgramFile { @@ -49,12 +52,15 @@ impl Similarity for ProgramFile { // - one tree's traversal is longer than the other // - the trees have identical traversal lengths but non-identical // nodes along the traversal - mine.zip_longest(theirs) + let structurally_similar = mine + .zip_longest(theirs) .any(|result| match result { EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, }) - .not() + .not(); + let textually_similar = (simhash::hash_similarity(self.simhash, other.simhash)) > THRESHOLD; + structurally_similar && textually_similar } } @@ -72,7 +78,13 @@ impl ProgramFile { let src = std::fs::read_to_string(&path)?; let path = path.as_ref().to_owned(); let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; - Ok(Self { path, src, tree }) + let simhash = simhash::simhash(&src); + Ok(Self { + path, + src, + tree, + simhash, + }) } } @@ -92,12 +104,14 @@ mod test { path: PathBuf::from("mine"), src: 
mine.to_owned(), tree: parser.parse(&mine, None).unwrap(), + simhash: simhash::simhash(&mine), }; let their_program_file = ProgramFile { path: PathBuf::from("their"), src: their.to_owned(), tree: parser.parse(&their, None).unwrap(), + simhash: simhash::simhash(&their), }; my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) -- cgit v1.2.3