aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAkshay <[email protected]>2022-08-02 15:20:46 +0100
committerAkshay <[email protected]>2022-08-02 15:27:12 +0100
commitcfc70207996e202edbb577b2ad97a61ba9eb0eaa (patch)
tree97a3f25c3016766d6456efb748d48cbc6c525a47 /src
parentefd96e8df6805a45aaf5822141dee11c642b51ae (diff)
add textual comparisonHEADmaster
structural comparison helps detect a vast majority of duplicates, but it has a few false positives when files contain only trivia. textual similarity can help detect and eliminate those false positives.
Diffstat (limited to 'src')
-rw-r--r--src/lib.rs20
1 files changed, 17 insertions, 3 deletions
diff --git a/src/lib.rs b/src/lib.rs
index b708984..df67a9a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,8 @@ use tree_sitter::{Node, Parser, Tree};
17 17
18type Result<T> = std::result::Result<T, SimError>; 18type Result<T> = std::result::Result<T, SimError>;
19 19
20const THRESHOLD: f64 = 0.6;
21
20// Check if two tree-sitter objects (trees or nodes etc.) are "similar" 22// Check if two tree-sitter objects (trees or nodes etc.) are "similar"
21pub trait Similarity { 23pub trait Similarity {
22 // This function accepts a predicate to filter out nodes, 24 // This function accepts a predicate to filter out nodes,
@@ -32,6 +34,7 @@ pub struct ProgramFile {
32 pub path: PathBuf, 34 pub path: PathBuf,
33 pub src: String, 35 pub src: String,
34 pub tree: Tree, 36 pub tree: Tree,
37 pub simhash: u64,
35} 38}
36 39
37impl Similarity for ProgramFile { 40impl Similarity for ProgramFile {
@@ -49,12 +52,15 @@ impl Similarity for ProgramFile {
49 // - one tree's traversal is longer than the other 52 // - one tree's traversal is longer than the other
50 // - the trees have identical traversal lengths but non-identical 53 // - the trees have identical traversal lengths but non-identical
51 // nodes along the traversal 54 // nodes along the traversal
52 mine.zip_longest(theirs) 55 let structurally_similar = mine
56 .zip_longest(theirs)
53 .any(|result| match result { 57 .any(|result| match result {
54 EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), 58 EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(),
55 EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, 59 EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true,
56 }) 60 })
57 .not() 61 .not();
62 let textually_similar = (simhash::hash_similarity(self.simhash, other.simhash)) > THRESHOLD;
63 structurally_similar && textually_similar
58 } 64 }
59} 65}
60 66
@@ -72,7 +78,13 @@ impl ProgramFile {
72 let src = std::fs::read_to_string(&path)?; 78 let src = std::fs::read_to_string(&path)?;
73 let path = path.as_ref().to_owned(); 79 let path = path.as_ref().to_owned();
74 let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; 80 let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?;
75 Ok(Self { path, src, tree }) 81 let simhash = simhash::simhash(&src);
82 Ok(Self {
83 path,
84 src,
85 tree,
86 simhash,
87 })
76 } 88 }
77} 89}
78 90
@@ -92,12 +104,14 @@ mod test {
92 path: PathBuf::from("mine"), 104 path: PathBuf::from("mine"),
93 src: mine.to_owned(), 105 src: mine.to_owned(),
94 tree: parser.parse(&mine, None).unwrap(), 106 tree: parser.parse(&mine, None).unwrap(),
107 simhash: simhash::simhash(&mine),
95 }; 108 };
96 109
97 let their_program_file = ProgramFile { 110 let their_program_file = ProgramFile {
98 path: PathBuf::from("their"), 111 path: PathBuf::from("their"),
99 src: their.to_owned(), 112 src: their.to_owned(),
100 tree: parser.parse(&their, None).unwrap(), 113 tree: parser.parse(&their, None).unwrap(),
114 simhash: simhash::simhash(&their),
101 }; 115 };
102 116
103 my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) 117 my_program_file.is_similar(&their_program_file, |n| !n.is_extra())