diff options
author | Akshay <[email protected]> | 2022-08-02 15:20:46 +0100 |
---|---|---|
committer | Akshay <[email protected]> | 2022-08-02 15:27:12 +0100 |
commit | cfc70207996e202edbb577b2ad97a61ba9eb0eaa (patch) | |
tree | 97a3f25c3016766d6456efb748d48cbc6c525a47 /src | |
parent | efd96e8df6805a45aaf5822141dee11c642b51ae (diff) |
structural comparison helps detect a vast majority of duplicates, but it
has a few false positives when files contain only trivia. textual
similarity can help detect and eliminate those false positives.
Diffstat (limited to 'src')
-rw-r--r-- | src/lib.rs | 20 |
1 files changed, 17 insertions, 3 deletions
@@ -17,6 +17,8 @@ use tree_sitter::{Node, Parser, Tree}; | |||
17 | 17 | ||
18 | type Result<T> = std::result::Result<T, SimError>; | 18 | type Result<T> = std::result::Result<T, SimError>; |
19 | 19 | ||
20 | const THRESHOLD: f64 = 0.6; | ||
21 | |||
20 | // Check if two tree-sitter objects (trees or nodes etc.) are "similar" | 22 | // Check if two tree-sitter objects (trees or nodes etc.) are "similar" |
21 | pub trait Similarity { | 23 | pub trait Similarity { |
22 | // This function accepts a predicate to filter out nodes, | 24 | // This function accepts a predicate to filter out nodes, |
@@ -32,6 +34,7 @@ pub struct ProgramFile { | |||
32 | pub path: PathBuf, | 34 | pub path: PathBuf, |
33 | pub src: String, | 35 | pub src: String, |
34 | pub tree: Tree, | 36 | pub tree: Tree, |
37 | pub simhash: u64, | ||
35 | } | 38 | } |
36 | 39 | ||
37 | impl Similarity for ProgramFile { | 40 | impl Similarity for ProgramFile { |
@@ -49,12 +52,15 @@ impl Similarity for ProgramFile { | |||
49 | // - one tree's traversal is longer than the other | 52 | // - one tree's traversal is longer than the other |
50 | // - the trees have identical traversal lengths but non-identical | 53 | // - the trees have identical traversal lengths but non-identical |
51 | // nodes along the traversal | 54 | // nodes along the traversal |
52 | mine.zip_longest(theirs) | 55 | let structurally_similar = mine |
56 | .zip_longest(theirs) | ||
53 | .any(|result| match result { | 57 | .any(|result| match result { |
54 | EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), | 58 | EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), |
55 | EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, | 59 | EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, |
56 | }) | 60 | }) |
57 | .not() | 61 | .not(); |
62 | let textually_similar = (simhash::hash_similarity(self.simhash, other.simhash)) > THRESHOLD; | ||
63 | structurally_similar && textually_similar | ||
58 | } | 64 | } |
59 | } | 65 | } |
60 | 66 | ||
@@ -72,7 +78,13 @@ impl ProgramFile { | |||
72 | let src = std::fs::read_to_string(&path)?; | 78 | let src = std::fs::read_to_string(&path)?; |
73 | let path = path.as_ref().to_owned(); | 79 | let path = path.as_ref().to_owned(); |
74 | let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; | 80 | let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; |
75 | Ok(Self { path, src, tree }) | 81 | let simhash = simhash::simhash(&src); |
82 | Ok(Self { | ||
83 | path, | ||
84 | src, | ||
85 | tree, | ||
86 | simhash, | ||
87 | }) | ||
76 | } | 88 | } |
77 | } | 89 | } |
78 | 90 | ||
@@ -92,12 +104,14 @@ mod test { | |||
92 | path: PathBuf::from("mine"), | 104 | path: PathBuf::from("mine"), |
93 | src: mine.to_owned(), | 105 | src: mine.to_owned(), |
94 | tree: parser.parse(&mine, None).unwrap(), | 106 | tree: parser.parse(&mine, None).unwrap(), |
107 | simhash: simhash::simhash(&mine), | ||
95 | }; | 108 | }; |
96 | 109 | ||
97 | let their_program_file = ProgramFile { | 110 | let their_program_file = ProgramFile { |
98 | path: PathBuf::from("their"), | 111 | path: PathBuf::from("their"), |
99 | src: their.to_owned(), | 112 | src: their.to_owned(), |
100 | tree: parser.parse(&their, None).unwrap(), | 113 | tree: parser.parse(&their, None).unwrap(), |
114 | simhash: simhash::simhash(&their), | ||
101 | }; | 115 | }; |
102 | 116 | ||
103 | my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) | 117 | my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) |