diff options
author | Akshay <[email protected]> | 2022-08-02 12:27:04 +0100 |
---|---|---|
committer | Akshay <[email protected]> | 2022-08-02 12:27:04 +0100 |
commit | efd96e8df6805a45aaf5822141dee11c642b51ae (patch) | |
tree | 56cca3a780dd0753e21eb5e0dbda585ce6ff7f7b /src |
init
Diffstat (limited to 'src')
-rw-r--r-- | src/error.rs | 17 | ||||
-rw-r--r-- | src/lib.rs | 220 | ||||
-rw-r--r-- | src/main.rs | 43 | ||||
-rw-r--r-- | src/traverse.rs | 51 |
4 files changed, 331 insertions, 0 deletions
diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..64bd094 --- /dev/null +++ b/src/error.rs | |||
@@ -0,0 +1,17 @@ | |||
1 | // std | ||
2 | use std::io; | ||
3 | |||
4 | // extern | ||
5 | use thiserror::Error; | ||
6 | |||
7 | #[derive(Error, Debug)] | ||
8 | pub enum SimError { | ||
9 | #[error("unable to set language")] | ||
10 | UnableToSetLanguage, | ||
11 | |||
12 | #[error("parser does not have a corresponding tree-sitter language")] | ||
13 | LanguageNotSet, | ||
14 | |||
15 | #[error("io error: {0}")] | ||
16 | IoError(#[from] io::Error), | ||
17 | } | ||
diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b708984 --- /dev/null +++ b/src/lib.rs | |||
@@ -0,0 +1,220 @@ | |||
1 | // locals | ||
2 | mod error; | ||
3 | mod traverse; | ||
4 | |||
5 | // std | ||
6 | use std::{ | ||
7 | fmt, | ||
8 | ops::Not, | ||
9 | path::{Path, PathBuf}, | ||
10 | }; | ||
11 | |||
12 | // extern | ||
13 | use error::SimError; | ||
14 | use itertools::{EitherOrBoth, Itertools}; | ||
15 | use traverse::PreOrder; | ||
16 | use tree_sitter::{Node, Parser, Tree}; | ||
17 | |||
18 | type Result<T> = std::result::Result<T, SimError>; | ||
19 | |||
20 | // Check if two tree-sitter objects (trees or nodes etc.) are "similar" | ||
21 | pub trait Similarity { | ||
22 | // This function accepts a predicate to filter out nodes, | ||
23 | // for e.g.: trivia such as comments and whitespace may be filtered | ||
24 | // out with `!node.is_extra()` | ||
25 | fn is_similar<P>(&self, other: &Self, predicate: P) -> bool | ||
26 | where | ||
27 | P: Fn(&Node) -> bool; | ||
28 | } | ||
29 | |||
30 | // Representation of a fully parsed file from the filesystem | ||
31 | pub struct ProgramFile { | ||
32 | pub path: PathBuf, | ||
33 | pub src: String, | ||
34 | pub tree: Tree, | ||
35 | } | ||
36 | |||
37 | impl Similarity for ProgramFile { | ||
38 | fn is_similar<P>(&self, other: &Self, predicate: P) -> bool | ||
39 | where | ||
40 | P: Fn(&Node) -> bool, | ||
41 | { | ||
42 | // The order of traversal is unimportant | ||
43 | let mine = PreOrder::new(&self.tree).filter(|n| predicate(n)); | ||
44 | let theirs = PreOrder::new(&other.tree).filter(|n| predicate(n)); | ||
45 | |||
46 | // Simultaneously traverse both the trees, and find dissimilarities. | ||
47 | // | ||
48 | // A dissimilarity is when: | ||
49 | // - one tree's traversal is longer than the other | ||
50 | // - the trees have identical traversal lengths but non-identical | ||
51 | // nodes along the traversal | ||
52 | mine.zip_longest(theirs) | ||
53 | .any(|result| match result { | ||
54 | EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), | ||
55 | EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, | ||
56 | }) | ||
57 | .not() | ||
58 | } | ||
59 | } | ||
60 | |||
61 | impl fmt::Debug for ProgramFile { | ||
62 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
63 | f.debug_struct("ProgramFile") | ||
64 | .field("path", &self.path) | ||
65 | .field("src", &self.src.len()) | ||
66 | .finish() | ||
67 | } | ||
68 | } | ||
69 | |||
70 | impl ProgramFile { | ||
71 | pub fn new<P: AsRef<Path>>(path: P, parser: &mut Parser) -> Result<Self> { | ||
72 | let src = std::fs::read_to_string(&path)?; | ||
73 | let path = path.as_ref().to_owned(); | ||
74 | let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; | ||
75 | Ok(Self { path, src, tree }) | ||
76 | } | ||
77 | } | ||
78 | |||
79 | #[cfg(test)] | ||
80 | mod test { | ||
81 | use super::*; | ||
82 | use tree_sitter::Parser; | ||
83 | |||
84 | // Utility to quickly bootstrap ProgramFiles | ||
85 | // and run similarity checks | ||
86 | fn is_similar(mine: &str, their: &str) -> bool { | ||
87 | let lang = tree_sitter_javascript::language(); | ||
88 | let mut parser = Parser::new(); | ||
89 | parser.set_language(lang).unwrap(); | ||
90 | |||
91 | let my_program_file = ProgramFile { | ||
92 | path: PathBuf::from("mine"), | ||
93 | src: mine.to_owned(), | ||
94 | tree: parser.parse(&mine, None).unwrap(), | ||
95 | }; | ||
96 | |||
97 | let their_program_file = ProgramFile { | ||
98 | path: PathBuf::from("their"), | ||
99 | src: their.to_owned(), | ||
100 | tree: parser.parse(&their, None).unwrap(), | ||
101 | }; | ||
102 | |||
103 | my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) | ||
104 | } | ||
105 | |||
106 | #[test] | ||
107 | fn trivial() { | ||
108 | assert!(is_similar( | ||
109 | r#" | ||
110 | var x = 2; | ||
111 | "#, | ||
112 | r#" | ||
113 | var y = 5; | ||
114 | "# | ||
115 | )) | ||
116 | } | ||
117 | |||
118 | #[test] | ||
119 | fn automorphism() { | ||
120 | assert!(is_similar( | ||
121 | r#" | ||
122 | function(x) { | ||
123 | x * 2 | ||
124 | } | ||
125 | "#, | ||
126 | r#" | ||
127 | function(x) { | ||
128 | x * 2 | ||
129 | } | ||
130 | "# | ||
131 | )) | ||
132 | } | ||
133 | |||
134 | #[test] | ||
135 | fn sample() { | ||
136 | assert!(is_similar( | ||
137 | r#" | ||
138 | var scrapeWebsite = async function scrapeWebsite(website) { | ||
139 | try { | ||
140 | var url = normalizeUrl(website); | ||
141 | var html = await fetch(url).then(function (res) { | ||
142 | return res.text(); | ||
143 | }); | ||
144 | var $ = load(html); | ||
145 | var handles = {}; | ||
146 | module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { | ||
147 | handles[socialNetwork] = parse(socialNetwork)($); | ||
148 | }) | ||
149 | return handles; | ||
150 | } catch (error) { | ||
151 | throw new Error("Error scraping website data for " + website + ": " + error.message) | ||
152 | } | ||
153 | } | ||
154 | "#, | ||
155 | r#" | ||
156 | var parseWebsite = async function parseWebsite(website) { | ||
157 | try { | ||
158 | var url = normalizeUrl(website); | ||
159 | var html = await fetch(url).then(function (res) { | ||
160 | return res.text(); | ||
161 | }); | ||
162 | var $ = load(html); | ||
163 | var handles = {}; | ||
164 | module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { | ||
165 | handles[socialNetwork] = parse(socialNetwork)($); | ||
166 | }) | ||
167 | return handles; | ||
168 | } catch (error) { | ||
169 | throw new Error("Error fetching website data for " + website + ": " + error.message) | ||
170 | } | ||
171 | } | ||
172 | "# | ||
173 | )) | ||
174 | } | ||
175 | |||
176 | #[test] | ||
177 | fn ignore_comment_trivia() { | ||
178 | assert!(is_similar( | ||
179 | r#" | ||
180 | // doubles x | ||
181 | function(x) { | ||
182 | x * 2 | ||
183 | } | ||
184 | "#, | ||
185 | r#" | ||
186 | function(x) { | ||
187 | x * 2 // doubles x | ||
188 | } | ||
189 | "# | ||
190 | )) | ||
191 | } | ||
192 | |||
193 | #[test] | ||
194 | fn ignore_whitespace_trivia() { | ||
195 | assert!(is_similar( | ||
196 | r#" | ||
197 | function(x) { | ||
198 | return x * 2; | ||
199 | } | ||
200 | "#, | ||
201 | r#" | ||
202 | function(x ) { | ||
203 | return x * 2; | ||
204 | } | ||
205 | "# | ||
206 | )) | ||
207 | } | ||
208 | |||
209 | #[test] | ||
210 | fn dissimilar() { | ||
211 | assert!(!is_similar( | ||
212 | r#" | ||
213 | var t = function(x, y) { } | ||
214 | "#, | ||
215 | r#" | ||
216 | var l = function(x) { } | ||
217 | "# | ||
218 | )) | ||
219 | } | ||
220 | } | ||
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..77c7584 --- /dev/null +++ b/src/main.rs | |||
@@ -0,0 +1,43 @@ | |||
1 | // extern | ||
2 | use itertools::Itertools; | ||
3 | use rayon::prelude::*; | ||
4 | use similarity::{ProgramFile, Similarity}; | ||
5 | use tree_sitter::Parser; | ||
6 | |||
7 | fn parser() -> Parser { | ||
8 | let mut parser = Parser::new(); | ||
9 | parser | ||
10 | .set_language(tree_sitter_javascript::language()) | ||
11 | .unwrap(); | ||
12 | parser | ||
13 | } | ||
14 | |||
15 | fn main() { | ||
16 | let args = std::env::args().skip(1).collect::<Vec<_>>(); | ||
17 | |||
18 | // read and parse files from disk | ||
19 | let (files, errors): (Vec<_>, Vec<_>) = args | ||
20 | .par_iter() | ||
21 | .map(|arg| ProgramFile::new(arg, &mut parser())) | ||
22 | .partition(Result::is_ok); | ||
23 | |||
24 | // dump errors if any | ||
25 | errors.into_iter().map(Result::unwrap_err).for_each(|err| { | ||
26 | eprintln!("{}", err); | ||
27 | }); | ||
28 | |||
29 | let files: Vec<_> = files.into_iter().map(Result::unwrap).collect(); | ||
30 | |||
31 | // compare every pair of files for similarity | ||
32 | (0..files.len()) | ||
33 | .combinations(2) | ||
34 | .map(|item| (item[0], item[1])) | ||
35 | .par_bridge() | ||
36 | .for_each(|(one, two)| { | ||
37 | let one = &files[one]; | ||
38 | let two = &files[two]; | ||
39 | if one.is_similar(&two, |n| !n.is_extra()) && one.path != two.path { | ||
40 | println!("{} {}", one.path.display(), two.path.display()); | ||
41 | } | ||
42 | }); | ||
43 | } | ||
diff --git a/src/traverse.rs b/src/traverse.rs new file mode 100644 index 0000000..df13f5d --- /dev/null +++ b/src/traverse.rs | |||
@@ -0,0 +1,51 @@ | |||
1 | // Lazy pre-order traversal over arbitrary tree-sitter trees | ||
2 | |||
3 | // extern | ||
4 | use tree_sitter::{Node, Tree, TreeCursor}; | ||
5 | |||
6 | pub struct PreOrder<'a> { | ||
7 | cursor: TreeCursor<'a>, | ||
8 | done: bool, | ||
9 | } | ||
10 | |||
11 | impl<'a> PreOrder<'a> { | ||
12 | pub fn new(tree: &'a Tree) -> Self { | ||
13 | Self { | ||
14 | cursor: tree.walk(), | ||
15 | done: false, | ||
16 | } | ||
17 | } | ||
18 | } | ||
19 | |||
20 | // Traverse the tree in root-left-right fashion | ||
21 | impl<'a> Iterator for PreOrder<'a> { | ||
22 | type Item = Node<'a>; | ||
23 | |||
24 | fn next(&mut self) -> Option<Self::Item> { | ||
25 | if self.done { | ||
26 | return None; | ||
27 | } | ||
28 | |||
29 | // capture the root node | ||
30 | let node = self.cursor.node(); | ||
31 | |||
32 | // if this node has a child, move there and return the root, or | ||
33 | // if this node has a sibling, move there and return the root | ||
34 | if self.cursor.goto_first_child() || self.cursor.goto_next_sibling() { | ||
35 | return Some(node); | ||
36 | } | ||
37 | |||
38 | loop { | ||
39 | // we are done retracing all the way back to the root node | ||
40 | if !self.cursor.goto_parent() { | ||
41 | self.done = true; | ||
42 | break; | ||
43 | } | ||
44 | |||
45 | if self.cursor.goto_next_sibling() { | ||
46 | break; | ||
47 | } | ||
48 | } | ||
49 | Some(node) | ||
50 | } | ||
51 | } | ||