diff options
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- | src/lib.rs | 220 |
1 files changed, 220 insertions, 0 deletions
diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b708984 --- /dev/null +++ b/src/lib.rs | |||
@@ -0,0 +1,220 @@ | |||
1 | // locals | ||
2 | mod error; | ||
3 | mod traverse; | ||
4 | |||
5 | // std | ||
6 | use std::{ | ||
7 | fmt, | ||
8 | ops::Not, | ||
9 | path::{Path, PathBuf}, | ||
10 | }; | ||
11 | |||
12 | // extern | ||
13 | use error::SimError; | ||
14 | use itertools::{EitherOrBoth, Itertools}; | ||
15 | use traverse::PreOrder; | ||
16 | use tree_sitter::{Node, Parser, Tree}; | ||
17 | |||
18 | type Result<T> = std::result::Result<T, SimError>; | ||
19 | |||
20 | // Check if two tree-sitter objects (trees or nodes etc.) are "similar" | ||
21 | pub trait Similarity { | ||
22 | // This function accepts a predicate to filter out nodes, | ||
23 | // for e.g.: trivia such as comments and whitespace may be filtered | ||
24 | // out with `!node.is_extra()` | ||
25 | fn is_similar<P>(&self, other: &Self, predicate: P) -> bool | ||
26 | where | ||
27 | P: Fn(&Node) -> bool; | ||
28 | } | ||
29 | |||
30 | // Representation of a fully parsed file from the filesystem | ||
31 | pub struct ProgramFile { | ||
32 | pub path: PathBuf, | ||
33 | pub src: String, | ||
34 | pub tree: Tree, | ||
35 | } | ||
36 | |||
37 | impl Similarity for ProgramFile { | ||
38 | fn is_similar<P>(&self, other: &Self, predicate: P) -> bool | ||
39 | where | ||
40 | P: Fn(&Node) -> bool, | ||
41 | { | ||
42 | // The order of traversal is unimportant | ||
43 | let mine = PreOrder::new(&self.tree).filter(|n| predicate(n)); | ||
44 | let theirs = PreOrder::new(&other.tree).filter(|n| predicate(n)); | ||
45 | |||
46 | // Simultaneously traverse both the trees, and find dissimilarities. | ||
47 | // | ||
48 | // A dissimilarity is when: | ||
49 | // - one tree's traversal is longer than the other | ||
50 | // - the trees have identical traversal lengths but non-identical | ||
51 | // nodes along the traversal | ||
52 | mine.zip_longest(theirs) | ||
53 | .any(|result| match result { | ||
54 | EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), | ||
55 | EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, | ||
56 | }) | ||
57 | .not() | ||
58 | } | ||
59 | } | ||
60 | |||
61 | impl fmt::Debug for ProgramFile { | ||
62 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
63 | f.debug_struct("ProgramFile") | ||
64 | .field("path", &self.path) | ||
65 | .field("src", &self.src.len()) | ||
66 | .finish() | ||
67 | } | ||
68 | } | ||
69 | |||
70 | impl ProgramFile { | ||
71 | pub fn new<P: AsRef<Path>>(path: P, parser: &mut Parser) -> Result<Self> { | ||
72 | let src = std::fs::read_to_string(&path)?; | ||
73 | let path = path.as_ref().to_owned(); | ||
74 | let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; | ||
75 | Ok(Self { path, src, tree }) | ||
76 | } | ||
77 | } | ||
78 | |||
79 | #[cfg(test)] | ||
80 | mod test { | ||
81 | use super::*; | ||
82 | use tree_sitter::Parser; | ||
83 | |||
84 | // Utility to quickly bootstrap ProgramFiles | ||
85 | // and run similarity checks | ||
86 | fn is_similar(mine: &str, their: &str) -> bool { | ||
87 | let lang = tree_sitter_javascript::language(); | ||
88 | let mut parser = Parser::new(); | ||
89 | parser.set_language(lang).unwrap(); | ||
90 | |||
91 | let my_program_file = ProgramFile { | ||
92 | path: PathBuf::from("mine"), | ||
93 | src: mine.to_owned(), | ||
94 | tree: parser.parse(&mine, None).unwrap(), | ||
95 | }; | ||
96 | |||
97 | let their_program_file = ProgramFile { | ||
98 | path: PathBuf::from("their"), | ||
99 | src: their.to_owned(), | ||
100 | tree: parser.parse(&their, None).unwrap(), | ||
101 | }; | ||
102 | |||
103 | my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) | ||
104 | } | ||
105 | |||
106 | #[test] | ||
107 | fn trivial() { | ||
108 | assert!(is_similar( | ||
109 | r#" | ||
110 | var x = 2; | ||
111 | "#, | ||
112 | r#" | ||
113 | var y = 5; | ||
114 | "# | ||
115 | )) | ||
116 | } | ||
117 | |||
118 | #[test] | ||
119 | fn automorphism() { | ||
120 | assert!(is_similar( | ||
121 | r#" | ||
122 | function(x) { | ||
123 | x * 2 | ||
124 | } | ||
125 | "#, | ||
126 | r#" | ||
127 | function(x) { | ||
128 | x * 2 | ||
129 | } | ||
130 | "# | ||
131 | )) | ||
132 | } | ||
133 | |||
134 | #[test] | ||
135 | fn sample() { | ||
136 | assert!(is_similar( | ||
137 | r#" | ||
138 | var scrapeWebsite = async function scrapeWebsite(website) { | ||
139 | try { | ||
140 | var url = normalizeUrl(website); | ||
141 | var html = await fetch(url).then(function (res) { | ||
142 | return res.text(); | ||
143 | }); | ||
144 | var $ = load(html); | ||
145 | var handles = {}; | ||
146 | module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { | ||
147 | handles[socialNetwork] = parse(socialNetwork)($); | ||
148 | }) | ||
149 | return handles; | ||
150 | } catch (error) { | ||
151 | throw new Error("Error scraping website data for " + website + ": " + error.message) | ||
152 | } | ||
153 | } | ||
154 | "#, | ||
155 | r#" | ||
156 | var parseWebsite = async function parseWebsite(website) { | ||
157 | try { | ||
158 | var url = normalizeUrl(website); | ||
159 | var html = await fetch(url).then(function (res) { | ||
160 | return res.text(); | ||
161 | }); | ||
162 | var $ = load(html); | ||
163 | var handles = {}; | ||
164 | module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { | ||
165 | handles[socialNetwork] = parse(socialNetwork)($); | ||
166 | }) | ||
167 | return handles; | ||
168 | } catch (error) { | ||
169 | throw new Error("Error fetching website data for " + website + ": " + error.message) | ||
170 | } | ||
171 | } | ||
172 | "# | ||
173 | )) | ||
174 | } | ||
175 | |||
176 | #[test] | ||
177 | fn ignore_comment_trivia() { | ||
178 | assert!(is_similar( | ||
179 | r#" | ||
180 | // doubles x | ||
181 | function(x) { | ||
182 | x * 2 | ||
183 | } | ||
184 | "#, | ||
185 | r#" | ||
186 | function(x) { | ||
187 | x * 2 // doubles x | ||
188 | } | ||
189 | "# | ||
190 | )) | ||
191 | } | ||
192 | |||
193 | #[test] | ||
194 | fn ignore_whitespace_trivia() { | ||
195 | assert!(is_similar( | ||
196 | r#" | ||
197 | function(x) { | ||
198 | return x * 2; | ||
199 | } | ||
200 | "#, | ||
201 | r#" | ||
202 | function(x ) { | ||
203 | return x * 2; | ||
204 | } | ||
205 | "# | ||
206 | )) | ||
207 | } | ||
208 | |||
209 | #[test] | ||
210 | fn dissimilar() { | ||
211 | assert!(!is_similar( | ||
212 | r#" | ||
213 | var t = function(x, y) { } | ||
214 | "#, | ||
215 | r#" | ||
216 | var l = function(x) { } | ||
217 | "# | ||
218 | )) | ||
219 | } | ||
220 | } | ||