aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAkshay <[email protected]>2022-08-02 12:27:04 +0100
committerAkshay <[email protected]>2022-08-02 12:27:04 +0100
commitefd96e8df6805a45aaf5822141dee11c642b51ae (patch)
tree56cca3a780dd0753e21eb5e0dbda585ce6ff7f7b /src
init
Diffstat (limited to 'src')
-rw-r--r--src/error.rs17
-rw-r--r--src/lib.rs220
-rw-r--r--src/main.rs43
-rw-r--r--src/traverse.rs51
4 files changed, 331 insertions, 0 deletions
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..64bd094
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,17 @@
1// std
2use std::io;
3
4// extern
5use thiserror::Error;
6
7#[derive(Error, Debug)]
8pub enum SimError {
9 #[error("unable to set language")]
10 UnableToSetLanguage,
11
12 #[error("parser does not have a corresponding tree-sitter language")]
13 LanguageNotSet,
14
15 #[error("io error: {0}")]
16 IoError(#[from] io::Error),
17}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..b708984
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,220 @@
1// locals
2mod error;
3mod traverse;
4
5// std
6use std::{
7 fmt,
8 ops::Not,
9 path::{Path, PathBuf},
10};
11
12// extern
13use error::SimError;
14use itertools::{EitherOrBoth, Itertools};
15use traverse::PreOrder;
16use tree_sitter::{Node, Parser, Tree};
17
18type Result<T> = std::result::Result<T, SimError>;
19
20// Check if two tree-sitter objects (trees or nodes etc.) are "similar"
21pub trait Similarity {
22 // This function accepts a predicate to filter out nodes,
23 // for e.g.: trivia such as comments and whitespace may be filtered
24 // out with `!node.is_extra()`
25 fn is_similar<P>(&self, other: &Self, predicate: P) -> bool
26 where
27 P: Fn(&Node) -> bool;
28}
29
30// Representation of a fully parsed file from the filesystem
31pub struct ProgramFile {
32 pub path: PathBuf,
33 pub src: String,
34 pub tree: Tree,
35}
36
37impl Similarity for ProgramFile {
38 fn is_similar<P>(&self, other: &Self, predicate: P) -> bool
39 where
40 P: Fn(&Node) -> bool,
41 {
42 // The order of traversal is unimportant
43 let mine = PreOrder::new(&self.tree).filter(|n| predicate(n));
44 let theirs = PreOrder::new(&other.tree).filter(|n| predicate(n));
45
46 // Simultaneously traverse both the trees, and find dissimilarities.
47 //
48 // A dissimilarity is when:
49 // - one tree's traversal is longer than the other
50 // - the trees have identical traversal lengths but non-identical
51 // nodes along the traversal
52 mine.zip_longest(theirs)
53 .any(|result| match result {
54 EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(),
55 EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true,
56 })
57 .not()
58 }
59}
60
61impl fmt::Debug for ProgramFile {
62 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
63 f.debug_struct("ProgramFile")
64 .field("path", &self.path)
65 .field("src", &self.src.len())
66 .finish()
67 }
68}
69
70impl ProgramFile {
71 pub fn new<P: AsRef<Path>>(path: P, parser: &mut Parser) -> Result<Self> {
72 let src = std::fs::read_to_string(&path)?;
73 let path = path.as_ref().to_owned();
74 let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?;
75 Ok(Self { path, src, tree })
76 }
77}
78
79#[cfg(test)]
80mod test {
81 use super::*;
82 use tree_sitter::Parser;
83
84 // Utility to quickly bootstrap ProgramFiles
85 // and run similarity checks
86 fn is_similar(mine: &str, their: &str) -> bool {
87 let lang = tree_sitter_javascript::language();
88 let mut parser = Parser::new();
89 parser.set_language(lang).unwrap();
90
91 let my_program_file = ProgramFile {
92 path: PathBuf::from("mine"),
93 src: mine.to_owned(),
94 tree: parser.parse(&mine, None).unwrap(),
95 };
96
97 let their_program_file = ProgramFile {
98 path: PathBuf::from("their"),
99 src: their.to_owned(),
100 tree: parser.parse(&their, None).unwrap(),
101 };
102
103 my_program_file.is_similar(&their_program_file, |n| !n.is_extra())
104 }
105
106 #[test]
107 fn trivial() {
108 assert!(is_similar(
109 r#"
110 var x = 2;
111 "#,
112 r#"
113 var y = 5;
114 "#
115 ))
116 }
117
118 #[test]
119 fn automorphism() {
120 assert!(is_similar(
121 r#"
122 function(x) {
123 x * 2
124 }
125 "#,
126 r#"
127 function(x) {
128 x * 2
129 }
130 "#
131 ))
132 }
133
134 #[test]
135 fn sample() {
136 assert!(is_similar(
137 r#"
138 var scrapeWebsite = async function scrapeWebsite(website) {
139 try {
140 var url = normalizeUrl(website);
141 var html = await fetch(url).then(function (res) {
142 return res.text();
143 });
144 var $ = load(html);
145 var handles = {};
146 module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) {
147 handles[socialNetwork] = parse(socialNetwork)($);
148 })
149 return handles;
150 } catch (error) {
151 throw new Error("Error scraping website data for " + website + ": " + error.message)
152 }
153 }
154 "#,
155 r#"
156 var parseWebsite = async function parseWebsite(website) {
157 try {
158 var url = normalizeUrl(website);
159 var html = await fetch(url).then(function (res) {
160 return res.text();
161 });
162 var $ = load(html);
163 var handles = {};
164 module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) {
165 handles[socialNetwork] = parse(socialNetwork)($);
166 })
167 return handles;
168 } catch (error) {
169 throw new Error("Error fetching website data for " + website + ": " + error.message)
170 }
171 }
172 "#
173 ))
174 }
175
176 #[test]
177 fn ignore_comment_trivia() {
178 assert!(is_similar(
179 r#"
180 // doubles x
181 function(x) {
182 x * 2
183 }
184 "#,
185 r#"
186 function(x) {
187 x * 2 // doubles x
188 }
189 "#
190 ))
191 }
192
193 #[test]
194 fn ignore_whitespace_trivia() {
195 assert!(is_similar(
196 r#"
197 function(x) {
198return x * 2;
199 }
200 "#,
201 r#"
202 function(x ) {
203 return x * 2;
204 }
205 "#
206 ))
207 }
208
209 #[test]
210 fn dissimilar() {
211 assert!(!is_similar(
212 r#"
213 var t = function(x, y) { }
214 "#,
215 r#"
216 var l = function(x) { }
217 "#
218 ))
219 }
220}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..77c7584
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,43 @@
1// extern
2use itertools::Itertools;
3use rayon::prelude::*;
4use similarity::{ProgramFile, Similarity};
5use tree_sitter::Parser;
6
7fn parser() -> Parser {
8 let mut parser = Parser::new();
9 parser
10 .set_language(tree_sitter_javascript::language())
11 .unwrap();
12 parser
13}
14
15fn main() {
16 let args = std::env::args().skip(1).collect::<Vec<_>>();
17
18 // read and parse files from disk
19 let (files, errors): (Vec<_>, Vec<_>) = args
20 .par_iter()
21 .map(|arg| ProgramFile::new(arg, &mut parser()))
22 .partition(Result::is_ok);
23
24 // dump errors if any
25 errors.into_iter().map(Result::unwrap_err).for_each(|err| {
26 eprintln!("{}", err);
27 });
28
29 let files: Vec<_> = files.into_iter().map(Result::unwrap).collect();
30
31 // compare every pair of files for similarity
32 (0..files.len())
33 .combinations(2)
34 .map(|item| (item[0], item[1]))
35 .par_bridge()
36 .for_each(|(one, two)| {
37 let one = &files[one];
38 let two = &files[two];
39 if one.is_similar(&two, |n| !n.is_extra()) && one.path != two.path {
40 println!("{} {}", one.path.display(), two.path.display());
41 }
42 });
43}
diff --git a/src/traverse.rs b/src/traverse.rs
new file mode 100644
index 0000000..df13f5d
--- /dev/null
+++ b/src/traverse.rs
@@ -0,0 +1,51 @@
1// Lazy pre-order traversal over arbitrary tree-sitter trees
2
3// extern
4use tree_sitter::{Node, Tree, TreeCursor};
5
6pub struct PreOrder<'a> {
7 cursor: TreeCursor<'a>,
8 done: bool,
9}
10
11impl<'a> PreOrder<'a> {
12 pub fn new(tree: &'a Tree) -> Self {
13 Self {
14 cursor: tree.walk(),
15 done: false,
16 }
17 }
18}
19
20// Traverse the tree in root-left-right fashion
21impl<'a> Iterator for PreOrder<'a> {
22 type Item = Node<'a>;
23
24 fn next(&mut self) -> Option<Self::Item> {
25 if self.done {
26 return None;
27 }
28
29 // capture the root node
30 let node = self.cursor.node();
31
32 // if this node has a child, move there and return the root, or
33 // if this node has a sibling, move there and return the root
34 if self.cursor.goto_first_child() || self.cursor.goto_next_sibling() {
35 return Some(node);
36 }
37
38 loop {
39 // we are done retracing all the way back to the root node
40 if !self.cursor.goto_parent() {
41 self.done = true;
42 break;
43 }
44
45 if self.cursor.goto_next_sibling() {
46 break;
47 }
48 }
49 Some(node)
50 }
51}