diff options
-rw-r--r-- | .gitignore | 4 | ||||
-rw-r--r-- | Cargo.lock | 269 | ||||
-rw-r--r-- | Cargo.toml | 19 | ||||
-rw-r--r-- | flake.lock | 87 | ||||
-rw-r--r-- | flake.nix | 67 | ||||
-rw-r--r-- | readme | 38 | ||||
-rw-r--r-- | src/error.rs | 17 | ||||
-rw-r--r-- | src/lib.rs | 220 | ||||
-rw-r--r-- | src/main.rs | 43 | ||||
-rw-r--r-- | src/traverse.rs | 51 |
10 files changed, 815 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a2c5a6d --- /dev/null +++ b/.gitignore | |||
@@ -0,0 +1,4 @@ | |||
1 | /target | ||
2 | /js-files | ||
3 | .direnv | ||
4 | .envrc | ||
diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..6c0cbfd --- /dev/null +++ b/Cargo.lock | |||
@@ -0,0 +1,269 @@ | |||
1 | # This file is automatically @generated by Cargo. | ||
2 | # It is not intended for manual editing. | ||
3 | version = 3 | ||
4 | |||
5 | [[package]] | ||
6 | name = "aho-corasick" | ||
7 | version = "0.7.18" | ||
8 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
9 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" | ||
10 | dependencies = [ | ||
11 | "memchr", | ||
12 | ] | ||
13 | |||
14 | [[package]] | ||
15 | name = "autocfg" | ||
16 | version = "1.1.0" | ||
17 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
18 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" | ||
19 | |||
20 | [[package]] | ||
21 | name = "bloop" | ||
22 | version = "0.1.0" | ||
23 | dependencies = [ | ||
24 | "itertools", | ||
25 | "rayon", | ||
26 | "thiserror", | ||
27 | "tree-sitter", | ||
28 | "tree-sitter-javascript", | ||
29 | ] | ||
30 | |||
31 | [[package]] | ||
32 | name = "cc" | ||
33 | version = "1.0.73" | ||
34 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
35 | checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" | ||
36 | |||
37 | [[package]] | ||
38 | name = "cfg-if" | ||
39 | version = "1.0.0" | ||
40 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
41 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" | ||
42 | |||
43 | [[package]] | ||
44 | name = "crossbeam-channel" | ||
45 | version = "0.5.6" | ||
46 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
47 | checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" | ||
48 | dependencies = [ | ||
49 | "cfg-if", | ||
50 | "crossbeam-utils", | ||
51 | ] | ||
52 | |||
53 | [[package]] | ||
54 | name = "crossbeam-deque" | ||
55 | version = "0.8.2" | ||
56 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
57 | checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" | ||
58 | dependencies = [ | ||
59 | "cfg-if", | ||
60 | "crossbeam-epoch", | ||
61 | "crossbeam-utils", | ||
62 | ] | ||
63 | |||
64 | [[package]] | ||
65 | name = "crossbeam-epoch" | ||
66 | version = "0.9.10" | ||
67 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
68 | checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" | ||
69 | dependencies = [ | ||
70 | "autocfg", | ||
71 | "cfg-if", | ||
72 | "crossbeam-utils", | ||
73 | "memoffset", | ||
74 | "once_cell", | ||
75 | "scopeguard", | ||
76 | ] | ||
77 | |||
78 | [[package]] | ||
79 | name = "crossbeam-utils" | ||
80 | version = "0.8.11" | ||
81 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
82 | checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" | ||
83 | dependencies = [ | ||
84 | "cfg-if", | ||
85 | "once_cell", | ||
86 | ] | ||
87 | |||
88 | [[package]] | ||
89 | name = "either" | ||
90 | version = "1.7.0" | ||
91 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
92 | checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" | ||
93 | |||
94 | [[package]] | ||
95 | name = "hermit-abi" | ||
96 | version = "0.1.19" | ||
97 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
98 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" | ||
99 | dependencies = [ | ||
100 | "libc", | ||
101 | ] | ||
102 | |||
103 | [[package]] | ||
104 | name = "itertools" | ||
105 | version = "0.10.3" | ||
106 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
107 | checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" | ||
108 | dependencies = [ | ||
109 | "either", | ||
110 | ] | ||
111 | |||
112 | [[package]] | ||
113 | name = "libc" | ||
114 | version = "0.2.126" | ||
115 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
116 | checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" | ||
117 | |||
118 | [[package]] | ||
119 | name = "memchr" | ||
120 | version = "2.5.0" | ||
121 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
122 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" | ||
123 | |||
124 | [[package]] | ||
125 | name = "memoffset" | ||
126 | version = "0.6.5" | ||
127 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
128 | checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" | ||
129 | dependencies = [ | ||
130 | "autocfg", | ||
131 | ] | ||
132 | |||
133 | [[package]] | ||
134 | name = "num_cpus" | ||
135 | version = "1.13.1" | ||
136 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
137 | checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" | ||
138 | dependencies = [ | ||
139 | "hermit-abi", | ||
140 | "libc", | ||
141 | ] | ||
142 | |||
143 | [[package]] | ||
144 | name = "once_cell" | ||
145 | version = "1.13.0" | ||
146 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
147 | checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" | ||
148 | |||
149 | [[package]] | ||
150 | name = "proc-macro2" | ||
151 | version = "1.0.42" | ||
152 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
153 | checksum = "c278e965f1d8cf32d6e0e96de3d3e79712178ae67986d9cf9151f51e95aac89b" | ||
154 | dependencies = [ | ||
155 | "unicode-ident", | ||
156 | ] | ||
157 | |||
158 | [[package]] | ||
159 | name = "quote" | ||
160 | version = "1.0.20" | ||
161 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
162 | checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804" | ||
163 | dependencies = [ | ||
164 | "proc-macro2", | ||
165 | ] | ||
166 | |||
167 | [[package]] | ||
168 | name = "rayon" | ||
169 | version = "1.5.3" | ||
170 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
171 | checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" | ||
172 | dependencies = [ | ||
173 | "autocfg", | ||
174 | "crossbeam-deque", | ||
175 | "either", | ||
176 | "rayon-core", | ||
177 | ] | ||
178 | |||
179 | [[package]] | ||
180 | name = "rayon-core" | ||
181 | version = "1.9.3" | ||
182 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
183 | checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" | ||
184 | dependencies = [ | ||
185 | "crossbeam-channel", | ||
186 | "crossbeam-deque", | ||
187 | "crossbeam-utils", | ||
188 | "num_cpus", | ||
189 | ] | ||
190 | |||
191 | [[package]] | ||
192 | name = "regex" | ||
193 | version = "1.6.0" | ||
194 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
195 | checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" | ||
196 | dependencies = [ | ||
197 | "aho-corasick", | ||
198 | "memchr", | ||
199 | "regex-syntax", | ||
200 | ] | ||
201 | |||
202 | [[package]] | ||
203 | name = "regex-syntax" | ||
204 | version = "0.6.27" | ||
205 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
206 | checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" | ||
207 | |||
208 | [[package]] | ||
209 | name = "scopeguard" | ||
210 | version = "1.1.0" | ||
211 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
212 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" | ||
213 | |||
214 | [[package]] | ||
215 | name = "syn" | ||
216 | version = "1.0.98" | ||
217 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
218 | checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" | ||
219 | dependencies = [ | ||
220 | "proc-macro2", | ||
221 | "quote", | ||
222 | "unicode-ident", | ||
223 | ] | ||
224 | |||
225 | [[package]] | ||
226 | name = "thiserror" | ||
227 | version = "1.0.31" | ||
228 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
229 | checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" | ||
230 | dependencies = [ | ||
231 | "thiserror-impl", | ||
232 | ] | ||
233 | |||
234 | [[package]] | ||
235 | name = "thiserror-impl" | ||
236 | version = "1.0.31" | ||
237 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
238 | checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" | ||
239 | dependencies = [ | ||
240 | "proc-macro2", | ||
241 | "quote", | ||
242 | "syn", | ||
243 | ] | ||
244 | |||
245 | [[package]] | ||
246 | name = "tree-sitter" | ||
247 | version = "0.20.8" | ||
248 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
249 | checksum = "268bf3e3ca0c09e5d21b59c2638e12cb6dcf7ea2681250a696a2d0936cb57ba0" | ||
250 | dependencies = [ | ||
251 | "cc", | ||
252 | "regex", | ||
253 | ] | ||
254 | |||
255 | [[package]] | ||
256 | name = "tree-sitter-javascript" | ||
257 | version = "0.20.0" | ||
258 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
259 | checksum = "2490fab08630b2c8943c320f7b63473cbf65511c8d83aec551beb9b4375906ed" | ||
260 | dependencies = [ | ||
261 | "cc", | ||
262 | "tree-sitter", | ||
263 | ] | ||
264 | |||
265 | [[package]] | ||
266 | name = "unicode-ident" | ||
267 | version = "1.0.2" | ||
268 | source = "registry+https://github.com/rust-lang/crates.io-index" | ||
269 | checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" | ||
diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..579c1c4 --- /dev/null +++ b/Cargo.toml | |||
@@ -0,0 +1,19 @@ | |||
1 | [package] | ||
2 | name = "bloop" | ||
3 | version = "0.1.0" | ||
4 | edition = "2021" | ||
5 | |||
6 | [lib] | ||
7 | name = "similarity" | ||
8 | path = "src/lib.rs" | ||
9 | |||
10 | [[bin]] | ||
11 | name = "sim" | ||
12 | path = "src/main.rs" | ||
13 | |||
14 | [dependencies] | ||
15 | tree-sitter-javascript = "0.20" | ||
16 | tree-sitter = "0.20" | ||
17 | itertools = "0.10" | ||
18 | rayon = "1.5" | ||
19 | thiserror = "1.0" | ||
diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..a7c083f --- /dev/null +++ b/flake.lock | |||
@@ -0,0 +1,87 @@ | |||
1 | { | ||
2 | "nodes": { | ||
3 | "fenix": { | ||
4 | "inputs": { | ||
5 | "nixpkgs": [ | ||
6 | "nixpkgs" | ||
7 | ], | ||
8 | "rust-analyzer-src": "rust-analyzer-src" | ||
9 | }, | ||
10 | "locked": { | ||
11 | "lastModified": 1645251813, | ||
12 | "narHash": "sha256-cQ66tGjnZclBCS3nD26mZ5fUH+3/HnysGffBiWXUSHk=", | ||
13 | "owner": "nix-community", | ||
14 | "repo": "fenix", | ||
15 | "rev": "9892337b588c38ec59466a1c89befce464aae7f8", | ||
16 | "type": "github" | ||
17 | }, | ||
18 | "original": { | ||
19 | "owner": "nix-community", | ||
20 | "repo": "fenix", | ||
21 | "type": "github" | ||
22 | } | ||
23 | }, | ||
24 | "gitignore": { | ||
25 | "inputs": { | ||
26 | "nixpkgs": [ | ||
27 | "nixpkgs" | ||
28 | ] | ||
29 | }, | ||
30 | "locked": { | ||
31 | "lastModified": 1635165013, | ||
32 | "narHash": "sha256-o/BdVjNwcB6jOmzZjOH703BesSkkS5O7ej3xhyO8hAY=", | ||
33 | "owner": "hercules-ci", | ||
34 | "repo": "gitignore.nix", | ||
35 | "rev": "5b9e0ff9d3b551234b4f3eb3983744fa354b17f1", | ||
36 | "type": "github" | ||
37 | }, | ||
38 | "original": { | ||
39 | "owner": "hercules-ci", | ||
40 | "repo": "gitignore.nix", | ||
41 | "type": "github" | ||
42 | } | ||
43 | }, | ||
44 | "nixpkgs": { | ||
45 | "locked": { | ||
46 | "lastModified": 1645013224, | ||
47 | "narHash": "sha256-b7OEC8vwzJv3rsz9pwnTX2LQDkeOWz2DbKypkVvNHXc=", | ||
48 | "owner": "nixos", | ||
49 | "repo": "nixpkgs", | ||
50 | "rev": "b66b39216b1fef2d8c33cc7a5c72d8da80b79970", | ||
51 | "type": "github" | ||
52 | }, | ||
53 | "original": { | ||
54 | "owner": "nixos", | ||
55 | "ref": "nixpkgs-unstable", | ||
56 | "repo": "nixpkgs", | ||
57 | "type": "github" | ||
58 | } | ||
59 | }, | ||
60 | "root": { | ||
61 | "inputs": { | ||
62 | "fenix": "fenix", | ||
63 | "gitignore": "gitignore", | ||
64 | "nixpkgs": "nixpkgs" | ||
65 | } | ||
66 | }, | ||
67 | "rust-analyzer-src": { | ||
68 | "flake": false, | ||
69 | "locked": { | ||
70 | "lastModified": 1645205556, | ||
71 | "narHash": "sha256-e4lZW3qRyOEJ+vLKFQP7m2Dxh5P44NrnekZYLxlucww=", | ||
72 | "owner": "rust-analyzer", | ||
73 | "repo": "rust-analyzer", | ||
74 | "rev": "acf5874b39f3dc5262317a6074d9fc7285081161", | ||
75 | "type": "github" | ||
76 | }, | ||
77 | "original": { | ||
78 | "owner": "rust-analyzer", | ||
79 | "ref": "nightly", | ||
80 | "repo": "rust-analyzer", | ||
81 | "type": "github" | ||
82 | } | ||
83 | } | ||
84 | }, | ||
85 | "root": "root", | ||
86 | "version": 7 | ||
87 | } | ||
diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..dcdf3b7 --- /dev/null +++ b/flake.nix | |||
@@ -0,0 +1,67 @@ | |||
1 | { | ||
2 | inputs = { | ||
3 | |||
4 | nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable"; | ||
5 | |||
6 | fenix = { | ||
7 | url = "github:nix-community/fenix"; | ||
8 | inputs.nixpkgs.follows = "nixpkgs"; | ||
9 | }; | ||
10 | |||
11 | gitignore = { | ||
12 | url = "github:hercules-ci/gitignore.nix"; | ||
13 | inputs.nixpkgs.follows = "nixpkgs"; | ||
14 | }; | ||
15 | |||
16 | }; | ||
17 | |||
18 | outputs = | ||
19 | { self | ||
20 | , nixpkgs | ||
21 | , fenix | ||
22 | , gitignore | ||
23 | }: | ||
24 | let | ||
25 | inherit (gitignore.lib) gitignoreSource; | ||
26 | |||
27 | supportedSystems = [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ]; | ||
28 | forAllSystems = nixpkgs.lib.genAttrs supportedSystems; | ||
29 | nixpkgsFor = forAllSystems (system: | ||
30 | import nixpkgs { inherit system; }); | ||
31 | |||
32 | chanspec = { | ||
33 | date = "2022-02-06"; | ||
34 | channel = "nightly"; | ||
35 | sha256 = "oKkTWopCDx4tphzTtRn+zDDtvmIZrL/H44tV2ruSfDw="; # set zeros after modifying channel or date | ||
36 | }; | ||
37 | rustChannel = p: (fenix.overlay p p).fenix.toolchainOf chanspec; | ||
38 | |||
39 | in | ||
40 | { | ||
41 | |||
42 | |||
43 | devShell = forAllSystems (system: | ||
44 | let | ||
45 | pkgs = nixpkgsFor."${system}"; | ||
46 | toolchain = (rustChannel pkgs).withComponents [ | ||
47 | "rustc" | ||
48 | "cargo" | ||
49 | "rust-std" | ||
50 | "rustfmt" | ||
51 | "clippy" | ||
52 | "rust-src" | ||
53 | ]; | ||
54 | inherit (fenix.packages."${system}") rust-analyzer; | ||
55 | in | ||
56 | pkgs.mkShell { | ||
57 | nativeBuildInputs = [ | ||
58 | pkgs.bacon | ||
59 | pkgs.cargo-insta | ||
60 | rust-analyzer | ||
61 | toolchain | ||
62 | ]; | ||
63 | RUST_LOG = "info"; | ||
64 | RUST_BACKTRACE = 1; | ||
65 | }); | ||
66 | }; | ||
67 | } | ||
@@ -0,0 +1,38 @@ | |||
1 | Accepts a list of files as args and returns pairs of duplicate files, one pair | ||
2 | per line. | ||
3 | |||
4 | Usage: | ||
5 | ----- | ||
6 | |||
7 | cargo run --release --quiet -- [FILES] | ||
8 | |||
9 | |||
10 | Example: | ||
11 | ------- | ||
12 | |||
13 | cargo run ---release --quiet -- js-files/*.js | ||
14 | |||
15 | |||
16 | Internals: | ||
17 | --------- | ||
18 | |||
19 | The tool uses tree-sitter to produce ASTs for the given files. It then lazily | ||
20 | traverses the trees of the two files to be compared and exits on encountering | ||
21 | the first structural difference in the ASTs. | ||
22 | |||
23 | |||
24 | Known issues: | ||
25 | ------------ | ||
26 | |||
27 | - A fully commented-out file is equivalent to every other fully commented-out | ||
28 | file and to empty files | ||
29 | |||
30 | - Does not account for equivalence of unordered children: | ||
31 | |||
32 | ==== file1.rs ==== ==== file2.rs ==== | ||
33 | fn bar(x, y, z) {} fn foo(a, b) {} | ||
34 | fn foo(a, b) {} fn bar(x, y, z) {} | ||
35 | |||
36 | Here, `function` nodes are "unordered" children, the files are structurally | ||
37 | equivalent because the order of `function` nodes is irrelevant. However, the | ||
38 | tool considers these files to be unique. | ||
diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..64bd094 --- /dev/null +++ b/src/error.rs | |||
@@ -0,0 +1,17 @@ | |||
1 | // std | ||
2 | use std::io; | ||
3 | |||
4 | // extern | ||
5 | use thiserror::Error; | ||
6 | |||
7 | #[derive(Error, Debug)] | ||
8 | pub enum SimError { | ||
9 | #[error("unable to set language")] | ||
10 | UnableToSetLanguage, | ||
11 | |||
12 | #[error("parser does not have a corresponding tree-sitter language")] | ||
13 | LanguageNotSet, | ||
14 | |||
15 | #[error("io error: {0}")] | ||
16 | IoError(#[from] io::Error), | ||
17 | } | ||
diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b708984 --- /dev/null +++ b/src/lib.rs | |||
@@ -0,0 +1,220 @@ | |||
1 | // locals | ||
2 | mod error; | ||
3 | mod traverse; | ||
4 | |||
5 | // std | ||
6 | use std::{ | ||
7 | fmt, | ||
8 | ops::Not, | ||
9 | path::{Path, PathBuf}, | ||
10 | }; | ||
11 | |||
12 | // extern | ||
13 | use error::SimError; | ||
14 | use itertools::{EitherOrBoth, Itertools}; | ||
15 | use traverse::PreOrder; | ||
16 | use tree_sitter::{Node, Parser, Tree}; | ||
17 | |||
18 | type Result<T> = std::result::Result<T, SimError>; | ||
19 | |||
20 | // Check if two tree-sitter objects (trees or nodes etc.) are "similar" | ||
21 | pub trait Similarity { | ||
22 | // This function accepts a predicate to filter out nodes, | ||
23 | // for e.g.: trivia such as comments and whitespace may be filtered | ||
24 | // out with `!node.is_extra()` | ||
25 | fn is_similar<P>(&self, other: &Self, predicate: P) -> bool | ||
26 | where | ||
27 | P: Fn(&Node) -> bool; | ||
28 | } | ||
29 | |||
30 | // Representation of a fully parsed file from the filesystem | ||
31 | pub struct ProgramFile { | ||
32 | pub path: PathBuf, | ||
33 | pub src: String, | ||
34 | pub tree: Tree, | ||
35 | } | ||
36 | |||
37 | impl Similarity for ProgramFile { | ||
38 | fn is_similar<P>(&self, other: &Self, predicate: P) -> bool | ||
39 | where | ||
40 | P: Fn(&Node) -> bool, | ||
41 | { | ||
42 | // The order of traversal is unimportant | ||
43 | let mine = PreOrder::new(&self.tree).filter(|n| predicate(n)); | ||
44 | let theirs = PreOrder::new(&other.tree).filter(|n| predicate(n)); | ||
45 | |||
46 | // Simultaneously traverse both the trees, and find dissimilarities. | ||
47 | // | ||
48 | // A dissimilarity is when: | ||
49 | // - one tree's traversal is longer than the other | ||
50 | // - the trees have identical traversal lengths but non-identical | ||
51 | // nodes along the traversal | ||
52 | mine.zip_longest(theirs) | ||
53 | .any(|result| match result { | ||
54 | EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), | ||
55 | EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, | ||
56 | }) | ||
57 | .not() | ||
58 | } | ||
59 | } | ||
60 | |||
61 | impl fmt::Debug for ProgramFile { | ||
62 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
63 | f.debug_struct("ProgramFile") | ||
64 | .field("path", &self.path) | ||
65 | .field("src", &self.src.len()) | ||
66 | .finish() | ||
67 | } | ||
68 | } | ||
69 | |||
70 | impl ProgramFile { | ||
71 | pub fn new<P: AsRef<Path>>(path: P, parser: &mut Parser) -> Result<Self> { | ||
72 | let src = std::fs::read_to_string(&path)?; | ||
73 | let path = path.as_ref().to_owned(); | ||
74 | let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; | ||
75 | Ok(Self { path, src, tree }) | ||
76 | } | ||
77 | } | ||
78 | |||
79 | #[cfg(test)] | ||
80 | mod test { | ||
81 | use super::*; | ||
82 | use tree_sitter::Parser; | ||
83 | |||
84 | // Utility to quickly bootstrap ProgramFiles | ||
85 | // and run similarity checks | ||
86 | fn is_similar(mine: &str, their: &str) -> bool { | ||
87 | let lang = tree_sitter_javascript::language(); | ||
88 | let mut parser = Parser::new(); | ||
89 | parser.set_language(lang).unwrap(); | ||
90 | |||
91 | let my_program_file = ProgramFile { | ||
92 | path: PathBuf::from("mine"), | ||
93 | src: mine.to_owned(), | ||
94 | tree: parser.parse(&mine, None).unwrap(), | ||
95 | }; | ||
96 | |||
97 | let their_program_file = ProgramFile { | ||
98 | path: PathBuf::from("their"), | ||
99 | src: their.to_owned(), | ||
100 | tree: parser.parse(&their, None).unwrap(), | ||
101 | }; | ||
102 | |||
103 | my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) | ||
104 | } | ||
105 | |||
106 | #[test] | ||
107 | fn trivial() { | ||
108 | assert!(is_similar( | ||
109 | r#" | ||
110 | var x = 2; | ||
111 | "#, | ||
112 | r#" | ||
113 | var y = 5; | ||
114 | "# | ||
115 | )) | ||
116 | } | ||
117 | |||
118 | #[test] | ||
119 | fn automorphism() { | ||
120 | assert!(is_similar( | ||
121 | r#" | ||
122 | function(x) { | ||
123 | x * 2 | ||
124 | } | ||
125 | "#, | ||
126 | r#" | ||
127 | function(x) { | ||
128 | x * 2 | ||
129 | } | ||
130 | "# | ||
131 | )) | ||
132 | } | ||
133 | |||
134 | #[test] | ||
135 | fn sample() { | ||
136 | assert!(is_similar( | ||
137 | r#" | ||
138 | var scrapeWebsite = async function scrapeWebsite(website) { | ||
139 | try { | ||
140 | var url = normalizeUrl(website); | ||
141 | var html = await fetch(url).then(function (res) { | ||
142 | return res.text(); | ||
143 | }); | ||
144 | var $ = load(html); | ||
145 | var handles = {}; | ||
146 | module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { | ||
147 | handles[socialNetwork] = parse(socialNetwork)($); | ||
148 | }) | ||
149 | return handles; | ||
150 | } catch (error) { | ||
151 | throw new Error("Error scraping website data for " + website + ": " + error.message) | ||
152 | } | ||
153 | } | ||
154 | "#, | ||
155 | r#" | ||
156 | var parseWebsite = async function parseWebsite(website) { | ||
157 | try { | ||
158 | var url = normalizeUrl(website); | ||
159 | var html = await fetch(url).then(function (res) { | ||
160 | return res.text(); | ||
161 | }); | ||
162 | var $ = load(html); | ||
163 | var handles = {}; | ||
164 | module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { | ||
165 | handles[socialNetwork] = parse(socialNetwork)($); | ||
166 | }) | ||
167 | return handles; | ||
168 | } catch (error) { | ||
169 | throw new Error("Error fetching website data for " + website + ": " + error.message) | ||
170 | } | ||
171 | } | ||
172 | "# | ||
173 | )) | ||
174 | } | ||
175 | |||
176 | #[test] | ||
177 | fn ignore_comment_trivia() { | ||
178 | assert!(is_similar( | ||
179 | r#" | ||
180 | // doubles x | ||
181 | function(x) { | ||
182 | x * 2 | ||
183 | } | ||
184 | "#, | ||
185 | r#" | ||
186 | function(x) { | ||
187 | x * 2 // doubles x | ||
188 | } | ||
189 | "# | ||
190 | )) | ||
191 | } | ||
192 | |||
193 | #[test] | ||
194 | fn ignore_whitespace_trivia() { | ||
195 | assert!(is_similar( | ||
196 | r#" | ||
197 | function(x) { | ||
198 | return x * 2; | ||
199 | } | ||
200 | "#, | ||
201 | r#" | ||
202 | function(x ) { | ||
203 | return x * 2; | ||
204 | } | ||
205 | "# | ||
206 | )) | ||
207 | } | ||
208 | |||
209 | #[test] | ||
210 | fn dissimilar() { | ||
211 | assert!(!is_similar( | ||
212 | r#" | ||
213 | var t = function(x, y) { } | ||
214 | "#, | ||
215 | r#" | ||
216 | var l = function(x) { } | ||
217 | "# | ||
218 | )) | ||
219 | } | ||
220 | } | ||
diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..77c7584 --- /dev/null +++ b/src/main.rs | |||
@@ -0,0 +1,43 @@ | |||
1 | // extern | ||
2 | use itertools::Itertools; | ||
3 | use rayon::prelude::*; | ||
4 | use similarity::{ProgramFile, Similarity}; | ||
5 | use tree_sitter::Parser; | ||
6 | |||
7 | fn parser() -> Parser { | ||
8 | let mut parser = Parser::new(); | ||
9 | parser | ||
10 | .set_language(tree_sitter_javascript::language()) | ||
11 | .unwrap(); | ||
12 | parser | ||
13 | } | ||
14 | |||
15 | fn main() { | ||
16 | let args = std::env::args().skip(1).collect::<Vec<_>>(); | ||
17 | |||
18 | // read and parse files from disk | ||
19 | let (files, errors): (Vec<_>, Vec<_>) = args | ||
20 | .par_iter() | ||
21 | .map(|arg| ProgramFile::new(arg, &mut parser())) | ||
22 | .partition(Result::is_ok); | ||
23 | |||
24 | // dump errors if any | ||
25 | errors.into_iter().map(Result::unwrap_err).for_each(|err| { | ||
26 | eprintln!("{}", err); | ||
27 | }); | ||
28 | |||
29 | let files: Vec<_> = files.into_iter().map(Result::unwrap).collect(); | ||
30 | |||
31 | // compare every pair of files for similarity | ||
32 | (0..files.len()) | ||
33 | .combinations(2) | ||
34 | .map(|item| (item[0], item[1])) | ||
35 | .par_bridge() | ||
36 | .for_each(|(one, two)| { | ||
37 | let one = &files[one]; | ||
38 | let two = &files[two]; | ||
39 | if one.is_similar(&two, |n| !n.is_extra()) && one.path != two.path { | ||
40 | println!("{} {}", one.path.display(), two.path.display()); | ||
41 | } | ||
42 | }); | ||
43 | } | ||
diff --git a/src/traverse.rs b/src/traverse.rs new file mode 100644 index 0000000..df13f5d --- /dev/null +++ b/src/traverse.rs | |||
@@ -0,0 +1,51 @@ | |||
1 | // Lazy pre-order traversal over arbitrary tree-sitter trees | ||
2 | |||
3 | // extern | ||
4 | use tree_sitter::{Node, Tree, TreeCursor}; | ||
5 | |||
6 | pub struct PreOrder<'a> { | ||
7 | cursor: TreeCursor<'a>, | ||
8 | done: bool, | ||
9 | } | ||
10 | |||
11 | impl<'a> PreOrder<'a> { | ||
12 | pub fn new(tree: &'a Tree) -> Self { | ||
13 | Self { | ||
14 | cursor: tree.walk(), | ||
15 | done: false, | ||
16 | } | ||
17 | } | ||
18 | } | ||
19 | |||
20 | // Traverse the tree in root-left-right fashion | ||
21 | impl<'a> Iterator for PreOrder<'a> { | ||
22 | type Item = Node<'a>; | ||
23 | |||
24 | fn next(&mut self) -> Option<Self::Item> { | ||
25 | if self.done { | ||
26 | return None; | ||
27 | } | ||
28 | |||
29 | // capture the root node | ||
30 | let node = self.cursor.node(); | ||
31 | |||
32 | // if this node has a child, move there and return the root, or | ||
33 | // if this node has a sibling, move there and return the root | ||
34 | if self.cursor.goto_first_child() || self.cursor.goto_next_sibling() { | ||
35 | return Some(node); | ||
36 | } | ||
37 | |||
38 | loop { | ||
39 | // we are done retracing all the way back to the root node | ||
40 | if !self.cursor.goto_parent() { | ||
41 | self.done = true; | ||
42 | break; | ||
43 | } | ||
44 | |||
45 | if self.cursor.goto_next_sibling() { | ||
46 | break; | ||
47 | } | ||
48 | } | ||
49 | Some(node) | ||
50 | } | ||
51 | } | ||