From efd96e8df6805a45aaf5822141dee11c642b51ae Mon Sep 17 00:00:00 2001 From: Akshay Date: Tue, 2 Aug 2022 16:57:04 +0530 Subject: init --- .gitignore | 4 + Cargo.lock | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 19 ++++ flake.lock | 87 ++++++++++++++++++ flake.nix | 67 ++++++++++++++ readme | 38 ++++++++ src/error.rs | 17 ++++ src/lib.rs | 220 +++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 43 +++++++++ src/traverse.rs | 51 +++++++++++ 10 files changed, 815 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 readme create mode 100644 src/error.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/traverse.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a2c5a6d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/target +/js-files +.direnv +.envrc diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..6c0cbfd --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,269 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bloop" +version = "0.1.0" +dependencies = [ + "itertools", + "rayon", + "thiserror", + "tree-sitter", + "tree-sitter-javascript", +] + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "crossbeam-channel" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "once_cell", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "either" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "itertools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +dependencies = [ + "either", +] + +[[package]] +name = "libc" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" + +[[package]] +name = "proc-macro2" +version = "1.0.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c278e965f1d8cf32d6e0e96de3d3e79712178ae67986d9cf9151f51e95aac89b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "regex" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "syn" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tree-sitter" +version = "0.20.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "268bf3e3ca0c09e5d21b59c2638e12cb6dcf7ea2681250a696a2d0936cb57ba0" +dependencies = [ + "cc", + "regex", +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2490fab08630b2c8943c320f7b63473cbf65511c8d83aec551beb9b4375906ed" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "unicode-ident" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..579c1c4 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "bloop" +version = "0.1.0" +edition = "2021" + +[lib] +name = "similarity" +path = "src/lib.rs" + +[[bin]] +name = "sim" +path = "src/main.rs" + +[dependencies] +tree-sitter-javascript = "0.20" +tree-sitter = "0.20" +itertools = "0.10" +rayon = "1.5" +thiserror = "1.0" diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..a7c083f --- /dev/null +++ b/flake.lock @@ -0,0 +1,87 @@ +{ + "nodes": { + "fenix": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ], + "rust-analyzer-src": "rust-analyzer-src" + }, + "locked": { + "lastModified": 1645251813, + "narHash": "sha256-cQ66tGjnZclBCS3nD26mZ5fUH+3/HnysGffBiWXUSHk=", + "owner": "nix-community", + "repo": "fenix", + "rev": "9892337b588c38ec59466a1c89befce464aae7f8", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "fenix", + "type": "github" + } + }, + "gitignore": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1635165013, + "narHash": "sha256-o/BdVjNwcB6jOmzZjOH703BesSkkS5O7ej3xhyO8hAY=", + "owner": "hercules-ci", + "repo": "gitignore.nix", + "rev": "5b9e0ff9d3b551234b4f3eb3983744fa354b17f1", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "gitignore.nix", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1645013224, + "narHash": "sha256-b7OEC8vwzJv3rsz9pwnTX2LQDkeOWz2DbKypkVvNHXc=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "b66b39216b1fef2d8c33cc7a5c72d8da80b79970", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "fenix": "fenix", + "gitignore": "gitignore", + "nixpkgs": "nixpkgs" + } + }, + "rust-analyzer-src": { + "flake": false, + "locked": { + "lastModified": 1645205556, + "narHash": "sha256-e4lZW3qRyOEJ+vLKFQP7m2Dxh5P44NrnekZYLxlucww=", + "owner": "rust-analyzer", + "repo": "rust-analyzer", + "rev": "acf5874b39f3dc5262317a6074d9fc7285081161", + "type": "github" + }, + "original": { + "owner": "rust-analyzer", + "ref": "nightly", + "repo": "rust-analyzer", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..dcdf3b7 --- /dev/null +++ b/flake.nix @@ -0,0 +1,67 @@ +{ + inputs = { + + nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable"; + + fenix = { + url = "github:nix-community/fenix"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + + gitignore = { + url = "github:hercules-ci/gitignore.nix"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + + }; + + outputs = + { self + , nixpkgs + , fenix + , gitignore + }: + let + inherit (gitignore.lib) gitignoreSource; + + supportedSystems = [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ]; + forAllSystems = nixpkgs.lib.genAttrs supportedSystems; + nixpkgsFor = forAllSystems (system: + import nixpkgs { inherit system; }); + + chanspec = { + date = "2022-02-06"; + channel = "nightly"; + sha256 = "oKkTWopCDx4tphzTtRn+zDDtvmIZrL/H44tV2ruSfDw="; # set zeros after modifying channel or date + }; + rustChannel = p: (fenix.overlay p p).fenix.toolchainOf chanspec; + + in + { + + + devShell = forAllSystems (system: + let + pkgs = nixpkgsFor."${system}"; + toolchain = (rustChannel pkgs).withComponents [ + "rustc" + "cargo" + "rust-std" + "rustfmt" + "clippy" + "rust-src" + ]; + inherit (fenix.packages."${system}") rust-analyzer; + in + pkgs.mkShell { + nativeBuildInputs = [ + pkgs.bacon + pkgs.cargo-insta + rust-analyzer + toolchain + ]; + RUST_LOG = "info"; + RUST_BACKTRACE = 1; + }); + }; +} diff --git a/readme b/readme new file mode 100644 index 0000000..869e43e --- /dev/null +++ b/readme @@ -0,0 +1,38 @@ +Accepts a list of files as args and returns pairs of duplicate files, one pair +per line. + +Usage: +----- + + cargo run --release --quiet -- [FILES] + + +Example: +------- + + cargo run ---release --quiet -- js-files/*.js + + +Internals: +--------- + +The tool uses tree-sitter to produce ASTs for the given files. It then lazily +traverses the trees of the two files to be compared and exits on encountering +the first structural difference in the ASTs. + + +Known issues: +------------ + +- A fully commented-out file is equivalent to every other fully commented-out + file and to empty files + +- Does not account for equivalence of unordered children: + + ==== file1.rs ==== ==== file2.rs ==== + fn bar(x, y, z) {} fn foo(a, b) {} + fn foo(a, b) {} fn bar(x, y, z) {} + + Here, `function` nodes are "unordered" children, the files are structurally + equivalent because the order of `function` nodes is irrelevant. However, the + tool considers these files to be unique. diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..64bd094 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,17 @@ +// std +use std::io; + +// extern +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum SimError { + #[error("unable to set language")] + UnableToSetLanguage, + + #[error("parser does not have a corresponding tree-sitter language")] + LanguageNotSet, + + #[error("io error: {0}")] + IoError(#[from] io::Error), +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b708984 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,220 @@ +// locals +mod error; +mod traverse; + +// std +use std::{ + fmt, + ops::Not, + path::{Path, PathBuf}, +}; + +// extern +use error::SimError; +use itertools::{EitherOrBoth, Itertools}; +use traverse::PreOrder; +use tree_sitter::{Node, Parser, Tree}; + +type Result = std::result::Result; + +// Check if two tree-sitter objects (trees or nodes etc.) are "similar" +pub trait Similarity { + // This function accepts a predicate to filter out nodes, + // for e.g.: trivia such as comments and whitespace may be filtered + // out with `!node.is_extra()` + fn is_similar

(&self, other: &Self, predicate: P) -> bool + where + P: Fn(&Node) -> bool; +} + +// Representation of a fully parsed file from the filesystem +pub struct ProgramFile { + pub path: PathBuf, + pub src: String, + pub tree: Tree, +} + +impl Similarity for ProgramFile { + fn is_similar

(&self, other: &Self, predicate: P) -> bool + where + P: Fn(&Node) -> bool, + { + // The order of traversal is unimportant + let mine = PreOrder::new(&self.tree).filter(|n| predicate(n)); + let theirs = PreOrder::new(&other.tree).filter(|n| predicate(n)); + + // Simultaneously traverse both the trees, and find dissimilarities. + // + // A dissimilarity is when: + // - one tree's traversal is longer than the other + // - the trees have identical traversal lengths but non-identical + // nodes along the traversal + mine.zip_longest(theirs) + .any(|result| match result { + EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), + EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, + }) + .not() + } +} + +impl fmt::Debug for ProgramFile { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ProgramFile") + .field("path", &self.path) + .field("src", &self.src.len()) + .finish() + } +} + +impl ProgramFile { + pub fn new>(path: P, parser: &mut Parser) -> Result { + let src = std::fs::read_to_string(&path)?; + let path = path.as_ref().to_owned(); + let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; + Ok(Self { path, src, tree }) + } +} + +#[cfg(test)] +mod test { + use super::*; + use tree_sitter::Parser; + + // Utility to quickly bootstrap ProgramFiles + // and run similarity checks + fn is_similar(mine: &str, their: &str) -> bool { + let lang = tree_sitter_javascript::language(); + let mut parser = Parser::new(); + parser.set_language(lang).unwrap(); + + let my_program_file = ProgramFile { + path: PathBuf::from("mine"), + src: mine.to_owned(), + tree: parser.parse(&mine, None).unwrap(), + }; + + let their_program_file = ProgramFile { + path: PathBuf::from("their"), + src: their.to_owned(), + tree: parser.parse(&their, None).unwrap(), + }; + + my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) + } + + #[test] + fn trivial() { + assert!(is_similar( + r#" + var x = 2; + "#, + r#" + var y = 5; + "# + )) + } + + #[test] + fn automorphism() { + assert!(is_similar( + r#" + function(x) { + x * 2 + } + "#, + r#" + function(x) { + x * 2 + } + "# + )) + } + + #[test] + fn sample() { + assert!(is_similar( + r#" + var scrapeWebsite = async function scrapeWebsite(website) { + try { + var url = normalizeUrl(website); + var html = await fetch(url).then(function (res) { + return res.text(); + }); + var $ = load(html); + var handles = {}; + module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { + handles[socialNetwork] = parse(socialNetwork)($); + }) + return handles; + } catch (error) { + throw new Error("Error scraping website data for " + website + ": " + error.message) + } + } + "#, + r#" + var parseWebsite = async function parseWebsite(website) { + try { + var url = normalizeUrl(website); + var html = await fetch(url).then(function (res) { + return res.text(); + }); + var $ = load(html); + var handles = {}; + module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { + handles[socialNetwork] = parse(socialNetwork)($); + }) + return handles; + } catch (error) { + throw new Error("Error fetching website data for " + website + ": " + error.message) + } + } + "# + )) + } + + #[test] + fn ignore_comment_trivia() { + assert!(is_similar( + r#" + // doubles x + function(x) { + x * 2 + } + "#, + r#" + function(x) { + x * 2 // doubles x + } + "# + )) + } + + #[test] + fn ignore_whitespace_trivia() { + assert!(is_similar( + r#" + function(x) { +return x * 2; + } + "#, + r#" + function(x ) { + return x * 2; + } + "# + )) + } + + #[test] + fn dissimilar() { + assert!(!is_similar( + r#" + var t = function(x, y) { } + "#, + r#" + var l = function(x) { } + "# + )) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..77c7584 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,43 @@ +// extern +use itertools::Itertools; +use rayon::prelude::*; +use similarity::{ProgramFile, Similarity}; +use tree_sitter::Parser; + +fn parser() -> Parser { + let mut parser = Parser::new(); + parser + .set_language(tree_sitter_javascript::language()) + .unwrap(); + parser +} + +fn main() { + let args = std::env::args().skip(1).collect::>(); + + // read and parse files from disk + let (files, errors): (Vec<_>, Vec<_>) = args + .par_iter() + .map(|arg| ProgramFile::new(arg, &mut parser())) + .partition(Result::is_ok); + + // dump errors if any + errors.into_iter().map(Result::unwrap_err).for_each(|err| { + eprintln!("{}", err); + }); + + let files: Vec<_> = files.into_iter().map(Result::unwrap).collect(); + + // compare every pair of files for similarity + (0..files.len()) + .combinations(2) + .map(|item| (item[0], item[1])) + .par_bridge() + .for_each(|(one, two)| { + let one = &files[one]; + let two = &files[two]; + if one.is_similar(&two, |n| !n.is_extra()) && one.path != two.path { + println!("{} {}", one.path.display(), two.path.display()); + } + }); +} diff --git a/src/traverse.rs b/src/traverse.rs new file mode 100644 index 0000000..df13f5d --- /dev/null +++ b/src/traverse.rs @@ -0,0 +1,51 @@ +// Lazy pre-order traversal over arbitrary tree-sitter trees + +// extern +use tree_sitter::{Node, Tree, TreeCursor}; + +pub struct PreOrder<'a> { + cursor: TreeCursor<'a>, + done: bool, +} + +impl<'a> PreOrder<'a> { + pub fn new(tree: &'a Tree) -> Self { + Self { + cursor: tree.walk(), + done: false, + } + } +} + +// Traverse the tree in root-left-right fashion +impl<'a> Iterator for PreOrder<'a> { + type Item = Node<'a>; + + fn next(&mut self) -> Option { + if self.done { + return None; + } + + // capture the root node + let node = self.cursor.node(); + + // if this node has a child, move there and return the root, or + // if this node has a sibling, move there and return the root + if self.cursor.goto_first_child() || self.cursor.goto_next_sibling() { + return Some(node); + } + + loop { + // we are done retracing all the way back to the root node + if !self.cursor.goto_parent() { + self.done = true; + break; + } + + if self.cursor.goto_next_sibling() { + break; + } + } + Some(node) + } +} -- cgit v1.2.3