From efd96e8df6805a45aaf5822141dee11c642b51ae Mon Sep 17 00:00:00 2001 From: Akshay Date: Tue, 2 Aug 2022 16:57:04 +0530 Subject: init --- src/error.rs | 17 +++++ src/lib.rs | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 43 +++++++++++ src/traverse.rs | 51 +++++++++++++ 4 files changed, 331 insertions(+) create mode 100644 src/error.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/traverse.rs (limited to 'src') diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..64bd094 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,17 @@ +// std +use std::io; + +// extern +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum SimError { + #[error("unable to set language")] + UnableToSetLanguage, + + #[error("parser does not have a corresponding tree-sitter language")] + LanguageNotSet, + + #[error("io error: {0}")] + IoError(#[from] io::Error), +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b708984 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,220 @@ +// locals +mod error; +mod traverse; + +// std +use std::{ + fmt, + ops::Not, + path::{Path, PathBuf}, +}; + +// extern +use error::SimError; +use itertools::{EitherOrBoth, Itertools}; +use traverse::PreOrder; +use tree_sitter::{Node, Parser, Tree}; + +type Result = std::result::Result; + +// Check if two tree-sitter objects (trees or nodes etc.) are "similar" +pub trait Similarity { + // This function accepts a predicate to filter out nodes, + // for e.g.: trivia such as comments and whitespace may be filtered + // out with `!node.is_extra()` + fn is_similar

(&self, other: &Self, predicate: P) -> bool + where + P: Fn(&Node) -> bool; +} + +// Representation of a fully parsed file from the filesystem +pub struct ProgramFile { + pub path: PathBuf, + pub src: String, + pub tree: Tree, +} + +impl Similarity for ProgramFile { + fn is_similar

(&self, other: &Self, predicate: P) -> bool + where + P: Fn(&Node) -> bool, + { + // The order of traversal is unimportant + let mine = PreOrder::new(&self.tree).filter(|n| predicate(n)); + let theirs = PreOrder::new(&other.tree).filter(|n| predicate(n)); + + // Simultaneously traverse both the trees, and find dissimilarities. + // + // A dissimilarity is when: + // - one tree's traversal is longer than the other + // - the trees have identical traversal lengths but non-identical + // nodes along the traversal + mine.zip_longest(theirs) + .any(|result| match result { + EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), + EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, + }) + .not() + } +} + +impl fmt::Debug for ProgramFile { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ProgramFile") + .field("path", &self.path) + .field("src", &self.src.len()) + .finish() + } +} + +impl ProgramFile { + pub fn new>(path: P, parser: &mut Parser) -> Result { + let src = std::fs::read_to_string(&path)?; + let path = path.as_ref().to_owned(); + let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; + Ok(Self { path, src, tree }) + } +} + +#[cfg(test)] +mod test { + use super::*; + use tree_sitter::Parser; + + // Utility to quickly bootstrap ProgramFiles + // and run similarity checks + fn is_similar(mine: &str, their: &str) -> bool { + let lang = tree_sitter_javascript::language(); + let mut parser = Parser::new(); + parser.set_language(lang).unwrap(); + + let my_program_file = ProgramFile { + path: PathBuf::from("mine"), + src: mine.to_owned(), + tree: parser.parse(&mine, None).unwrap(), + }; + + let their_program_file = ProgramFile { + path: PathBuf::from("their"), + src: their.to_owned(), + tree: parser.parse(&their, None).unwrap(), + }; + + my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) + } + + #[test] + fn trivial() { + assert!(is_similar( + r#" + var x = 2; + "#, + r#" + var y = 5; + "# + )) + } + + #[test] + fn automorphism() { + assert!(is_similar( + r#" + function(x) { + x * 2 + } + "#, + r#" + function(x) { + x * 2 + } + "# + )) + } + + #[test] + fn sample() { + assert!(is_similar( + r#" + var scrapeWebsite = async function scrapeWebsite(website) { + try { + var url = normalizeUrl(website); + var html = await fetch(url).then(function (res) { + return res.text(); + }); + var $ = load(html); + var handles = {}; + module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { + handles[socialNetwork] = parse(socialNetwork)($); + }) + return handles; + } catch (error) { + throw new Error("Error scraping website data for " + website + ": " + error.message) + } + } + "#, + r#" + var parseWebsite = async function parseWebsite(website) { + try { + var url = normalizeUrl(website); + var html = await fetch(url).then(function (res) { + return res.text(); + }); + var $ = load(html); + var handles = {}; + module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { + handles[socialNetwork] = parse(socialNetwork)($); + }) + return handles; + } catch (error) { + throw new Error("Error fetching website data for " + website + ": " + error.message) + } + } + "# + )) + } + + #[test] + fn ignore_comment_trivia() { + assert!(is_similar( + r#" + // doubles x + function(x) { + x * 2 + } + "#, + r#" + function(x) { + x * 2 // doubles x + } + "# + )) + } + + #[test] + fn ignore_whitespace_trivia() { + assert!(is_similar( + r#" + function(x) { +return x * 2; + } + "#, + r#" + function(x ) { + return x * 2; + } + "# + )) + } + + #[test] + fn dissimilar() { + assert!(!is_similar( + r#" + var t = function(x, y) { } + "#, + r#" + var l = function(x) { } + "# + )) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..77c7584 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,43 @@ +// extern +use itertools::Itertools; +use rayon::prelude::*; +use similarity::{ProgramFile, Similarity}; +use tree_sitter::Parser; + +fn parser() -> Parser { + let mut parser = Parser::new(); + parser + .set_language(tree_sitter_javascript::language()) + .unwrap(); + parser +} + +fn main() { + let args = std::env::args().skip(1).collect::>(); + + // read and parse files from disk + let (files, errors): (Vec<_>, Vec<_>) = args + .par_iter() + .map(|arg| ProgramFile::new(arg, &mut parser())) + .partition(Result::is_ok); + + // dump errors if any + errors.into_iter().map(Result::unwrap_err).for_each(|err| { + eprintln!("{}", err); + }); + + let files: Vec<_> = files.into_iter().map(Result::unwrap).collect(); + + // compare every pair of files for similarity + (0..files.len()) + .combinations(2) + .map(|item| (item[0], item[1])) + .par_bridge() + .for_each(|(one, two)| { + let one = &files[one]; + let two = &files[two]; + if one.is_similar(&two, |n| !n.is_extra()) && one.path != two.path { + println!("{} {}", one.path.display(), two.path.display()); + } + }); +} diff --git a/src/traverse.rs b/src/traverse.rs new file mode 100644 index 0000000..df13f5d --- /dev/null +++ b/src/traverse.rs @@ -0,0 +1,51 @@ +// Lazy pre-order traversal over arbitrary tree-sitter trees + +// extern +use tree_sitter::{Node, Tree, TreeCursor}; + +pub struct PreOrder<'a> { + cursor: TreeCursor<'a>, + done: bool, +} + +impl<'a> PreOrder<'a> { + pub fn new(tree: &'a Tree) -> Self { + Self { + cursor: tree.walk(), + done: false, + } + } +} + +// Traverse the tree in root-left-right fashion +impl<'a> Iterator for PreOrder<'a> { + type Item = Node<'a>; + + fn next(&mut self) -> Option { + if self.done { + return None; + } + + // capture the root node + let node = self.cursor.node(); + + // if this node has a child, move there and return the root, or + // if this node has a sibling, move there and return the root + if self.cursor.goto_first_child() || self.cursor.goto_next_sibling() { + return Some(node); + } + + loop { + // we are done retracing all the way back to the root node + if !self.cursor.goto_parent() { + self.done = true; + break; + } + + if self.cursor.goto_next_sibling() { + break; + } + } + Some(node) + } +} -- cgit v1.2.3