// locals mod error; mod traverse; // std use std::{ fmt, ops::Not, path::{Path, PathBuf}, }; // extern use error::SimError; use itertools::{EitherOrBoth, Itertools}; use traverse::PreOrder; use tree_sitter::{Node, Parser, Tree}; type Result = std::result::Result; const THRESHOLD: f64 = 0.6; // Check if two tree-sitter objects (trees or nodes etc.) are "similar" pub trait Similarity { // This function accepts a predicate to filter out nodes, // for e.g.: trivia such as comments and whitespace may be filtered // out with `!node.is_extra()` fn is_similar

(&self, other: &Self, predicate: P) -> bool where P: Fn(&Node) -> bool; } // Representation of a fully parsed file from the filesystem pub struct ProgramFile { pub path: PathBuf, pub src: String, pub tree: Tree, pub simhash: u64, } impl Similarity for ProgramFile { fn is_similar

(&self, other: &Self, predicate: P) -> bool where P: Fn(&Node) -> bool, { // The order of traversal is unimportant let mine = PreOrder::new(&self.tree).filter(|n| predicate(n)); let theirs = PreOrder::new(&other.tree).filter(|n| predicate(n)); // Simultaneously traverse both the trees, and find dissimilarities. // // A dissimilarity is when: // - one tree's traversal is longer than the other // - the trees have identical traversal lengths but non-identical // nodes along the traversal let structurally_similar = mine .zip_longest(theirs) .any(|result| match result { EitherOrBoth::Both(mine, theirs) => mine.kind_id() != theirs.kind_id(), EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => true, }) .not(); let textually_similar = (simhash::hash_similarity(self.simhash, other.simhash)) > THRESHOLD; structurally_similar && textually_similar } } impl fmt::Debug for ProgramFile { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ProgramFile") .field("path", &self.path) .field("src", &self.src.len()) .finish() } } impl ProgramFile { pub fn new>(path: P, parser: &mut Parser) -> Result { let src = std::fs::read_to_string(&path)?; let path = path.as_ref().to_owned(); let tree = parser.parse(&src, None).ok_or(SimError::LanguageNotSet)?; let simhash = simhash::simhash(&src); Ok(Self { path, src, tree, simhash, }) } } #[cfg(test)] mod test { use super::*; use tree_sitter::Parser; // Utility to quickly bootstrap ProgramFiles // and run similarity checks fn is_similar(mine: &str, their: &str) -> bool { let lang = tree_sitter_javascript::language(); let mut parser = Parser::new(); parser.set_language(lang).unwrap(); let my_program_file = ProgramFile { path: PathBuf::from("mine"), src: mine.to_owned(), tree: parser.parse(&mine, None).unwrap(), simhash: simhash::simhash(&mine), }; let their_program_file = ProgramFile { path: PathBuf::from("their"), src: their.to_owned(), tree: parser.parse(&their, None).unwrap(), simhash: simhash::simhash(&their), }; my_program_file.is_similar(&their_program_file, |n| !n.is_extra()) } #[test] fn trivial() { assert!(is_similar( r#" var x = 2; "#, r#" var y = 5; "# )) } #[test] fn automorphism() { assert!(is_similar( r#" function(x) { x * 2 } "#, r#" function(x) { x * 2 } "# )) } #[test] fn sample() { assert!(is_similar( r#" var scrapeWebsite = async function scrapeWebsite(website) { try { var url = normalizeUrl(website); var html = await fetch(url).then(function (res) { return res.text(); }); var $ = load(html); var handles = {}; module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { handles[socialNetwork] = parse(socialNetwork)($); }) return handles; } catch (error) { throw new Error("Error scraping website data for " + website + ": " + error.message) } } "#, r#" var parseWebsite = async function parseWebsite(website) { try { var url = normalizeUrl(website); var html = await fetch(url).then(function (res) { return res.text(); }); var $ = load(html); var handles = {}; module.exports.CONFIG.socialNetworks.forEach(function (socialNetwork) { handles[socialNetwork] = parse(socialNetwork)($); }) return handles; } catch (error) { throw new Error("Error fetching website data for " + website + ": " + error.message) } } "# )) } #[test] fn ignore_comment_trivia() { assert!(is_similar( r#" // doubles x function(x) { x * 2 } "#, r#" function(x) { x * 2 // doubles x } "# )) } #[test] fn ignore_whitespace_trivia() { assert!(is_similar( r#" function(x) { return x * 2; } "#, r#" function(x ) { return x * 2; } "# )) } #[test] fn dissimilar() { assert!(!is_similar( r#" var t = function(x, y) { } "#, r#" var l = function(x) { } "# )) } }