From 9ccdaee79cdbcc76c8b855a0d50b5a5309ba8bb8 Mon Sep 17 00:00:00 2001 From: Akshay Date: Tue, 11 Apr 2023 22:40:24 +0530 Subject: add build steps --- .gitignore | 2 + Cargo.lock | 34 ++++++++++ Cargo.toml | 1 + flake.lock | 13 ++++ flake.nix | 43 ++++++++++++- src/consts.rs | 32 +--------- src/lex.rs | 33 ++++++---- src/lib.rs | 83 ++++++++++++++++++++++++ src/main.rs | 107 +++---------------------------- src/parse.rs | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/utils.rs | 5 +- 11 files changed, 407 insertions(+), 146 deletions(-) create mode 100644 src/lib.rs create mode 100644 src/parse.rs diff --git a/.gitignore b/.gitignore index 2d5df85..b0e70b2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /target .direnv +assets +result diff --git a/Cargo.lock b/Cargo.lock index 06c3f88..e49c8e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,3 +5,37 @@ version = 3 [[package]] name = "dict" version = "0.1.0" +dependencies = [ + "radix_trie", +] + +[[package]] +name = "endian-type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" + +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + +[[package]] +name = "radix_trie" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +dependencies = [ + "endian-type", + "nibble_vec", +] + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" diff --git a/Cargo.toml b/Cargo.toml index d2c0dc4..044c314 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,4 @@ edition = 
"2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +radix_trie = "0.2.1" diff --git a/flake.lock b/flake.lock index f231e8c..36b84eb 100644 --- a/flake.lock +++ b/flake.lock @@ -1,5 +1,17 @@ { "nodes": { + "en": { + "flake": false, + "locked": { + "narHash": "sha256-iSqDnYvb2UdUAN7VR4NntsEkIqgTgdW8LRDgCAJhFe4=", + "type": "file", + "url": "https://www.gutenberg.org/cache/epub/29765/pg29765.txt" + }, + "original": { + "type": "file", + "url": "https://www.gutenberg.org/cache/epub/29765/pg29765.txt" + } + }, "nixpkgs": { "locked": { "lastModified": 1677852945, @@ -15,6 +27,7 @@ }, "root": { "inputs": { + "en": "en", "nixpkgs": "nixpkgs" } } diff --git a/flake.nix b/flake.nix index a615408..8a9d489 100644 --- a/flake.nix +++ b/flake.nix @@ -1,22 +1,59 @@ { description = "A very basic flake"; - outputs = { self, nixpkgs }: + inputs = { + en = { + url = "https://www.gutenberg.org/cache/epub/29765/pg29765.txt"; + flake = false; + }; + }; + + outputs = { self, en, nixpkgs }: let supportedSystems = [ "x86_64-linux" ]; forAllSystems = nixpkgs.lib.genAttrs supportedSystems; - nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; }); + nixpkgsFor = forAllSystems (system: import nixpkgs { + inherit system; + overlays = [ self.overlays.default ]; + }); in { + overlays.default = final: prev: { + dict = + let + pname = "dict"; + packageMeta = (final.lib.importTOML ./Cargo.toml).package; + in + final.rustPlatform.buildRustPackage { + inherit pname; + inherit (packageMeta) version; + preUnpack = '' + mkdir assets + sed "1,27d;973899,$ d" ${en} > assets/en.txt + ${final.dos2unix}/bin/dos2unix assets/en.txt + ls -alh assets/en.txt + ''; + src = self; + cargoLock.lockFile = ./Cargo.lock; + }; + }; + + packages = forAllSystems (system: { + inherit (nixpkgsFor."${system}") dict; + }); + + defaultPackage = forAllSystems (system: self.packages."${system}".dict); + devShell = forAllSystems (system: let - 
pkgs = nixpkgsFor."${system}"; + pkgs = nixpkgsFor."${ system}"; in pkgs.mkShell { nativeBuildInputs = [ pkgs.rustc pkgs.cargo + pkgs.rustfmt pkgs.rust-analyzer pkgs.cargo-watch ]; diff --git a/src/consts.rs b/src/consts.rs index 446c341..c606a95 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -1,31 +1 @@ -pub const SRC: &str = include_str!("../assets/en.txt"); -// pub const SRC: &str = r"A -// A (named a in the English, and most commonly ä in other languages). -// -// Defn: The first letter of the English and of many other alphabets. -// The capital A of the alphabets of Middle and Western Europe, as also -// the small letter (a), besides the forms in Italic, black letter, -// etc., are all descended from the old Latin A, which was borrowed from -// the Greek Alpha, of the same form; and this was made from the first -// letter (Aleph, and itself from the Egyptian origin. The Aleph was a -// consonant letter, with a guttural breath sound that was not an -// element of Greek articulation; and the Greeks took it to represent -// their vowel Alpha with the ä sound, the Phoenician alphabet having no -// vowel symbols. This letter, in English, is used for several different -// vowel sounds. See Guide to pronunciation, §§ 43-74. The regular long -// a, as in fate, etc., is a comparatively modern sound, and has taken -// the place of what, till about the early part of the 17th century, was -// a sound of the quality of ä (as in far). -// -// 2. (Mus.) -// -// Defn: The name of the sixth tone in the model major scale (that in -// C), or the first tone of the minor scale, which is named after it the -// scale in A minor. The second string of the violin is tuned to the A -// in the treble staff. -// -- A sharp (A#) is the name of a musical tone intermediate between A -// and B. 
-// -- A flat (A) is the name of a tone intermediate between A and G."; -// -// -// +pub const SRC: &str = include_str!("../../assets/en.txt"); diff --git a/src/lex.rs b/src/lex.rs index 0f9a535..701009a 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -1,5 +1,6 @@ use crate::utils::FromStaticStr; +#[derive(Debug)] pub enum Stanza { Entry(&'static str), Defn(&'static str), @@ -11,7 +12,8 @@ pub enum Stanza { impl Stanza { fn is_entry(s: &str) -> bool { - s.chars().all(|c| c.is_uppercase() || c.is_ascii_whitespace() || "-;'.".contains(c)) + s.chars() + .all(|c| c.is_uppercase() || c.is_ascii_whitespace() || "-;'.".contains(c)) } fn is_defn(s: &str) -> bool { @@ -27,7 +29,9 @@ impl Stanza { } fn is_bullet(s: &str) -> bool { - s.find('.').map(|idx| s[..idx].chars().all(char::is_numeric)).unwrap_or_default() + s.find('.') + .map(|idx| s[..idx].chars().all(char::is_numeric)) + .unwrap_or_default() } fn is_sub_bullet(s: &str) -> bool { @@ -49,21 +53,23 @@ impl FromStaticStr for Stanza { if let Some(first_line) = lines.next() { if !first_line.is_empty() { if Stanza::is_entry(first_line) { - Ok(Self::Entry(s)) + Ok(Self::Entry(first_line.trim())) } else if Stanza::is_defn(first_line) { - Ok(Self::Defn(s)) + Ok(Self::Defn(s.strip_prefix("Defn: ").unwrap_or(s).trim())) } else if Stanza::is_note(first_line) { - Ok(Self::Note(s)) + Ok(Self::Note(s.strip_prefix("Note: ").unwrap_or(s).trim())) } else if Stanza::is_synonym(first_line) { - Ok(Self::Synonym(s)) + Ok(Self::Synonym(s.strip_prefix("Syn.").unwrap_or(s))) } else if Stanza::is_bullet(first_line) { - Ok(Self::Bullet(s)) - } else if Stanza::is_sub_bullet(first_line) { - Ok(Self::SubBullet(s)) + Ok(Self::Defn( + s.trim_start_matches(|c| "0123456789. 
".contains(c)), + )) + // } else if Stanza::is_sub_bullet(first_line) { + // Ok(Self::SubBullet(s)) } else { - Err(Self::Err { - data: format!("weird stanza: {}", s), - }) + Err(Self::Err { + data: format!("weird stanza: {}", s), + }) } } else { Err(Self::Err { @@ -78,3 +84,6 @@ impl FromStaticStr for Stanza { } } +pub fn lex(src: &'static str) -> impl Iterator> { + src.split("\n\n").map(Stanza::from_str) +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..1324f70 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,83 @@ +pub mod consts; +pub mod lex; +pub mod parse; +mod utils; + +use std::fmt; + +use radix_trie::{Trie, TrieCommon}; + +pub struct Dict { + inner: Trie, +} + +impl Dict { + fn new() -> Self { + Self { inner: Trie::new() } + } + + fn insert(&mut self, entry: DictKey, value: DictValue) { + self.inner.map_with_default( + entry, + |dict_value| { + // TODO: this only merges defns, not notes/syns + for v in value.defn.iter() { + dict_value.defn.push(v); + } + }, + value.clone(), + ); + } + + pub fn search<'dict, 'search>( + &'dict self, + search_term: &'search str, + ) -> SearchResults<'dict, 'search> { + self.inner + .subtrie(search_term) + .map_or(SearchResults::Empty, |subtrie| { + SearchResults::Hit(subtrie.iter()) + }) + } +} + +pub enum SearchResults<'dict, 'search> { + Empty, + Hit(radix_trie::iter::Iter<'dict, &'search str, DictValue>), +} + +impl<'dict, 'search> SearchResults<'dict, 'search> { + // mutable ref here to advance the iterator present in Self::Hit + pub fn print(&mut self) { + match self { + Self::Hit(results) => { + while let Some((key, value)) = results.next() { + if value.defn.len() > 1 { + for (def, idx) in value.defn.iter().zip(1..) 
{ + println!("{}({}) {}", key, idx, def.replace('\n', " ")); + } + } else { + println!("{} {}", key, value.defn[0].replace('\n', " ")); + } + + // if let Some(note) = value.note { + // print!("\t{}", note); + // } + // if let Some(synonym) = value.synonym { + // print!("\t{}", synonym); + // } + } + } + Self::Empty => (), + } + } +} + +type DictKey = &'static str; + +#[derive(Clone)] +pub struct DictValue { + defn: Vec<&'static str>, + note: Option<&'static str>, + synonym: Option<&'static str>, +} diff --git a/src/main.rs b/src/main.rs index e6d997d..9def90c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,101 +1,14 @@ -mod consts; -mod utils; -mod lex; - -use consts::SRC; -use lex::{Stanza, StanzaLexError}; -use utils::FromStaticStr; +use dict::{consts::SRC, lex, parse::ParseState}; fn main() { - let mut count = 0; - let mut parse_state = ParseState::Ready; - let mut current_entry = EntryBuilder::new(); - let mut dict = Dictionary { - entries: vec![], + let Some(search_term) = std::env::args().skip(1).next() else { + eprintln!("usage: dict "); + return; }; - - for l in SRC.split("\n\n") { - count += 1; - let stanza = match Stanza::from_str(l) { - Ok(s) => { - println!("{count} ok"); - s - }, - Err(StanzaLexError { data }) => { - eprintln!("stanza err: {data}\n\n"); - continue; - }, - }; - match stanza { - Stanza::Entry(s) if parse_state == ParseState::Ready => { - current_entry.set_name(s); - parse_state = ParseState::InEntry; - } - Stanza::Defn(d) if parse_state == ParseState::InEntry => { - current_entry.set_defn(d); - - match current_entry.build() { - Ok(e) => dict.entries.push(e), - Err(_) => eprintln!("failed to build entry"), - } - - parse_state = ParseState::Ready; - } - _ => () - } - } - dbg!(dict.entries.iter().find(|entry| entry.name.to_ascii_lowercase().starts_with("discursive"))); -} - -#[derive(PartialEq, Eq, PartialOrd, Ord)] -enum ParseState { - Ready, - InEntry -} - -struct Dictionary { - entries: Vec -} - -#[derive(Debug)] -struct Entry { - name: 
&'static str, - defn: Option<&'static str>, - note: Option<&'static str>, - synonym: Option<&'static str>, -} - -#[derive(Default)] -struct EntryBuilder { - name: Option<&'static str>, - defn: Option<&'static str>, - note: Option<&'static str>, - synonym: Option<&'static str>, -} - -enum EntryBuilderError { - MissingField(&'static str) -} - -impl EntryBuilder { - fn new() -> Self { - Self::default() - } - - fn set_name(&mut self, name: &'static str) { - self.name = Some(name); - } - - fn set_defn(&mut self, defn: &'static str) { - self.defn = Some(defn); - } - - fn build(&self) -> Result { - Ok(Entry { - name: self.name.ok_or(EntryBuilderError::MissingField("name"))?, - defn: self.defn, - note: self.note, - synonym: self.synonym, - }) - } + lex::lex(SRC) + .filter_map(Result::ok) + .fold(ParseState::new(), ParseState::advance) + .finish() + .search(search_term.to_ascii_uppercase().as_str()) + .print() } diff --git a/src/parse.rs b/src/parse.rs new file mode 100644 index 0000000..5b613ca --- /dev/null +++ b/src/parse.rs @@ -0,0 +1,200 @@ +use crate::{lex::Stanza, Dict}; + +pub struct ParseState { + dict: Dict, + status: Status, + current_entry: EntryBuilder, + errors: Vec, +} + +#[derive(Debug)] +enum ParseError { + Build(EntryBuilderError), + UndefinedState(Status, EntryBuilder, Stanza), +} + +impl ParseState { + pub fn new() -> Self { + Self { + dict: Dict::new(), + status: Status::Start, + current_entry: EntryBuilder::new(), + errors: Vec::new(), + } + } + + pub fn advance(mut self, stanza: Stanza) -> Self { + match (self.status, stanza) { + (Status::Start, Stanza::Entry(e)) => { + self.current_entry.set_name(e); + self.status = Status::ContainsName; + } + (Status::ContainsName, Stanza::Defn(d)) => { + self.current_entry.push_defn(d); + self.status = Status::ContainsOneDefn; + } + (Status::ContainsOneDefn | Status::ContainsMulDefn, Stanza::Defn(d)) => { + self.current_entry.push_defn(d); + self.status = Status::ContainsMulDefn; + } + ( + Status::ContainsOneDefn | 
Status::ContainsMulDefn | Status::ContainsSynonym, + Stanza::Note(n), + ) => { + self.current_entry.set_note(n); + self.status = Status::ContainsNote; + } + ( + Status::ContainsOneDefn | Status::ContainsMulDefn | Status::ContainsNote, + Stanza::Synonym(s), + ) => { + self.current_entry.set_synonym(s); + self.status = Status::ContainsSynonym; + } + ( + Status::ContainsOneDefn + | Status::ContainsMulDefn + | Status::ContainsNote + | Status::ContainsSynonym, + Stanza::Entry(e), + ) => { + // flush the current entry + match self.current_entry.build() { + Ok(entry) => self.dict.insert(entry.name, entry.into()), + Err(b) => self.register_error(ParseError::Build(b)), + }; + + // begin with the new one + self.current_entry.clear(); + self.current_entry.set_name(e); + self.status = Status::ContainsName; + } + (Status::ContainsName, Stanza::Entry(e)) => { + // dump unfinished entry and enter new entry + self.current_entry.clear(); + self.current_entry.set_name(e); + self.status = Status::ContainsName; + } + (_, new_entry) => { + // any other state means our parser is entering undefined territory + // register an error if we have anything in current_entry + self.register_undefined_state_error(new_entry); + // and set the status to Start and fast forward to the next entry + self.current_entry.clear(); + self.status = Status::Start; + } + } + self + } + + fn register_error(&mut self, error: ParseError) { + self.errors.push(error) + } + + fn register_undefined_state_error(&mut self, new_entry: Stanza) { + self.register_error(ParseError::UndefinedState( + self.status, + self.current_entry.clone(), + new_entry, + )); + } + + pub fn finish(self) -> Dict { + self.dict + } + + pub fn dump(&self) { + for err in &self.errors { + eprintln!("{err:?}"); + } + } +} + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +enum Status { + // ready to accept a new entry + Start, + // ready to accept a defn + ContainsName, + // can accept notes or synonyms, or flush this entry + 
ContainsOneDefn, + // can accept notes or synonyms, or flush this entry + ContainsMulDefn, + // can accept a synonym + ContainsNote, + // can accept a note + ContainsSynonym, + // mangled stanza, skip until the next entry occurs +} + +#[derive(Debug, Clone)] +struct Entry { + name: &'static str, + defn: Vec<&'static str>, + note: Option<&'static str>, + synonym: Option<&'static str>, +} + +impl From for crate::DictValue { + fn from(entry: Entry) -> Self { + Self { + defn: entry.defn, + note: entry.note, + synonym: entry.synonym, + } + } +} + +#[derive(Debug, Default, Clone)] +struct EntryBuilder { + name: Option<&'static str>, + defn: Vec<&'static str>, + note: Option<&'static str>, + synonym: Option<&'static str>, +} + +#[derive(Debug)] +enum EntryBuilderError { + MissingField(&'static str), +} + +impl EntryBuilder { + fn new() -> Self { + Self::default() + } + + fn clear(&mut self) { + *self = Self::default(); + } + + fn set_name(&mut self, name: &'static str) { + self.name = Some(name); + } + + fn push_defn(&mut self, defn: &'static str) { + self.defn.push(defn); + } + + fn set_note(&mut self, note: &'static str) { + self.note = Some(note); + } + + fn set_synonym(&mut self, synonym: &'static str) { + self.synonym = Some(synonym); + } + + fn build(&self) -> Result { + let name = self.name.ok_or(EntryBuilderError::MissingField("name"))?; + let defn = if self.defn.is_empty() { + return Err(EntryBuilderError::MissingField("defn")); + } else { + self.defn.clone() + }; + Ok(Entry { + name, + defn, + note: self.note, + synonym: self.synonym, + }) + } +} diff --git a/src/utils.rs b/src/utils.rs index 23fff7e..c53e564 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,7 +1,6 @@ - pub trait FromStaticStr { type Err; fn from_str(s: &'static str) -> Result - where Self: Sized; + where + Self: Sized; } - -- cgit v1.2.3