aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 2
-rw-r--r-- Cargo.lock 34
-rw-r--r-- Cargo.toml 1
-rw-r--r-- flake.lock 13
-rw-r--r-- flake.nix 43
-rw-r--r-- src/consts.rs 32
-rw-r--r-- src/lex.rs 33
-rw-r--r-- src/lib.rs 83
-rw-r--r-- src/main.rs 107
-rw-r--r-- src/parse.rs 200
-rw-r--r-- src/utils.rs 5
11 files changed, 407 insertions, 146 deletions
diff --git a/.gitignore b/.gitignore
index 2d5df85..b0e70b2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
1/target 1/target
2.direnv 2.direnv
3assets
4result
diff --git a/Cargo.lock b/Cargo.lock
index 06c3f88..e49c8e5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5,3 +5,37 @@ version = 3
5[[package]] 5[[package]]
6name = "dict" 6name = "dict"
7version = "0.1.0" 7version = "0.1.0"
8dependencies = [
9 "radix_trie",
10]
11
12[[package]]
13name = "endian-type"
14version = "0.1.2"
15source = "registry+https://github.com/rust-lang/crates.io-index"
16checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
17
18[[package]]
19name = "nibble_vec"
20version = "0.1.0"
21source = "registry+https://github.com/rust-lang/crates.io-index"
22checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43"
23dependencies = [
24 "smallvec",
25]
26
27[[package]]
28name = "radix_trie"
29version = "0.2.1"
30source = "registry+https://github.com/rust-lang/crates.io-index"
31checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd"
32dependencies = [
33 "endian-type",
34 "nibble_vec",
35]
36
37[[package]]
38name = "smallvec"
39version = "1.10.0"
40source = "registry+https://github.com/rust-lang/crates.io-index"
41checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
diff --git a/Cargo.toml b/Cargo.toml
index d2c0dc4..044c314 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,3 +6,4 @@ edition = "2021"
6# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 7
8[dependencies] 8[dependencies]
9radix_trie = "0.2.1"
diff --git a/flake.lock b/flake.lock
index f231e8c..36b84eb 100644
--- a/flake.lock
+++ b/flake.lock
@@ -1,5 +1,17 @@
1{ 1{
2 "nodes": { 2 "nodes": {
3 "en": {
4 "flake": false,
5 "locked": {
6 "narHash": "sha256-iSqDnYvb2UdUAN7VR4NntsEkIqgTgdW8LRDgCAJhFe4=",
7 "type": "file",
8 "url": "https://www.gutenberg.org/cache/epub/29765/pg29765.txt"
9 },
10 "original": {
11 "type": "file",
12 "url": "https://www.gutenberg.org/cache/epub/29765/pg29765.txt"
13 }
14 },
3 "nixpkgs": { 15 "nixpkgs": {
4 "locked": { 16 "locked": {
5 "lastModified": 1677852945, 17 "lastModified": 1677852945,
@@ -15,6 +27,7 @@
15 }, 27 },
16 "root": { 28 "root": {
17 "inputs": { 29 "inputs": {
30 "en": "en",
18 "nixpkgs": "nixpkgs" 31 "nixpkgs": "nixpkgs"
19 } 32 }
20 } 33 }
diff --git a/flake.nix b/flake.nix
index a615408..8a9d489 100644
--- a/flake.nix
+++ b/flake.nix
@@ -1,22 +1,59 @@
1{ 1{
2 description = "A very basic flake"; 2 description = "A very basic flake";
3 3
4 outputs = { self, nixpkgs }: 4 inputs = {
5 en = {
6 url = "https://www.gutenberg.org/cache/epub/29765/pg29765.txt";
7 flake = false;
8 };
9 };
10
11 outputs = { self, en, nixpkgs }:
5 let 12 let
6 supportedSystems = [ "x86_64-linux" ]; 13 supportedSystems = [ "x86_64-linux" ];
7 forAllSystems = nixpkgs.lib.genAttrs supportedSystems; 14 forAllSystems = nixpkgs.lib.genAttrs supportedSystems;
8 nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; }); 15 nixpkgsFor = forAllSystems (system: import nixpkgs {
16 inherit system;
17 overlays = [ self.overlays.default ];
18 });
9 in 19 in
10 { 20 {
11 21
22 overlays.default = final: prev: {
23 dict =
24 let
25 pname = "dict";
26 packageMeta = (final.lib.importTOML ./Cargo.toml).package;
27 in
28 final.rustPlatform.buildRustPackage {
29 inherit pname;
30 inherit (packageMeta) version;
31 preUnpack = ''
32 mkdir assets
33 sed "1,27d;973899,$ d" ${en} > assets/en.txt
34 ${final.dos2unix}/bin/dos2unix assets/en.txt
35 ls -alh assets/en.txt
36 '';
37 src = self;
38 cargoLock.lockFile = ./Cargo.lock;
39 };
40 };
41
42 packages = forAllSystems (system: {
43 inherit (nixpkgsFor."${system}") dict;
44 });
45
46 defaultPackage = forAllSystems (system: self.packages."${system}".dict);
47
12 devShell = forAllSystems (system: 48 devShell = forAllSystems (system:
13 let 49 let
14 pkgs = nixpkgsFor."${system}"; 50 pkgs = nixpkgsFor."${ system}";
15 in 51 in
16 pkgs.mkShell { 52 pkgs.mkShell {
17 nativeBuildInputs = [ 53 nativeBuildInputs = [
18 pkgs.rustc 54 pkgs.rustc
19 pkgs.cargo 55 pkgs.cargo
56 pkgs.rustfmt
20 pkgs.rust-analyzer 57 pkgs.rust-analyzer
21 pkgs.cargo-watch 58 pkgs.cargo-watch
22 ]; 59 ];
diff --git a/src/consts.rs b/src/consts.rs
index 446c341..c606a95 100644
--- a/src/consts.rs
+++ b/src/consts.rs
@@ -1,31 +1 @@
1pub const SRC: &str = include_str!("../assets/en.txt"); pub const SRC: &str = include_str!("../../assets/en.txt");
2// pub const SRC: &str = r"A
3// A (named a in the English, and most commonly ä in other languages).
4//
5// Defn: The first letter of the English and of many other alphabets.
6// The capital A of the alphabets of Middle and Western Europe, as also
7// the small letter (a), besides the forms in Italic, black letter,
8// etc., are all descended from the old Latin A, which was borrowed from
9// the Greek Alpha, of the same form; and this was made from the first
10// letter (Aleph, and itself from the Egyptian origin. The Aleph was a
11// consonant letter, with a guttural breath sound that was not an
12// element of Greek articulation; and the Greeks took it to represent
13// their vowel Alpha with the ä sound, the Phoenician alphabet having no
14// vowel symbols. This letter, in English, is used for several different
15// vowel sounds. See Guide to pronunciation, §§ 43-74. The regular long
16// a, as in fate, etc., is a comparatively modern sound, and has taken
17// the place of what, till about the early part of the 17th century, was
18// a sound of the quality of ä (as in far).
19//
20// 2. (Mus.)
21//
22// Defn: The name of the sixth tone in the model major scale (that in
23// C), or the first tone of the minor scale, which is named after it the
24// scale in A minor. The second string of the violin is tuned to the A
25// in the treble staff.
26// -- A sharp (A#) is the name of a musical tone intermediate between A
27// and B.
28// -- A flat (A) is the name of a tone intermediate between A and G.";
29//
30//
31//
diff --git a/src/lex.rs b/src/lex.rs
index 0f9a535..701009a 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -1,5 +1,6 @@
1use crate::utils::FromStaticStr; 1use crate::utils::FromStaticStr;
2 2
3#[derive(Debug)]
3pub enum Stanza { 4pub enum Stanza {
4 Entry(&'static str), 5 Entry(&'static str),
5 Defn(&'static str), 6 Defn(&'static str),
@@ -11,7 +12,8 @@ pub enum Stanza {
11 12
12impl Stanza { 13impl Stanza {
13 fn is_entry(s: &str) -> bool { 14 fn is_entry(s: &str) -> bool {
14 s.chars().all(|c| c.is_uppercase() || c.is_ascii_whitespace() || "-;'.".contains(c)) 15 s.chars()
16 .all(|c| c.is_uppercase() || c.is_ascii_whitespace() || "-;'.".contains(c))
15 } 17 }
16 18
17 fn is_defn(s: &str) -> bool { 19 fn is_defn(s: &str) -> bool {
@@ -27,7 +29,9 @@ impl Stanza {
27 } 29 }
28 30
29 fn is_bullet(s: &str) -> bool { 31 fn is_bullet(s: &str) -> bool {
30 s.find('.').map(|idx| s[..idx].chars().all(char::is_numeric)).unwrap_or_default() 32 s.find('.')
33 .map(|idx| s[..idx].chars().all(char::is_numeric))
34 .unwrap_or_default()
31 } 35 }
32 36
33 fn is_sub_bullet(s: &str) -> bool { 37 fn is_sub_bullet(s: &str) -> bool {
@@ -49,21 +53,23 @@ impl FromStaticStr for Stanza {
49 if let Some(first_line) = lines.next() { 53 if let Some(first_line) = lines.next() {
50 if !first_line.is_empty() { 54 if !first_line.is_empty() {
51 if Stanza::is_entry(first_line) { 55 if Stanza::is_entry(first_line) {
52 Ok(Self::Entry(s)) 56 Ok(Self::Entry(first_line.trim()))
53 } else if Stanza::is_defn(first_line) { 57 } else if Stanza::is_defn(first_line) {
54 Ok(Self::Defn(s)) 58 Ok(Self::Defn(s.strip_prefix("Defn: ").unwrap_or(s).trim()))
55 } else if Stanza::is_note(first_line) { 59 } else if Stanza::is_note(first_line) {
56 Ok(Self::Note(s)) 60 Ok(Self::Note(s.strip_prefix("Note: ").unwrap_or(s).trim()))
57 } else if Stanza::is_synonym(first_line) { 61 } else if Stanza::is_synonym(first_line) {
58 Ok(Self::Synonym(s)) 62 Ok(Self::Synonym(s.strip_prefix("Syn.").unwrap_or(s)))
59 } else if Stanza::is_bullet(first_line) { 63 } else if Stanza::is_bullet(first_line) {
60 Ok(Self::Bullet(s)) 64 Ok(Self::Defn(
61 } else if Stanza::is_sub_bullet(first_line) { 65 s.trim_start_matches(|c| "0123456789. ".contains(c)),
62 Ok(Self::SubBullet(s)) 66 ))
67 // } else if Stanza::is_sub_bullet(first_line) {
68 // Ok(Self::SubBullet(s))
63 } else { 69 } else {
64 Err(Self::Err { 70 Err(Self::Err {
65 data: format!("weird stanza: {}", s), 71 data: format!("weird stanza: {}", s),
66 }) 72 })
67 } 73 }
68 } else { 74 } else {
69 Err(Self::Err { 75 Err(Self::Err {
@@ -78,3 +84,6 @@ impl FromStaticStr for Stanza {
78 } 84 }
79} 85}
80 86
87pub fn lex(src: &'static str) -> impl Iterator<Item = Result<Stanza, StanzaLexError>> {
88 src.split("\n\n").map(Stanza::from_str)
89}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..1324f70
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,83 @@
1pub mod consts;
2pub mod lex;
3pub mod parse;
4mod utils;
5
6use std::fmt;
7
8use radix_trie::{Trie, TrieCommon};
9
10pub struct Dict {
11 inner: Trie<DictKey, DictValue>,
12}
13
14impl Dict {
15 fn new() -> Self {
16 Self { inner: Trie::new() }
17 }
18
19 fn insert(&mut self, entry: DictKey, value: DictValue) {
20 self.inner.map_with_default(
21 entry,
22 |dict_value| {
23 // TODO: this only merges defns, not notes/syns
24 for v in value.defn.iter() {
25 dict_value.defn.push(v);
26 }
27 },
28 value.clone(),
29 );
30 }
31
32 pub fn search<'dict, 'search>(
33 &'dict self,
34 search_term: &'search str,
35 ) -> SearchResults<'dict, 'search> {
36 self.inner
37 .subtrie(search_term)
38 .map_or(SearchResults::Empty, |subtrie| {
39 SearchResults::Hit(subtrie.iter())
40 })
41 }
42}
43
44pub enum SearchResults<'dict, 'search> {
45 Empty,
46 Hit(radix_trie::iter::Iter<'dict, &'search str, DictValue>),
47}
48
49impl<'dict, 'search> SearchResults<'dict, 'search> {
50 // mutable ref here to advance the iterator present in Self::Hit
51 pub fn print(&mut self) {
52 match self {
53 Self::Hit(results) => {
54 while let Some((key, value)) = results.next() {
55 if value.defn.len() > 1 {
56 for (def, idx) in value.defn.iter().zip(1..) {
57 println!("{}({}) {}", key, idx, def.replace('\n', " "));
58 }
59 } else {
60 println!("{} {}", key, value.defn[0].replace('\n', " "));
61 }
62
63 // if let Some(note) = value.note {
64 // print!("\t{}", note);
65 // }
66 // if let Some(synonym) = value.synonym {
67 // print!("\t{}", synonym);
68 // }
69 }
70 }
71 Self::Empty => (),
72 }
73 }
74}
75
/// Key type of the dictionary: the name of an entry.
type DictKey = &'static str;

/// Everything stored in the dictionary for one entry name.
#[derive(Clone)]
pub struct DictValue {
    /// Definition texts recorded for the entry.
    defn: Vec<&'static str>,
    /// Optional note attached to the entry.
    note: Option<&'static str>,
    /// Optional synonym text attached to the entry.
    synonym: Option<&'static str>,
}
diff --git a/src/main.rs b/src/main.rs
index e6d997d..9def90c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,101 +1,14 @@
1mod consts; 1use dict::{consts::SRC, lex, parse::ParseState};
2mod utils;
3mod lex;
4
5use consts::SRC;
6use lex::{Stanza, StanzaLexError};
7use utils::FromStaticStr;
8 2
9fn main() { 3fn main() {
10 let mut count = 0; 4 let Some(search_term) = std::env::args().skip(1).next() else {
11 let mut parse_state = ParseState::Ready; 5 eprintln!("usage: dict <search-term>");
12 let mut current_entry = EntryBuilder::new(); 6 return;
13 let mut dict = Dictionary {
14 entries: vec![],
15 }; 7 };
16 8 lex::lex(SRC)
17 for l in SRC.split("\n\n") { 9 .filter_map(Result::ok)
18 count += 1; 10 .fold(ParseState::new(), ParseState::advance)
19 let stanza = match Stanza::from_str(l) { 11 .finish()
20 Ok(s) => { 12 .search(search_term.to_ascii_uppercase().as_str())
21 println!("{count} ok"); 13 .print()
22 s
23 },
24 Err(StanzaLexError { data }) => {
25 eprintln!("stanza err: {data}\n\n");
26 continue;
27 },
28 };
29 match stanza {
30 Stanza::Entry(s) if parse_state == ParseState::Ready => {
31 current_entry.set_name(s);
32 parse_state = ParseState::InEntry;
33 }
34 Stanza::Defn(d) if parse_state == ParseState::InEntry => {
35 current_entry.set_defn(d);
36
37 match current_entry.build() {
38 Ok(e) => dict.entries.push(e),
39 Err(_) => eprintln!("failed to build entry"),
40 }
41
42 parse_state = ParseState::Ready;
43 }
44 _ => ()
45 }
46 }
47 dbg!(dict.entries.iter().find(|entry| entry.name.to_ascii_lowercase().starts_with("discursive")));
48}
49
50#[derive(PartialEq, Eq, PartialOrd, Ord)]
51enum ParseState {
52 Ready,
53 InEntry
54}
55
56struct Dictionary {
57 entries: Vec<Entry>
58}
59
60#[derive(Debug)]
61struct Entry {
62 name: &'static str,
63 defn: Option<&'static str>,
64 note: Option<&'static str>,
65 synonym: Option<&'static str>,
66}
67
68#[derive(Default)]
69struct EntryBuilder {
70 name: Option<&'static str>,
71 defn: Option<&'static str>,
72 note: Option<&'static str>,
73 synonym: Option<&'static str>,
74}
75
76enum EntryBuilderError {
77 MissingField(&'static str)
78}
79
80impl EntryBuilder {
81 fn new() -> Self {
82 Self::default()
83 }
84
85 fn set_name(&mut self, name: &'static str) {
86 self.name = Some(name);
87 }
88
89 fn set_defn(&mut self, defn: &'static str) {
90 self.defn = Some(defn);
91 }
92
93 fn build(&self) -> Result<Entry, EntryBuilderError> {
94 Ok(Entry {
95 name: self.name.ok_or(EntryBuilderError::MissingField("name"))?,
96 defn: self.defn,
97 note: self.note,
98 synonym: self.synonym,
99 })
100 }
101} 14}
diff --git a/src/parse.rs b/src/parse.rs
new file mode 100644
index 0000000..5b613ca
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,200 @@
1use crate::{lex::Stanza, Dict};
2
3pub struct ParseState {
4 dict: Dict,
5 status: Status,
6 current_entry: EntryBuilder,
7 errors: Vec<ParseError>,
8}
9
10#[derive(Debug)]
11enum ParseError {
12 Build(EntryBuilderError),
13 UndefinedState(Status, EntryBuilder, Stanza),
14}
15
16impl ParseState {
17 pub fn new() -> Self {
18 Self {
19 dict: Dict::new(),
20 status: Status::Start,
21 current_entry: EntryBuilder::new(),
22 errors: Vec::new(),
23 }
24 }
25
26 pub fn advance(mut self, stanza: Stanza) -> Self {
27 match (self.status, stanza) {
28 (Status::Start, Stanza::Entry(e)) => {
29 self.current_entry.set_name(e);
30 self.status = Status::ContainsName;
31 }
32 (Status::ContainsName, Stanza::Defn(d)) => {
33 self.current_entry.push_defn(d);
34 self.status = Status::ContainsOneDefn;
35 }
36 (Status::ContainsOneDefn | Status::ContainsMulDefn, Stanza::Defn(d)) => {
37 self.current_entry.push_defn(d);
38 self.status = Status::ContainsMulDefn;
39 }
40 (
41 Status::ContainsOneDefn | Status::ContainsMulDefn | Status::ContainsSynonym,
42 Stanza::Note(n),
43 ) => {
44 self.current_entry.set_note(n);
45 self.status = Status::ContainsNote;
46 }
47 (
48 Status::ContainsOneDefn | Status::ContainsMulDefn | Status::ContainsNote,
49 Stanza::Synonym(s),
50 ) => {
51 self.current_entry.set_synonym(s);
52 self.status = Status::ContainsSynonym;
53 }
54 (
55 Status::ContainsOneDefn
56 | Status::ContainsMulDefn
57 | Status::ContainsNote
58 | Status::ContainsSynonym,
59 Stanza::Entry(e),
60 ) => {
61 // flush the current entry
62 match self.current_entry.build() {
63 Ok(entry) => self.dict.insert(entry.name, entry.into()),
64 Err(b) => self.register_error(ParseError::Build(b)),
65 };
66
67 // begin with the new one
68 self.current_entry.clear();
69 self.current_entry.set_name(e);
70 self.status = Status::ContainsName;
71 }
72 (Status::ContainsName, Stanza::Entry(e)) => {
73 // dump unfinished entry and enter new entry
74 self.current_entry.clear();
75 self.current_entry.set_name(e);
76 self.status = Status::ContainsName;
77 }
78 (_, new_entry) => {
79 // any other states means our parser is entering undefined teritorry
80 // register an error if we have anything in current_entry
81 self.register_undefined_state_error(new_entry);
82 // and set the status to Start and fast forward to the next entry
83 self.current_entry.clear();
84 self.status = Status::Start;
85 }
86 }
87 self
88 }
89
90 fn register_error(&mut self, error: ParseError) {
91 self.errors.push(error)
92 }
93
94 fn register_undefined_state_error(&mut self, new_entry: Stanza) {
95 self.register_error(ParseError::UndefinedState(
96 self.status,
97 self.current_entry.clone(),
98 new_entry,
99 ));
100 }
101
102 pub fn finish(self) -> Dict {
103 self.dict
104 }
105
106 pub fn dump(&self) {
107 for err in &self.errors {
108 eprintln!("{err:?}");
109 }
110 }
111}
112
/// Position of the parser within the entry it is currently assembling.
///
/// There is no dedicated "mangled" state: a stanza that the current status
/// does not accept sends the parser back to `Start`, where it waits for the
/// next entry heading.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
enum Status {
    /// No entry in progress; ready to accept a new entry heading.
    Start,
    /// A heading has been read; ready to accept its first definition.
    ContainsName,
    /// One definition read; accepts further definitions, a note or a synonym,
    /// or a new heading that flushes this entry.
    ContainsOneDefn,
    /// Several definitions read; accepts the same stanzas as `ContainsOneDefn`.
    ContainsMulDefn,
    /// A note has been read; accepts a synonym, or a new heading that flushes
    /// this entry.
    ContainsNote,
    /// A synonym has been read; accepts a note, or a new heading that flushes
    /// this entry.
    ContainsSynonym,
}
129
/// A fully assembled dictionary entry, as produced by `EntryBuilder::build`.
#[derive(Debug, Clone)]
struct Entry {
    /// Name of the entry; used as the dictionary key.
    name: &'static str,
    /// Definition texts of the entry.
    defn: Vec<&'static str>,
    /// Optional note attached to the entry.
    note: Option<&'static str>,
    /// Optional synonym text attached to the entry.
    synonym: Option<&'static str>,
}
137
138impl From<Entry> for crate::DictValue {
139 fn from(entry: Entry) -> Self {
140 Self {
141 defn: entry.defn,
142 note: entry.note,
143 synonym: entry.synonym,
144 }
145 }
146}
147
148#[derive(Debug, Default, Clone)]
149struct EntryBuilder {
150 name: Option<&'static str>,
151 defn: Vec<&'static str>,
152 note: Option<&'static str>,
153 synonym: Option<&'static str>,
154}
155
156#[derive(Debug)]
157enum EntryBuilderError {
158 MissingField(&'static str),
159}
160
161impl EntryBuilder {
162 fn new() -> Self {
163 Self::default()
164 }
165
166 fn clear(&mut self) {
167 *self = Self::default();
168 }
169
170 fn set_name(&mut self, name: &'static str) {
171 self.name = Some(name);
172 }
173
174 fn push_defn(&mut self, defn: &'static str) {
175 self.defn.push(defn);
176 }
177
178 fn set_note(&mut self, note: &'static str) {
179 self.note = Some(note);
180 }
181
182 fn set_synonym(&mut self, synonym: &'static str) {
183 self.synonym = Some(synonym);
184 }
185
186 fn build(&self) -> Result<Entry, EntryBuilderError> {
187 let name = self.name.ok_or(EntryBuilderError::MissingField("name"))?;
188 let defn = if self.defn.is_empty() {
189 return Err(EntryBuilderError::MissingField("defn"));
190 } else {
191 self.defn.clone()
192 };
193 Ok(Entry {
194 name,
195 defn,
196 note: self.note,
197 synonym: self.synonym,
198 })
199 }
200}
diff --git a/src/utils.rs b/src/utils.rs
index 23fff7e..c53e564 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,7 +1,6 @@
1
2pub trait FromStaticStr { 1pub trait FromStaticStr {
3 type Err; 2 type Err;
4 fn from_str(s: &'static str) -> Result<Self, Self::Err> 3 fn from_str(s: &'static str) -> Result<Self, Self::Err>
5 where Self: Sized; 4 where
5 Self: Sized;
6} 6}
7