diff options
author | Akshay <[email protected]> | 2023-04-11 18:10:24 +0100 |
---|---|---|
committer | Akshay <[email protected]> | 2023-04-11 18:10:24 +0100 |
commit | 9ccdaee79cdbcc76c8b855a0d50b5a5309ba8bb8 (patch) | |
tree | a6e97f9c0a0af6f08669b9596b05f4df13259bbc /src | |
parent | d315cce8e99ec6d96695bea708ae315028f3db66 (diff) |
add build steps
Diffstat (limited to 'src')
-rw-r--r-- | src/consts.rs | 32 | ||||
-rw-r--r-- | src/lex.rs | 33 | ||||
-rw-r--r-- | src/lib.rs | 83 | ||||
-rw-r--r-- | src/main.rs | 107 | ||||
-rw-r--r-- | src/parse.rs | 200 | ||||
-rw-r--r-- | src/utils.rs | 5 |
6 files changed, 317 insertions, 143 deletions
diff --git a/src/consts.rs b/src/consts.rs index 446c341..c606a95 100644 --- a/src/consts.rs +++ b/src/consts.rs | |||
@@ -1,31 +1 @@ | |||
1 | pub const SRC: &str = include_str!("../assets/en.txt"); | pub const SRC: &str = include_str!("../../assets/en.txt"); | |
2 | // pub const SRC: &str = r"A | ||
3 | // A (named a in the English, and most commonly ä in other languages). | ||
4 | // | ||
5 | // Defn: The first letter of the English and of many other alphabets. | ||
6 | // The capital A of the alphabets of Middle and Western Europe, as also | ||
7 | // the small letter (a), besides the forms in Italic, black letter, | ||
8 | // etc., are all descended from the old Latin A, which was borrowed from | ||
9 | // the Greek Alpha, of the same form; and this was made from the first | ||
10 | // letter (Aleph, and itself from the Egyptian origin. The Aleph was a | ||
11 | // consonant letter, with a guttural breath sound that was not an | ||
12 | // element of Greek articulation; and the Greeks took it to represent | ||
13 | // their vowel Alpha with the ä sound, the Phoenician alphabet having no | ||
14 | // vowel symbols. This letter, in English, is used for several different | ||
15 | // vowel sounds. See Guide to pronunciation, §§ 43-74. The regular long | ||
16 | // a, as in fate, etc., is a comparatively modern sound, and has taken | ||
17 | // the place of what, till about the early part of the 17th century, was | ||
18 | // a sound of the quality of ä (as in far). | ||
19 | // | ||
20 | // 2. (Mus.) | ||
21 | // | ||
22 | // Defn: The name of the sixth tone in the model major scale (that in | ||
23 | // C), or the first tone of the minor scale, which is named after it the | ||
24 | // scale in A minor. The second string of the violin is tuned to the A | ||
25 | // in the treble staff. | ||
26 | // -- A sharp (A#) is the name of a musical tone intermediate between A | ||
27 | // and B. | ||
28 | // -- A flat (A) is the name of a tone intermediate between A and G."; | ||
29 | // | ||
30 | // | ||
31 | // | ||
@@ -1,5 +1,6 @@ | |||
1 | use crate::utils::FromStaticStr; | 1 | use crate::utils::FromStaticStr; |
2 | 2 | ||
3 | #[derive(Debug)] | ||
3 | pub enum Stanza { | 4 | pub enum Stanza { |
4 | Entry(&'static str), | 5 | Entry(&'static str), |
5 | Defn(&'static str), | 6 | Defn(&'static str), |
@@ -11,7 +12,8 @@ pub enum Stanza { | |||
11 | 12 | ||
12 | impl Stanza { | 13 | impl Stanza { |
13 | fn is_entry(s: &str) -> bool { | 14 | fn is_entry(s: &str) -> bool { |
14 | s.chars().all(|c| c.is_uppercase() || c.is_ascii_whitespace() || "-;'.".contains(c)) | 15 | s.chars() |
16 | .all(|c| c.is_uppercase() || c.is_ascii_whitespace() || "-;'.".contains(c)) | ||
15 | } | 17 | } |
16 | 18 | ||
17 | fn is_defn(s: &str) -> bool { | 19 | fn is_defn(s: &str) -> bool { |
@@ -27,7 +29,9 @@ impl Stanza { | |||
27 | } | 29 | } |
28 | 30 | ||
29 | fn is_bullet(s: &str) -> bool { | 31 | fn is_bullet(s: &str) -> bool { |
30 | s.find('.').map(|idx| s[..idx].chars().all(char::is_numeric)).unwrap_or_default() | 32 | s.find('.') |
33 | .map(|idx| s[..idx].chars().all(char::is_numeric)) | ||
34 | .unwrap_or_default() | ||
31 | } | 35 | } |
32 | 36 | ||
33 | fn is_sub_bullet(s: &str) -> bool { | 37 | fn is_sub_bullet(s: &str) -> bool { |
@@ -49,21 +53,23 @@ impl FromStaticStr for Stanza { | |||
49 | if let Some(first_line) = lines.next() { | 53 | if let Some(first_line) = lines.next() { |
50 | if !first_line.is_empty() { | 54 | if !first_line.is_empty() { |
51 | if Stanza::is_entry(first_line) { | 55 | if Stanza::is_entry(first_line) { |
52 | Ok(Self::Entry(s)) | 56 | Ok(Self::Entry(first_line.trim())) |
53 | } else if Stanza::is_defn(first_line) { | 57 | } else if Stanza::is_defn(first_line) { |
54 | Ok(Self::Defn(s)) | 58 | Ok(Self::Defn(s.strip_prefix("Defn: ").unwrap_or(s).trim())) |
55 | } else if Stanza::is_note(first_line) { | 59 | } else if Stanza::is_note(first_line) { |
56 | Ok(Self::Note(s)) | 60 | Ok(Self::Note(s.strip_prefix("Note: ").unwrap_or(s).trim())) |
57 | } else if Stanza::is_synonym(first_line) { | 61 | } else if Stanza::is_synonym(first_line) { |
58 | Ok(Self::Synonym(s)) | 62 | Ok(Self::Synonym(s.strip_prefix("Syn.").unwrap_or(s))) |
59 | } else if Stanza::is_bullet(first_line) { | 63 | } else if Stanza::is_bullet(first_line) { |
60 | Ok(Self::Bullet(s)) | 64 | Ok(Self::Defn( |
61 | } else if Stanza::is_sub_bullet(first_line) { | 65 | s.trim_start_matches(|c| "0123456789. ".contains(c)), |
62 | Ok(Self::SubBullet(s)) | 66 | )) |
67 | // } else if Stanza::is_sub_bullet(first_line) { | ||
68 | // Ok(Self::SubBullet(s)) | ||
63 | } else { | 69 | } else { |
64 | Err(Self::Err { | 70 | Err(Self::Err { |
65 | data: format!("weird stanza: {}", s), | 71 | data: format!("weird stanza: {}", s), |
66 | }) | 72 | }) |
67 | } | 73 | } |
68 | } else { | 74 | } else { |
69 | Err(Self::Err { | 75 | Err(Self::Err { |
@@ -78,3 +84,6 @@ impl FromStaticStr for Stanza { | |||
78 | } | 84 | } |
79 | } | 85 | } |
80 | 86 | ||
87 | pub fn lex(src: &'static str) -> impl Iterator<Item = Result<Stanza, StanzaLexError>> { | ||
88 | src.split("\n\n").map(Stanza::from_str) | ||
89 | } | ||
diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..1324f70 --- /dev/null +++ b/src/lib.rs | |||
@@ -0,0 +1,83 @@ | |||
1 | pub mod consts; | ||
2 | pub mod lex; | ||
3 | pub mod parse; | ||
4 | mod utils; | ||
5 | |||
6 | use std::fmt; | ||
7 | |||
8 | use radix_trie::{Trie, TrieCommon}; | ||
9 | |||
10 | pub struct Dict { | ||
11 | inner: Trie<DictKey, DictValue>, | ||
12 | } | ||
13 | |||
14 | impl Dict { | ||
15 | fn new() -> Self { | ||
16 | Self { inner: Trie::new() } | ||
17 | } | ||
18 | |||
19 | fn insert(&mut self, entry: DictKey, value: DictValue) { | ||
20 | self.inner.map_with_default( | ||
21 | entry, | ||
22 | |dict_value| { | ||
23 | // TODO: this only merges defns, not notes/syns | ||
24 | for v in value.defn.iter() { | ||
25 | dict_value.defn.push(v); | ||
26 | } | ||
27 | }, | ||
28 | value.clone(), | ||
29 | ); | ||
30 | } | ||
31 | |||
32 | pub fn search<'dict, 'search>( | ||
33 | &'dict self, | ||
34 | search_term: &'search str, | ||
35 | ) -> SearchResults<'dict, 'search> { | ||
36 | self.inner | ||
37 | .subtrie(search_term) | ||
38 | .map_or(SearchResults::Empty, |subtrie| { | ||
39 | SearchResults::Hit(subtrie.iter()) | ||
40 | }) | ||
41 | } | ||
42 | } | ||
43 | |||
44 | pub enum SearchResults<'dict, 'search> { | ||
45 | Empty, | ||
46 | Hit(radix_trie::iter::Iter<'dict, &'search str, DictValue>), | ||
47 | } | ||
48 | |||
49 | impl<'dict, 'search> SearchResults<'dict, 'search> { | ||
50 | // mutable ref here to advance the iterator present in Self::Hit | ||
51 | pub fn print(&mut self) { | ||
52 | match self { | ||
53 | Self::Hit(results) => { | ||
54 | while let Some((key, value)) = results.next() { | ||
55 | if value.defn.len() > 1 { | ||
56 | for (def, idx) in value.defn.iter().zip(1..) { | ||
57 | println!("{}({}) {}", key, idx, def.replace('\n', " ")); | ||
58 | } | ||
59 | } else { | ||
60 | println!("{} {}", key, value.defn[0].replace('\n', " ")); | ||
61 | } | ||
62 | |||
63 | // if let Some(note) = value.note { | ||
64 | // print!("\t{}", note); | ||
65 | // } | ||
66 | // if let Some(synonym) = value.synonym { | ||
67 | // print!("\t{}", synonym); | ||
68 | // } | ||
69 | } | ||
70 | } | ||
71 | Self::Empty => (), | ||
72 | } | ||
73 | } | ||
74 | } | ||
75 | |||
76 | type DictKey = &'static str; | ||
77 | |||
78 | #[derive(Clone)] | ||
79 | pub struct DictValue { | ||
80 | defn: Vec<&'static str>, | ||
81 | note: Option<&'static str>, | ||
82 | synonym: Option<&'static str>, | ||
83 | } | ||
diff --git a/src/main.rs b/src/main.rs index e6d997d..9def90c 100644 --- a/src/main.rs +++ b/src/main.rs | |||
@@ -1,101 +1,14 @@ | |||
1 | mod consts; | 1 | use dict::{consts::SRC, lex, parse::ParseState}; |
2 | mod utils; | ||
3 | mod lex; | ||
4 | |||
5 | use consts::SRC; | ||
6 | use lex::{Stanza, StanzaLexError}; | ||
7 | use utils::FromStaticStr; | ||
8 | 2 | ||
9 | fn main() { | 3 | fn main() { |
10 | let mut count = 0; | 4 | let Some(search_term) = std::env::args().skip(1).next() else { |
11 | let mut parse_state = ParseState::Ready; | 5 | eprintln!("usage: dict <search-term>"); |
12 | let mut current_entry = EntryBuilder::new(); | 6 | return; |
13 | let mut dict = Dictionary { | ||
14 | entries: vec![], | ||
15 | }; | 7 | }; |
16 | 8 | lex::lex(SRC) | |
17 | for l in SRC.split("\n\n") { | 9 | .filter_map(Result::ok) |
18 | count += 1; | 10 | .fold(ParseState::new(), ParseState::advance) |
19 | let stanza = match Stanza::from_str(l) { | 11 | .finish() |
20 | Ok(s) => { | 12 | .search(search_term.to_ascii_uppercase().as_str()) |
21 | println!("{count} ok"); | 13 | .print() |
22 | s | ||
23 | }, | ||
24 | Err(StanzaLexError { data }) => { | ||
25 | eprintln!("stanza err: {data}\n\n"); | ||
26 | continue; | ||
27 | }, | ||
28 | }; | ||
29 | match stanza { | ||
30 | Stanza::Entry(s) if parse_state == ParseState::Ready => { | ||
31 | current_entry.set_name(s); | ||
32 | parse_state = ParseState::InEntry; | ||
33 | } | ||
34 | Stanza::Defn(d) if parse_state == ParseState::InEntry => { | ||
35 | current_entry.set_defn(d); | ||
36 | |||
37 | match current_entry.build() { | ||
38 | Ok(e) => dict.entries.push(e), | ||
39 | Err(_) => eprintln!("failed to build entry"), | ||
40 | } | ||
41 | |||
42 | parse_state = ParseState::Ready; | ||
43 | } | ||
44 | _ => () | ||
45 | } | ||
46 | } | ||
47 | dbg!(dict.entries.iter().find(|entry| entry.name.to_ascii_lowercase().starts_with("discursive"))); | ||
48 | } | ||
49 | |||
50 | #[derive(PartialEq, Eq, PartialOrd, Ord)] | ||
51 | enum ParseState { | ||
52 | Ready, | ||
53 | InEntry | ||
54 | } | ||
55 | |||
56 | struct Dictionary { | ||
57 | entries: Vec<Entry> | ||
58 | } | ||
59 | |||
60 | #[derive(Debug)] | ||
61 | struct Entry { | ||
62 | name: &'static str, | ||
63 | defn: Option<&'static str>, | ||
64 | note: Option<&'static str>, | ||
65 | synonym: Option<&'static str>, | ||
66 | } | ||
67 | |||
68 | #[derive(Default)] | ||
69 | struct EntryBuilder { | ||
70 | name: Option<&'static str>, | ||
71 | defn: Option<&'static str>, | ||
72 | note: Option<&'static str>, | ||
73 | synonym: Option<&'static str>, | ||
74 | } | ||
75 | |||
76 | enum EntryBuilderError { | ||
77 | MissingField(&'static str) | ||
78 | } | ||
79 | |||
80 | impl EntryBuilder { | ||
81 | fn new() -> Self { | ||
82 | Self::default() | ||
83 | } | ||
84 | |||
85 | fn set_name(&mut self, name: &'static str) { | ||
86 | self.name = Some(name); | ||
87 | } | ||
88 | |||
89 | fn set_defn(&mut self, defn: &'static str) { | ||
90 | self.defn = Some(defn); | ||
91 | } | ||
92 | |||
93 | fn build(&self) -> Result<Entry, EntryBuilderError> { | ||
94 | Ok(Entry { | ||
95 | name: self.name.ok_or(EntryBuilderError::MissingField("name"))?, | ||
96 | defn: self.defn, | ||
97 | note: self.note, | ||
98 | synonym: self.synonym, | ||
99 | }) | ||
100 | } | ||
101 | } | 14 | } |
diff --git a/src/parse.rs b/src/parse.rs new file mode 100644 index 0000000..5b613ca --- /dev/null +++ b/src/parse.rs | |||
@@ -0,0 +1,200 @@ | |||
1 | use crate::{lex::Stanza, Dict}; | ||
2 | |||
3 | pub struct ParseState { | ||
4 | dict: Dict, | ||
5 | status: Status, | ||
6 | current_entry: EntryBuilder, | ||
7 | errors: Vec<ParseError>, | ||
8 | } | ||
9 | |||
10 | #[derive(Debug)] | ||
11 | enum ParseError { | ||
12 | Build(EntryBuilderError), | ||
13 | UndefinedState(Status, EntryBuilder, Stanza), | ||
14 | } | ||
15 | |||
16 | impl ParseState { | ||
17 | pub fn new() -> Self { | ||
18 | Self { | ||
19 | dict: Dict::new(), | ||
20 | status: Status::Start, | ||
21 | current_entry: EntryBuilder::new(), | ||
22 | errors: Vec::new(), | ||
23 | } | ||
24 | } | ||
25 | |||
26 | pub fn advance(mut self, stanza: Stanza) -> Self { | ||
27 | match (self.status, stanza) { | ||
28 | (Status::Start, Stanza::Entry(e)) => { | ||
29 | self.current_entry.set_name(e); | ||
30 | self.status = Status::ContainsName; | ||
31 | } | ||
32 | (Status::ContainsName, Stanza::Defn(d)) => { | ||
33 | self.current_entry.push_defn(d); | ||
34 | self.status = Status::ContainsOneDefn; | ||
35 | } | ||
36 | (Status::ContainsOneDefn | Status::ContainsMulDefn, Stanza::Defn(d)) => { | ||
37 | self.current_entry.push_defn(d); | ||
38 | self.status = Status::ContainsMulDefn; | ||
39 | } | ||
40 | ( | ||
41 | Status::ContainsOneDefn | Status::ContainsMulDefn | Status::ContainsSynonym, | ||
42 | Stanza::Note(n), | ||
43 | ) => { | ||
44 | self.current_entry.set_note(n); | ||
45 | self.status = Status::ContainsNote; | ||
46 | } | ||
47 | ( | ||
48 | Status::ContainsOneDefn | Status::ContainsMulDefn | Status::ContainsNote, | ||
49 | Stanza::Synonym(s), | ||
50 | ) => { | ||
51 | self.current_entry.set_synonym(s); | ||
52 | self.status = Status::ContainsSynonym; | ||
53 | } | ||
54 | ( | ||
55 | Status::ContainsOneDefn | ||
56 | | Status::ContainsMulDefn | ||
57 | | Status::ContainsNote | ||
58 | | Status::ContainsSynonym, | ||
59 | Stanza::Entry(e), | ||
60 | ) => { | ||
61 | // flush the current entry | ||
62 | match self.current_entry.build() { | ||
63 | Ok(entry) => self.dict.insert(entry.name, entry.into()), | ||
64 | Err(b) => self.register_error(ParseError::Build(b)), | ||
65 | }; | ||
66 | |||
67 | // begin with the new one | ||
68 | self.current_entry.clear(); | ||
69 | self.current_entry.set_name(e); | ||
70 | self.status = Status::ContainsName; | ||
71 | } | ||
72 | (Status::ContainsName, Stanza::Entry(e)) => { | ||
73 | // dump unfinished entry and enter new entry | ||
74 | self.current_entry.clear(); | ||
75 | self.current_entry.set_name(e); | ||
76 | self.status = Status::ContainsName; | ||
77 | } | ||
78 | (_, new_entry) => { | ||
79 | // any other state means our parser is entering undefined territory | ||
80 | // register an error if we have anything in current_entry | ||
81 | self.register_undefined_state_error(new_entry); | ||
82 | // and set the status to Start and fast forward to the next entry | ||
83 | self.current_entry.clear(); | ||
84 | self.status = Status::Start; | ||
85 | } | ||
86 | } | ||
87 | self | ||
88 | } | ||
89 | |||
90 | fn register_error(&mut self, error: ParseError) { | ||
91 | self.errors.push(error) | ||
92 | } | ||
93 | |||
94 | fn register_undefined_state_error(&mut self, new_entry: Stanza) { | ||
95 | self.register_error(ParseError::UndefinedState( | ||
96 | self.status, | ||
97 | self.current_entry.clone(), | ||
98 | new_entry, | ||
99 | )); | ||
100 | } | ||
101 | |||
102 | pub fn finish(self) -> Dict { | ||
103 | self.dict | ||
104 | } | ||
105 | |||
106 | pub fn dump(&self) { | ||
107 | for err in &self.errors { | ||
108 | eprintln!("{err:?}"); | ||
109 | } | ||
110 | } | ||
111 | } | ||
112 | |||
113 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] | ||
114 | enum Status { | ||
115 | // ready to accept a new entry | ||
116 | Start, | ||
117 | // ready to accept a defn | ||
118 | ContainsName, | ||
119 | // can accept notes or synonyms, or flush this entry | ||
120 | ContainsOneDefn, | ||
121 | // can accept notes or synonyms, or flush this entry | ||
122 | ContainsMulDefn, | ||
123 | // can accept a synonym | ||
124 | ContainsNote, | ||
125 | // can accept a note | ||
126 | ContainsSynonym, | ||
127 | // mangled stanza, skip until the next entry occurs | ||
128 | } | ||
129 | |||
130 | #[derive(Debug, Clone)] | ||
131 | struct Entry { | ||
132 | name: &'static str, | ||
133 | defn: Vec<&'static str>, | ||
134 | note: Option<&'static str>, | ||
135 | synonym: Option<&'static str>, | ||
136 | } | ||
137 | |||
138 | impl From<Entry> for crate::DictValue { | ||
139 | fn from(entry: Entry) -> Self { | ||
140 | Self { | ||
141 | defn: entry.defn, | ||
142 | note: entry.note, | ||
143 | synonym: entry.synonym, | ||
144 | } | ||
145 | } | ||
146 | } | ||
147 | |||
148 | #[derive(Debug, Default, Clone)] | ||
149 | struct EntryBuilder { | ||
150 | name: Option<&'static str>, | ||
151 | defn: Vec<&'static str>, | ||
152 | note: Option<&'static str>, | ||
153 | synonym: Option<&'static str>, | ||
154 | } | ||
155 | |||
156 | #[derive(Debug)] | ||
157 | enum EntryBuilderError { | ||
158 | MissingField(&'static str), | ||
159 | } | ||
160 | |||
161 | impl EntryBuilder { | ||
162 | fn new() -> Self { | ||
163 | Self::default() | ||
164 | } | ||
165 | |||
166 | fn clear(&mut self) { | ||
167 | *self = Self::default(); | ||
168 | } | ||
169 | |||
170 | fn set_name(&mut self, name: &'static str) { | ||
171 | self.name = Some(name); | ||
172 | } | ||
173 | |||
174 | fn push_defn(&mut self, defn: &'static str) { | ||
175 | self.defn.push(defn); | ||
176 | } | ||
177 | |||
178 | fn set_note(&mut self, note: &'static str) { | ||
179 | self.note = Some(note); | ||
180 | } | ||
181 | |||
182 | fn set_synonym(&mut self, synonym: &'static str) { | ||
183 | self.synonym = Some(synonym); | ||
184 | } | ||
185 | |||
186 | fn build(&self) -> Result<Entry, EntryBuilderError> { | ||
187 | let name = self.name.ok_or(EntryBuilderError::MissingField("name"))?; | ||
188 | let defn = if self.defn.is_empty() { | ||
189 | return Err(EntryBuilderError::MissingField("defn")); | ||
190 | } else { | ||
191 | self.defn.clone() | ||
192 | }; | ||
193 | Ok(Entry { | ||
194 | name, | ||
195 | defn, | ||
196 | note: self.note, | ||
197 | synonym: self.synonym, | ||
198 | }) | ||
199 | } | ||
200 | } | ||
diff --git a/src/utils.rs b/src/utils.rs index 23fff7e..c53e564 100644 --- a/src/utils.rs +++ b/src/utils.rs | |||
@@ -1,7 +1,6 @@ | |||
1 | |||
2 | pub trait FromStaticStr { | 1 | pub trait FromStaticStr { |
3 | type Err; | 2 | type Err; |
4 | fn from_str(s: &'static str) -> Result<Self, Self::Err> | 3 | fn from_str(s: &'static str) -> Result<Self, Self::Err> |
5 | where Self: Sized; | 4 | where |
5 | Self: Sized; | ||
6 | } | 6 | } |
7 | |||