diff options
| author | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-05 23:07:26 +0200 |
|---|---|---|
| committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-05 23:07:26 +0200 |
| commit | 4d577650f737daaeb477bbbd5ae2bad4f1121c38 (patch) | |
| tree | ac973541e0a2d7751af4ece5f7f639e739f81fcc /src/dictionary.rs | |
first sketch of indexer
Diffstat (limited to 'src/dictionary.rs')
| -rw-r--r-- | src/dictionary.rs | 69 |
1 files changed, 69 insertions, 0 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs new file mode 100644 index 0000000..a8d9b28 --- /dev/null +++ b/src/dictionary.rs @@ -0,0 +1,69 @@ +use std::collections::HashMap; +use crate::vector::FileVector; + +pub struct Dictionary { + last_index : usize, + data : HashMap<String, u64>, +} + + +impl Dictionary { + pub fn new() -> Self { + Self { last_index : 0, data : HashMap::new() } + } + + pub fn from_line(line : &str) -> Self { + let mut data : HashMap<String, u64> = HashMap::new(); + let mut i : usize = 0; + + for word in line.split(',') { + data.insert(word.to_string(), i as u64); + i += 1; + } + + Self { last_index : i - 1, data } + } + + pub fn set(&mut self, name : String) { + if !self.data.contains_key(&name) { + self.last_index += 1; + self.data.insert(name, self.last_index as u64); + } + } + + pub fn get(&self, name : String) -> u64 { + *self.data.get(&name).unwrap() + } + + pub fn iter(&self) -> &HashMap<String, u64> { + &self.data + } + + pub fn to_list(&self) -> Vec<String> { + let mut v = Vec::with_capacity(self.last_index + 1); + + v.resize(self.last_index + 1, "".to_string()); + + for (word, id) in self.iter() { + v[(*id) as usize] = word.clone(); + } + + v + } + + pub fn vectorize_word_list(&self, words : Vec<String>) -> FileVector { + let mut fv = FileVector::new(); + + for word in words { + let i = self.get(word); + if !fv.contains_key(&i) { + fv.insert(i, 1); + } else { + let c : u64 = *fv.get(&i).unwrap(); + fv.insert(i, c + 1); + } + } + + fv + } +} |