From 4d577650f737daaeb477bbbd5ae2bad4f1121c38 Mon Sep 17 00:00:00 2001 From: Nathan Reiner Date: Wed, 5 Jul 2023 23:07:26 +0200 Subject: first sketch of indexer --- src/dictionary.rs | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/dictionary.rs (limited to 'src/dictionary.rs') diff --git a/src/dictionary.rs b/src/dictionary.rs new file mode 100644 index 0000000..a8d9b28 --- /dev/null +++ b/src/dictionary.rs @@ -0,0 +1,69 @@ +use std::collections::HashMap; +use crate::vector::FileVector; + +pub struct Dictionary { + last_index : usize, + data : HashMap, +} + + +impl Dictionary { + pub fn new() -> Self { + Self { last_index : 0, data : HashMap::new() } + } + + pub fn from_line(line : &str) -> Self { + let mut data : HashMap = HashMap::new(); + let mut i : usize = 0; + + for word in line.split(',') { + data.insert(word.to_string(), i as u64); + i += 1; + } + + Self { last_index : i - 1, data } + } + + pub fn set(&mut self, name : String) { + if !self.data.contains_key(&name) { + self.last_index += 1; + self.data.insert(name, self.last_index as u64); + } + } + + pub fn get(&self, name : String) -> u64 { + *self.data.get(&name).unwrap() + } + + pub fn iter(&self) -> &HashMap { + &self.data + } + + pub fn to_list(&self) -> Vec { + let mut v = Vec::with_capacity(self.last_index + 1); + + v.resize(self.last_index + 1, "".to_string()); + + for (word, id) in self.iter() { + v[(*id) as usize] = word.clone(); + } + + v + } + + pub fn vectorize_word_list(&self, words : Vec) -> FileVector { + let mut fv = FileVector::new(); + + for word in words { + let i = self.get(word); + if !fv.contains_key(&i) { + fv.insert(i, 1); + } else { + let c : u64 = *fv.get(&i).unwrap(); + fv.insert(i, c + 1); + } + } + + fv + } +} -- cgit v1.2.3-70-g09d2