diff options
Diffstat (limited to 'src/vector.rs')
| -rw-r--r-- | src/vector.rs | 67 |
1 files changed, 50 insertions, 17 deletions
diff --git a/src/vector.rs b/src/vector.rs index fa0a139..c85c50a 100644 --- a/src/vector.rs +++ b/src/vector.rs @@ -1,8 +1,11 @@ use std::collections::HashMap; -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; +use std::hash::Hash; use std::ops::{Deref, DerefMut}; use serde::{Deserialize, Serialize}; +use hash32::Hasher; + +pub type Count = u8; +pub type Indexer = u32; /// Represents the content of a cached file. /// It is stored as a HashMap, because we do not @@ -10,11 +13,18 @@ use serde::{Deserialize, Serialize}; /// of storage. #[derive(Default, Clone, Debug, Deserialize, Serialize)] pub struct FileVector { - data : HashMap<u64, u64> + data : Vec<FileVectorEntry> +} + +#[repr(packed)] +#[derive(Default, Clone, Debug, Deserialize, Serialize)] +pub struct FileVectorEntry { + index: u32, + count: Count, } impl Deref for FileVector { - type Target = HashMap<u64, u64>; + type Target = Vec<FileVectorEntry>; fn deref(&self) -> &Self::Target { &self.data } @@ -28,19 +38,32 @@ impl DerefMut for FileVector { impl FileVector { pub fn new() -> Self { - Self { data : HashMap::new() } + Self { data : Vec::new() } + } + + pub fn to_hashmap(&self) -> HashMap<Indexer, Count> { + let mut map = HashMap::new(); + + for e in self.data.iter() { + map.insert(e.index, e.count); + } + + map } pub fn from_words(words: Vec<String>) -> Self { - let mut data = HashMap::new(); + let mut data : HashMap<Indexer, Count> = HashMap::new(); for word in words { - let mut hasher = DefaultHasher::new(); + let mut hasher = hash32::FnvHasher::default(); word.hash(&mut hasher); - let k = hasher.finish(); - match data.entry(k) { + let k = hasher.finish32(); + match data.entry(k as Indexer) { std::collections::hash_map::Entry::Occupied(mut e) => { - e.insert(e.get() + 1); + let i = *e.get(); + if i == Count::MAX { + e.insert((i + 1) as Count); + } } std::collections::hash_map::Entry::Vacant(e) => { e.insert(1); @@ -48,26 +71,36 @@ impl FileVector { } } - Self { data } + FileVector::from_hashmap(data) + } + + pub fn from_hashmap(map : HashMap<Indexer, Count>) -> Self { + Self { data : Vec::from_iter(map.iter().map(|e| { + FileVectorEntry { index: *e.0, count: *e.1 } + }))} } } -pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 { - let mut c = 0; +pub fn scalar_product(a : &FileVector, b : &FileVector) -> u32 { + let a = a.to_hashmap(); + let b = b.to_hashmap(); + let mut c : u32 = 0; for (i, x) in a.iter() { - c += x * (b.get(i).unwrap_or(&0)); + c += (x * (b.get(i).unwrap_or(&0))) as u32; } c } -pub fn match_vector(query : &FileVector, v : &FileVector) -> u64 { - let mut c = 0; +pub fn match_vector(query : &FileVector, v : &FileVector) -> u32 { + let query = query.to_hashmap(); + let v = v.to_hashmap(); + let mut c : u32 = 0; for (i, x) in query.iter() { let s = x * (v.get(i).unwrap_or(&0)); if s == 0 { return 0 } else { - c += s; + c += s as u32; } } c |