use std::collections::HashMap; use std::hash::Hash; use std::ops::{Deref, DerefMut}; use serde::{Deserialize, Serialize}; use hash32::Hasher; pub type Count = u8; pub type Indexer = u32; /// Represents the content of a cached file. /// It is stored as a HashMap, because we do not /// have to store the zeros. With that we save a lot /// of storage. #[derive(Default, Clone, Debug, Deserialize, Serialize)] pub struct FileVector { data : Vec, cache : Option> } #[repr(packed)] #[derive(Default, Clone, Debug, Deserialize, Serialize)] pub struct FileVectorEntry { index: u32, count: Count, } impl Deref for FileVector { type Target = Vec; fn deref(&self) -> &Self::Target { &self.data } } impl DerefMut for FileVector { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.data } } impl FileVector { pub fn new() -> Self { Self { data : Vec::new(), cache : None } } pub fn to_hashmap(&self) -> HashMap { let mut map = HashMap::new(); for e in self.data.iter() { map.insert(e.index, e.count); } map } pub fn from_words(words: Vec) -> Self { let mut data : HashMap = HashMap::new(); for word in words { let mut hasher = hash32::FnvHasher::default(); word.hash(&mut hasher); let k = hasher.finish32(); match data.entry(k as Indexer) { std::collections::hash_map::Entry::Occupied(mut e) => { let i = *e.get(); if i == Count::MAX { e.insert((i + 1) as Count); } } std::collections::hash_map::Entry::Vacant(e) => { e.insert(1); } } } FileVector::from_hashmap(data) } pub fn from_hashmap(map : HashMap) -> Self { let mut v = Vec::from_iter(map.iter().map(|e| { FileVectorEntry { index: *e.0, count: *e.1 } })); v.sort_by(|a, b| { let id = a.index; let id2 = b.index; id.cmp(&id2) }); Self { data : v, cache: None } } } pub fn scalar_product(a : &FileVector, b : &FileVector) -> u32 { let mut c : u32 = 0; for entry in a.iter() { let id = entry.index; let x1 = entry.count as u32; if let Ok(x2) = b.data.binary_search_by(|a| { let id2 = a.index; id2.cmp(&id) }) { c += (x1 * x2 as u32) as u32; } } c } 
/// Scores `v` against `query`, requiring every query index to occur in `v`.
///
/// Returns the sum, over all entries of `query`, of the query count multiplied
/// by the count stored in `v` for the same index. If any query index is missing
/// from `v` the result is `0`; an empty query also yields `0`.
/// `v` must be sorted by index, because entries are located by binary search.
pub fn match_vector(query : &FileVector, v : &FileVector) -> u32 {
    let mut score: u32 = 0;
    for entry in query.data.iter() {
        let index = entry.index;
        let found = v.data.binary_search_by(|probe| {
            // Copy the packed field to a local before calling a method on it.
            let probe_index = probe.index;
            probe_index.cmp(&index)
        });
        // `binary_search_by` yields the *position* of the match, not its count.
        let Ok(pos) = found else {
            return 0;
        };
        score += u32::from(entry.count) * u32::from(v.data[pos].count);
    }
    score
}