From e1cb45be6575c805e3b0ab1e2d03d4047acf88d1 Mon Sep 17 00:00:00 2001
From: Nathan Reiner
Date: Tue, 25 Jul 2023 22:21:25 +0200
Subject: make it use less ram using packed structs

---
 src/vector.rs | 67 ++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 17 deletions(-)

(limited to 'src/vector.rs')

diff --git a/src/vector.rs b/src/vector.rs
index fa0a139..c85c50a 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -1,8 +1,11 @@
 use std::collections::HashMap;
-use std::collections::hash_map::DefaultHasher;
-use std::hash::{Hash, Hasher};
+use std::hash::Hash;
 use std::ops::{Deref, DerefMut};
 use serde::{Deserialize, Serialize};
+use hash32::Hasher;
+
+pub type Count = u8;
+pub type Indexer = u32;
 
 /// Represents the content of a cached file.
 /// It is stored as a HashMap, because we do not
@@ -10,11 +13,18 @@ use serde::{Deserialize, Serialize};
 /// of storage.
 #[derive(Default, Clone, Debug, Deserialize, Serialize)]
 pub struct FileVector {
-    data : HashMap<u64, u64>
+    data : Vec<FileVectorEntry>
+}
+
+#[repr(packed)]
+#[derive(Default, Clone, Debug, Deserialize, Serialize)]
+pub struct FileVectorEntry {
+    index: u32,
+    count: Count,
 }
 
 impl Deref for FileVector {
-    type Target = HashMap<u64, u64>;
+    type Target = Vec<FileVectorEntry>;
     fn deref(&self) -> &Self::Target {
         &self.data
     }
@@ -28,19 +38,32 @@ impl DerefMut for FileVector {
 
 impl FileVector {
     pub fn new() -> Self {
-        Self { data : HashMap::new() }
+        Self { data : Vec::new() }
+    }
+
+    pub fn to_hashmap(&self) -> HashMap<Indexer, Count> {
+        let mut map = HashMap::new();
+
+        for e in self.data.iter() {
+            map.insert(e.index, e.count);
+        }
+
+        map
     }
 
     pub fn from_words(words: Vec<String>) -> Self {
-        let mut data = HashMap::new();
+        let mut data : HashMap<Indexer, Count> = HashMap::new();
 
         for word in words {
-            let mut hasher = DefaultHasher::new();
+            let mut hasher = hash32::FnvHasher::default();
             word.hash(&mut hasher);
-            let k = hasher.finish();
-            match data.entry(k) {
+            let k = hasher.finish32();
+            match data.entry(k as Indexer) {
                 std::collections::hash_map::Entry::Occupied(mut e) => {
-                    e.insert(e.get() + 1);
+                    let i = *e.get();
+                    if i == Count::MAX {
+                        e.insert((i + 1) as Count);
+                    }
                 }
                 std::collections::hash_map::Entry::Vacant(e) => {
                     e.insert(1);
@@ -48,26 +71,36 @@ impl FileVector {
             }
         }
 
-        Self { data }
+        FileVector::from_hashmap(data)
+    }
+
+    pub fn from_hashmap(map : HashMap<Indexer, Count>) -> Self {
+        Self { data : Vec::from_iter(map.iter().map(|e| {
+            FileVectorEntry { index: *e.0, count: *e.1 }
+        }))}
     }
 }
 
-pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {
-    let mut c = 0;
+pub fn scalar_product(a : &FileVector, b : &FileVector) -> u32 {
+    let a = a.to_hashmap();
+    let b = b.to_hashmap();
+    let mut c : u32 = 0;
     for (i, x) in a.iter() {
-        c += x * (b.get(i).unwrap_or(&0));
+        c += (x * (b.get(i).unwrap_or(&0))) as u32;
     }
     c
 }
 
-pub fn match_vector(query : &FileVector, v : &FileVector) -> u64 {
-    let mut c = 0;
+pub fn match_vector(query : &FileVector, v : &FileVector) -> u32 {
+    let query = query.to_hashmap();
+    let v = v.to_hashmap();
+    let mut c : u32 = 0;
     for (i, x) in query.iter() {
         let s = x * (v.get(i).unwrap_or(&0));
         if s == 0 {
             return 0
         } else {
-            c += s;
+            c += s as u32;
         }
     }
     c
-- 
cgit v1.2.3-70-g09d2