diff options
author | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-26 22:46:48 +0200 |
---|---|---|
committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-26 22:46:48 +0200 |
commit | 8dbfc9613673be00304d5efc08815a64f7b1ccca (patch) | |
tree | 6ffdeba30a738ecaf417bde49e4b2d472296d552 | |
parent | b1645aa2047cbfed040a40e8f4ed2a92765dc287 (diff) |
use binary search for vector matching instead of hashmaps
-rw-r--r-- | src/index.rs | 3 | ||||
-rw-r--r-- | src/vector.rs | 50 |
2 files changed, 36 insertions, 17 deletions
diff --git a/src/index.rs b/src/index.rs index 5d2abb6..d3aab9e 100644 --- a/src/index.rs +++ b/src/index.rs @@ -149,6 +149,7 @@ impl Index { let opt = FileVector::from_hashmap(opt); let mut results : Vec<SearchResult> = Vec::new(); + let max = self.filecache.len(); let mut last_p = 0; for (i, filecache) in self.filecache.iter().enumerate() { @@ -159,7 +160,7 @@ impl Index { results.push(r); } - let p = i * 100 / self.filecache.len(); + let p = i * 100 / max; if last_p < p { callback(p as u8); last_p = p; diff --git a/src/vector.rs b/src/vector.rs index c85c50a..e73e981 100644 --- a/src/vector.rs +++ b/src/vector.rs @@ -13,7 +13,8 @@ pub type Indexer = u32; /// of storage. #[derive(Default, Clone, Debug, Deserialize, Serialize)] pub struct FileVector { - data : Vec<FileVectorEntry> + data : Vec<FileVectorEntry>, + cache : Option<HashMap<Indexer, Count>> } #[repr(packed)] @@ -38,7 +39,7 @@ impl DerefMut for FileVector { impl FileVector { pub fn new() -> Self { - Self { data : Vec::new() } + Self { data : Vec::new(), cache : None } } pub fn to_hashmap(&self) -> HashMap<Indexer, Count> { @@ -75,32 +76,49 @@ impl FileVector { } pub fn from_hashmap(map : HashMap<Indexer, Count>) -> Self { - Self { data : Vec::from_iter(map.iter().map(|e| { - FileVectorEntry { index: *e.0, count: *e.1 } - }))} + let mut v = Vec::from_iter(map.iter().map(|e| { + FileVectorEntry { index: *e.0, count: *e.1 } + })); + v.sort_by(|a, b| { + let id = a.index; + let id2 = b.index; + id.cmp(&id2) + }); + Self { data : v, + cache: None + } } } pub fn scalar_product(a : &FileVector, b : &FileVector) -> u32 { - let a = a.to_hashmap(); - let b = b.to_hashmap(); let mut c : u32 = 0; - for (i, x) in a.iter() { - c += (x * (b.get(i).unwrap_or(&0))) as u32; + for entry in a.iter() { + let id = entry.index; + let x1 = entry.count as u32; + + if let Ok(x2) = b.data.binary_search_by(|a| { + let id2 = a.index; + id2.cmp(&id) + }) { + c += (x1 * x2 as u32) as u32; + } } c } pub fn match_vector(query : &FileVector, v : &FileVector) -> u32 { - let query = query.to_hashmap(); - let v = v.to_hashmap(); let mut c : u32 = 0; - for (i, x) in query.iter() { - let s = x * (v.get(i).unwrap_or(&0)); - if s == 0 { - return 0 - } else { + for entry in query.data.iter() { + let id = entry.index as u32; + let x1 = entry.count as u32; + if let Ok(x2) = v.data.binary_search_by(|a| { + let id2 = a.index; + id2.cmp(&id) + }) { + let s = x1 * (x2 as u32); c += s as u32; + } else { + return 0; } } c |