use binary search for vector matching instead of hashmaps

author: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-26 22:46:48 +0200
committer: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-26 22:46:48 +0200
commit: 8dbfc9613673be00304d5efc08815a64f7b1ccca (patch)
tree: 6ffdeba30a738ecaf417bde49e4b2d472296d552
parent: b1645aa2047cbfed040a40e8f4ed2a92765dc287 (diff)
2 files changed, 36 insertions, 17 deletions
diff --git a/src/index.rs b/src/index.rs
index 5d2abb6..d3aab9e 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -149,6 +149,7 @@ impl Index {
         let opt = FileVector::from_hashmap(opt);
 
         let mut results : Vec<SearchResult> = Vec::new();
+        let max = self.filecache.len();
 
         let mut last_p = 0;
         for (i, filecache) in self.filecache.iter().enumerate() {
@@ -159,7 +160,7 @@ impl Index {
                 results.push(r);
             }
 
-            let p = i * 100 / self.filecache.len();
+            let p = i * 100 / max;
             if last_p < p {
                 callback(p as u8);
                 last_p = p;
diff --git a/src/vector.rs b/src/vector.rs
index c85c50a..e73e981 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -13,7 +13,8 @@ pub type Indexer = u32;
 /// of storage.
 #[derive(Default, Clone, Debug, Deserialize, Serialize)]
 pub struct FileVector {
-    data : Vec<FileVectorEntry>
+    data : Vec<FileVectorEntry>,
+    cache : Option<HashMap<Indexer, Count>>
 }
 
 #[repr(packed)]
@@ -38,7 +39,7 @@ impl DerefMut for FileVector {
 
 impl FileVector {
     pub fn new() -> Self {
-        Self { data : Vec::new() }
+        Self { data : Vec::new(), cache : None }
     }
 
     pub fn to_hashmap(&self) -> HashMap<Indexer, Count> {
@@ -75,32 +76,49 @@ impl FileVector {
     }
 
     pub fn from_hashmap(map : HashMap<Indexer, Count>) -> Self {
-        Self { data : Vec::from_iter(map.iter().map(|e| {
-            FileVectorEntry { index: *e.0, count: *e.1 }
-        }))}
+        let mut v = Vec::from_iter(map.iter().map(|e| {
+                    FileVectorEntry { index: *e.0, count: *e.1 }
+                }));
+        v.sort_by(|a, b| {
+            let id = a.index;
+            let id2 = b.index;
+            id.cmp(&id2)
+        });
+        Self { data : v,
+               cache: None
+        }
     }
 }
 
 pub fn scalar_product(a : &FileVector, b : &FileVector) -> u32 {
-    let a = a.to_hashmap();
-    let b = b.to_hashmap();
     let mut c : u32 = 0;
-    for (i, x) in a.iter() {
-        c += (x * (b.get(i).unwrap_or(&0))) as u32;
+    for entry in a.iter() {
+        let id = entry.index;
+        let x1 = entry.count as u32;
+
+        if let Ok(x2) = b.data.binary_search_by(|a| {
+            let id2 = a.index;
+            id2.cmp(&id)
+        }) {
+            c += (x1 * x2 as u32) as u32;
+        }
     }
     c
 }
 
 pub fn match_vector(query : &FileVector, v : &FileVector) -> u32 {
-    let query = query.to_hashmap();
-    let v = v.to_hashmap();
     let mut c : u32 = 0;
-    for (i, x) in query.iter() {
-        let s = x * (v.get(i).unwrap_or(&0));
-        if s == 0 {
-            return 0
-        } else {
+    for entry in query.data.iter() {
+        let id = entry.index as u32;
+        let x1 = entry.count as u32;
+        if let Ok(x2) = v.data.binary_search_by(|a| {
+            let id2 = a.index;
+            id2.cmp(&id)
+        }) {
+            let s = x1 * (x2 as u32);
             c += s as u32;
+        } else {
+            return 0;
         }
     }
     c
author	Nathan Reiner <nathan@nathanreiner.xyz>	2023-07-26 22:46:48 +0200
committer	Nathan Reiner <nathan@nathanreiner.xyz>	2023-07-26 22:46:48 +0200
commit	8dbfc9613673be00304d5efc08815a64f7b1ccca (patch)
tree	6ffdeba30a738ecaf417bde49e4b2d472296d552
parent	b1645aa2047cbfed040a40e8f4ed2a92765dc287 (diff)