aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan Reiner <nathan@nathanreiner.xyz>2023-07-26 22:46:48 +0200
committerNathan Reiner <nathan@nathanreiner.xyz>2023-07-26 22:46:48 +0200
commit8dbfc9613673be00304d5efc08815a64f7b1ccca (patch)
tree6ffdeba30a738ecaf417bde49e4b2d472296d552
parentb1645aa2047cbfed040a40e8f4ed2a92765dc287 (diff)
use binary search for vector matching instead of hashmaps
-rw-r--r--src/index.rs3
-rw-r--r--src/vector.rs50
2 files changed, 36 insertions, 17 deletions
diff --git a/src/index.rs b/src/index.rs
index 5d2abb6..d3aab9e 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -149,6 +149,7 @@ impl Index {
let opt = FileVector::from_hashmap(opt);
let mut results : Vec<SearchResult> = Vec::new();
+ let max = self.filecache.len();
let mut last_p = 0;
for (i, filecache) in self.filecache.iter().enumerate() {
@@ -159,7 +160,7 @@ impl Index {
results.push(r);
}
- let p = i * 100 / self.filecache.len();
+ let p = i * 100 / max;
if last_p < p {
callback(p as u8);
last_p = p;
diff --git a/src/vector.rs b/src/vector.rs
index c85c50a..e73e981 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -13,7 +13,8 @@ pub type Indexer = u32;
/// of storage.
#[derive(Default, Clone, Debug, Deserialize, Serialize)]
pub struct FileVector {
- data : Vec<FileVectorEntry>
+ data : Vec<FileVectorEntry>,
+ cache : Option<HashMap<Indexer, Count>>
}
#[repr(packed)]
@@ -38,7 +39,7 @@ impl DerefMut for FileVector {
impl FileVector {
pub fn new() -> Self {
- Self { data : Vec::new() }
+ Self { data : Vec::new(), cache : None }
}
pub fn to_hashmap(&self) -> HashMap<Indexer, Count> {
@@ -75,32 +76,49 @@ impl FileVector {
}
pub fn from_hashmap(map : HashMap<Indexer, Count>) -> Self {
- Self { data : Vec::from_iter(map.iter().map(|e| {
- FileVectorEntry { index: *e.0, count: *e.1 }
- }))}
+ let mut v = Vec::from_iter(map.iter().map(|e| {
+ FileVectorEntry { index: *e.0, count: *e.1 }
+ }));
+ v.sort_by(|a, b| {
+ let id = a.index;
+ let id2 = b.index;
+ id.cmp(&id2)
+ });
+ Self { data : v,
+ cache: None
+ }
}
}
pub fn scalar_product(a : &FileVector, b : &FileVector) -> u32 {
- let a = a.to_hashmap();
- let b = b.to_hashmap();
let mut c : u32 = 0;
- for (i, x) in a.iter() {
- c += (x * (b.get(i).unwrap_or(&0))) as u32;
+ for entry in a.iter() {
+ let id = entry.index;
+ let x1 = entry.count as u32;
+
+ if let Ok(x2) = b.data.binary_search_by(|a| {
+ let id2 = a.index;
+ id2.cmp(&id)
+ }) {
+ c += (x1 * x2 as u32) as u32;
+ }
}
c
}
pub fn match_vector(query : &FileVector, v : &FileVector) -> u32 {
- let query = query.to_hashmap();
- let v = v.to_hashmap();
let mut c : u32 = 0;
- for (i, x) in query.iter() {
- let s = x * (v.get(i).unwrap_or(&0));
- if s == 0 {
- return 0
- } else {
+ for entry in query.data.iter() {
+ let id = entry.index as u32;
+ let x1 = entry.count as u32;
+ if let Ok(x2) = v.data.binary_search_by(|a| {
+ let id2 = a.index;
+ id2.cmp(&id)
+ }) {
+ let s = x1 * (x2 as u32);
c += s as u32;
+ } else {
+ return 0;
}
}
c