about | summary | refs | log | tree | commit | diff
path: root/src/index.rs
diff options
context:
space:
mode:
author Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-14 00:22:39 +0200
committer Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-14 00:22:39 +0200
commit 149e0b6ae9871515be21f23b492f5ef7355e2ca4 (patch)
tree d5b1bf8281a3a1cf181d5c921a53dfd99fd8b7a9 /src/index.rs
parent 0723ea6b6bb6832b11582eeb8a330d2bdb6077b5 (diff)
make fast using hash instead of dictionary
Diffstat (limited to 'src/index.rs')
-rw-r--r-- src/index.rs 123
1 files changed, 23 insertions, 100 deletions
diff --git a/src/index.rs b/src/index.rs
index fd38298..8fb34fe 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,13 +1,13 @@
-use std::collections::{HashSet, HashMap};
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
use std::fs::File;
-use std::io::{BufWriter, BufReader, BufRead, Write};
+use std::io::{BufWriter, Write};
use std::sync::mpsc::{channel, Sender};
use std::time::Duration;
use walkdir::*;
use std::thread;
use std::option::Option::None;
use crate::vector::FileVector;
-use crate::dictionary::Dictionary;
use crate::filecache::FileCache;
use crate::searchresult::SearchResult;
use crate::filecounter::filecount;
@@ -19,7 +19,6 @@ use crate::vector;
/// or read from a file.
#[derive(Clone, Debug)]
pub struct Index {
- dictionary : Dictionary,
filecache : Vec<FileCache>,
}
@@ -40,7 +39,6 @@ pub enum GenState {
impl Index {
pub fn empty() -> Self {
Self {
- dictionary : Dictionary::new(),
filecache : Vec::new()
}
}
@@ -62,14 +60,12 @@ impl Index {
tx_vec.push(tx);
let status_tx = status_tx.clone();
crawler_handles.push(thread::spawn(move || {
- let mut dict = Dictionary::new();
let mut filecache : Vec<FileCache> = Vec::new();
loop {
let path = rx.recv().unwrap();
if path.is_empty() {
return Self {
- dictionary : dict,
filecache
}
}
@@ -83,7 +79,7 @@ impl Index {
}
let words : Vec<String> = splitter::split_to_words(content);
- let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect());
+ let fv = FileVector::from_words(words);
filecache.push(FileCache {
path,
vector : fv
@@ -160,90 +156,26 @@ impl Index {
}
pub fn from_file(path : &String) -> Self {
- let index_file = File::open(path).expect("could not open index file");
- let reader = BufReader::new(index_file);
- let mut filecache : Vec<FileCache> = Vec::new();
- let mut dict = Dictionary::new();
-
- for line in reader.lines() {
- let l = line.unwrap();
- if l.starts_with('#') {
- dict = Dictionary::from_line(l.strip_prefix('#').unwrap());
- } else {
- filecache.push(FileCache::from_line(l));
- }
- }
+ let bytes = std::fs::read(path).expect("could not read index file");
+ let filecache : Vec<FileCache> = bincode::deserialize(&bytes).unwrap();
Self {
- dictionary : dict,
filecache
}
}
- fn merge_into(&mut self, other : Index) {
- let mut dict = self.dictionary.clone();
- thread::scope(|s| {
- let mut a_hash : HashSet<&FileCache> = HashSet::new();
- let mut diff : Vec<&FileCache> = Vec::new();
-
- let converter_handle = s.spawn(|| {
- let mut b_id_to_word : HashMap<u64, &String> = HashMap::new();
-
- for (value, id) in other.dictionary.iter() {
- b_id_to_word.insert(*id, value);
- }
- b_id_to_word
- });
-
- let dict_handle = s.spawn(|| {
- for (word, _) in other.dictionary.iter() {
- dict.set(word);
- }
- dict
- });
-
- for file in self.filecache.iter() {
- a_hash.insert(file);
- }
-
- for file in other.filecache.iter() {
- if !a_hash.contains(file) {
- diff.push(file);
- }
- }
-
- let b_id_to_word = converter_handle.join().unwrap();
- self.dictionary = dict_handle.join().unwrap();
-
- for file in diff {
- let mut words : Vec<&String> = Vec::new();
-
- for (word_id, i) in file.vector.iter() {
- for _ in 0..*i {
- words.push(b_id_to_word.get(word_id).unwrap());
- }
- }
-
- self.filecache.push(FileCache {
- path : file.path.clone(),
- vector: self.dictionary.vectorize_word_list(&words)
- });
- }
- });
- }
-
- pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
+ pub fn merge(indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
let max = indexes.len();
+ let mut filecache = Vec::new();
- indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len()));
- let mut merged_index : Index = indexes.pop().unwrap();
for (i, index) in indexes.into_iter().enumerate() {
callback((i * 100 / max) as u8);
- merged_index.merge_into(index);
+ filecache.extend(index.filecache);
}
+
callback(100);
- merged_index
+ Self { filecache }
}
pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> {
@@ -251,13 +183,14 @@ impl Index {
let mut opt : FileVector = FileVector::new();
for arg in search_args {
- let a = arg.trim_start_matches("+");
- if let Some(value) = self.dictionary.get(&a.to_string()) {
- if arg.chars().nth(0).unwrap() == '+' {
- opt.insert(*value, 1);
- } else {
- v.insert(*value, 1);
- }
+ let mut hasher = DefaultHasher::new();
+ let a = arg.trim_start_matches('+');
+ a.hash(&mut hasher);
+ let value = hasher.finish();
+ if arg.starts_with('+') {
+ opt.insert(value, 1);
+ } else {
+ v.insert(value, 1);
}
}
@@ -276,21 +209,11 @@ impl Index {
}
pub fn save(&self, path: String) {
- thread::scope(|s| {
- let dict_list_handle = s.spawn(|| {
- self.dictionary.to_list().join(",")
- });
+ let index_file = File::create(path).expect("could not open output file");
+ let mut file = BufWriter::new(index_file);
- let index_file = File::create(path).expect("could not open output file");
- let mut file = BufWriter::new(index_file);
-
- for fc in self.filecache.iter() {
- write!(file, "{}, {}\n", fc.path.replace(',', "\0"), fc.vector.stringify()).ok();
- }
-
- write!(file, "#{}\n", dict_list_handle.join().unwrap().as_str()).ok();
- file.flush().ok();
- });
+ file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok();
+ file.flush().ok();
}
pub fn num_files(&self) -> usize {