diff options
| author | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-14 00:22:39 +0200 |
|---|---|---|
| committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-14 00:22:39 +0200 |
| commit | 149e0b6ae9871515be21f23b492f5ef7355e2ca4 (patch) | |
| tree | d5b1bf8281a3a1cf181d5c921a53dfd99fd8b7a9 /src/index.rs | |
| parent | 0723ea6b6bb6832b11582eeb8a330d2bdb6077b5 (diff) | |
make indexing faster by hashing words directly instead of maintaining a dictionary
Diffstat (limited to 'src/index.rs')
| -rw-r--r-- | src/index.rs | 123 |
1 file changed, 23 insertions, 100 deletions
diff --git a/src/index.rs b/src/index.rs index fd38298..8fb34fe 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,13 +1,13 @@ -use std::collections::{HashSet, HashMap}; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; use std::fs::File; -use std::io::{BufWriter, BufReader, BufRead, Write}; +use std::io::{BufWriter, Write}; use std::sync::mpsc::{channel, Sender}; use std::time::Duration; use walkdir::*; use std::thread; use std::option::Option::None; use crate::vector::FileVector; -use crate::dictionary::Dictionary; use crate::filecache::FileCache; use crate::searchresult::SearchResult; use crate::filecounter::filecount; @@ -19,7 +19,6 @@ use crate::vector; /// or read from a file. #[derive(Clone, Debug)] pub struct Index { - dictionary : Dictionary, filecache : Vec<FileCache>, } @@ -40,7 +39,6 @@ pub enum GenState { impl Index { pub fn empty() -> Self { Self { - dictionary : Dictionary::new(), filecache : Vec::new() } } @@ -62,14 +60,12 @@ impl Index { tx_vec.push(tx); let status_tx = status_tx.clone(); crawler_handles.push(thread::spawn(move || { - let mut dict = Dictionary::new(); let mut filecache : Vec<FileCache> = Vec::new(); loop { let path = rx.recv().unwrap(); if path.is_empty() { return Self { - dictionary : dict, filecache } } @@ -83,7 +79,7 @@ impl Index { } let words : Vec<String> = splitter::split_to_words(content); - let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect()); + let fv = FileVector::from_words(words); filecache.push(FileCache { path, vector : fv @@ -160,90 +156,26 @@ impl Index { } pub fn from_file(path : &String) -> Self { - let index_file = File::open(path).expect("could not open index file"); - let reader = BufReader::new(index_file); - let mut filecache : Vec<FileCache> = Vec::new(); - let mut dict = Dictionary::new(); - - for line in reader.lines() { - let l = line.unwrap(); - if l.starts_with('#') { - dict = Dictionary::from_line(l.strip_prefix('#').unwrap()); - } else { - 
filecache.push(FileCache::from_line(l)); - } - } + let bytes = std::fs::read(path).expect("could not read index file"); + let filecache : Vec<FileCache> = bincode::deserialize(&bytes).unwrap(); Self { - dictionary : dict, filecache } } - fn merge_into(&mut self, other : Index) { - let mut dict = self.dictionary.clone(); - thread::scope(|s| { - let mut a_hash : HashSet<&FileCache> = HashSet::new(); - let mut diff : Vec<&FileCache> = Vec::new(); - - let converter_handle = s.spawn(|| { - let mut b_id_to_word : HashMap<u64, &String> = HashMap::new(); - - for (value, id) in other.dictionary.iter() { - b_id_to_word.insert(*id, value); - } - b_id_to_word - }); - - let dict_handle = s.spawn(|| { - for (word, _) in other.dictionary.iter() { - dict.set(word); - } - dict - }); - - for file in self.filecache.iter() { - a_hash.insert(file); - } - - for file in other.filecache.iter() { - if !a_hash.contains(file) { - diff.push(file); - } - } - - let b_id_to_word = converter_handle.join().unwrap(); - self.dictionary = dict_handle.join().unwrap(); - - for file in diff { - let mut words : Vec<&String> = Vec::new(); - - for (word_id, i) in file.vector.iter() { - for _ in 0..*i { - words.push(b_id_to_word.get(word_id).unwrap()); - } - } - - self.filecache.push(FileCache { - path : file.path.clone(), - vector: self.dictionary.vectorize_word_list(&words) - }); - } - }); - } - - pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self { + pub fn merge(indexes : Vec<Index>, callback : impl Fn(u8)) -> Self { let max = indexes.len(); + let mut filecache = Vec::new(); - indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len())); - let mut merged_index : Index = indexes.pop().unwrap(); for (i, index) in indexes.into_iter().enumerate() { callback((i * 100 / max) as u8); - merged_index.merge_into(index); + filecache.extend(index.filecache); } + callback(100); - merged_index + Self { filecache } } pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> { @@ 
-251,13 +183,14 @@ impl Index { let mut opt : FileVector = FileVector::new(); for arg in search_args { - let a = arg.trim_start_matches("+"); - if let Some(value) = self.dictionary.get(&a.to_string()) { - if arg.chars().nth(0).unwrap() == '+' { - opt.insert(*value, 1); - } else { - v.insert(*value, 1); - } + let mut hasher = DefaultHasher::new(); + let a = arg.trim_start_matches('+'); + a.hash(&mut hasher); + let value = hasher.finish(); + if arg.starts_with('+') { + opt.insert(value, 1); + } else { + v.insert(value, 1); } } @@ -276,21 +209,11 @@ impl Index { } pub fn save(&self, path: String) { - thread::scope(|s| { - let dict_list_handle = s.spawn(|| { - self.dictionary.to_list().join(",") - }); + let index_file = File::create(path).expect("could not open output file"); + let mut file = BufWriter::new(index_file); - let index_file = File::create(path).expect("could not open output file"); - let mut file = BufWriter::new(index_file); - - for fc in self.filecache.iter() { - write!(file, "{}, {}\n", fc.path.replace(',', "\0"), fc.vector.stringify()).ok(); - } - - write!(file, "#{}\n", dict_list_handle.join().unwrap().as_str()).ok(); - file.flush().ok(); - }); + file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok(); + file.flush().ok(); } pub fn num_files(&self) -> usize { |