diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/dictionary.rs | 105 | ||||
| -rw-r--r-- | src/filecache.rs | 16 | ||||
| -rw-r--r-- | src/gui/mod.rs | 3 | ||||
| -rw-r--r-- | src/index.rs | 123 | ||||
| -rw-r--r-- | src/main.rs | 1 | ||||
| -rw-r--r-- | src/vector.rs | 43 |
6 files changed, 43 insertions, 248 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs deleted file mode 100644 index 3e05b91..0000000 --- a/src/dictionary.rs +++ /dev/null @@ -1,105 +0,0 @@ -use std::collections::HashMap; -use crate::vector::FileVector; - -/// The dictionary is used to cache to words ids. -/// It also provides a function to convert it to -/// a vector and generate a FileVector from a word list -/// with the current directory. -#[derive(Clone, Debug)] -pub struct Dictionary { - last_index : usize, - data : HashMap<String, u64>, -} - -impl Default for Dictionary { - fn default() -> Self { - Self::new() - } -} - -impl Dictionary { - pub fn new() -> Self { - Self { last_index : 0, data : HashMap::new() } - } - - pub fn from_line(line : &str) -> Self { - let mut data : HashMap<String, u64> = HashMap::new(); - let mut i : usize = 0; - - for word in line.split(',') { - data.insert(word.to_string(), i as u64); - i += 1; - } - - Self { last_index : i - 1, data } - } - - pub fn set(&mut self, name : &String) { - if !self.data.contains_key(name) { - self.last_index += 1; - self.data.insert(name.clone(), self.last_index as u64); - } - } - - pub fn set_and_get(&mut self, name : &String) -> u64 { - if !self.data.contains_key(name) { - self.last_index += 1; - self.data.insert(name.clone(), self.last_index as u64); - self.last_index as u64 - } else { - *self.data.get(name).unwrap() - } - } - - pub fn get(&self, name : &String) -> Option<&u64> { - self.data.get(name) - } - - pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, u64> { - self.data.iter() - } - - pub fn to_list(&self) -> Vec<String> { - let mut v = Vec::with_capacity(self.last_index + 1); - - v.resize(self.last_index + 1, "".to_string()); - - for (word, id) in self.iter() { - v[(*id) as usize] = word.clone(); - } - - v - } - - pub fn vectorize_word_list(&self, words : &Vec<&String>) -> FileVector { - let mut fv = FileVector::new(); - - for word in words { - let i = *self.get(word).unwrap(); - if !fv.contains_key(&i) { - fv.insert(i, 1); - } else { - let c = *fv.get(&i).unwrap(); - fv.insert(i, c + 1); - } - } - - fv - } - - pub fn insert_words_and_vectorize_word_list(&mut self, words : &Vec<&String>) -> FileVector { - let mut fv = FileVector::new(); - - for word in words { - let i = self.set_and_get(word); - if !fv.contains_key(&i) { - fv.insert(i, 1); - } else { - let c = *fv.get(&i).unwrap(); - fv.insert(i, c + 1); - } - } - - fv - } -} diff --git a/src/filecache.rs b/src/filecache.rs index af97c20..721d7a4 100644 --- a/src/filecache.rs +++ b/src/filecache.rs @@ -1,9 +1,9 @@ use std::hash::{Hasher, Hash}; - +use serde::{Deserialize, Serialize}; use crate::vector::FileVector; /// Represents one file which was indexed. -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Serialize, Deserialize)] pub struct FileCache { pub path : String, pub vector : FileVector, @@ -22,15 +22,3 @@ impl Hash for FileCache { self.path.hash(state); } } - -impl FileCache { - pub fn from_line(line : String) -> Self { - let ls : Vec<String> = line.split(',').map(|s| s.to_string()).collect(); - let v = FileVector::from_string(&ls[1]); - let p = ls[0].clone().replace('\0', ","); - Self { - vector : v, - path : p - } - } -} diff --git a/src/gui/mod.rs b/src/gui/mod.rs index f56dd71..593eb65 100644 --- a/src/gui/mod.rs +++ b/src/gui/mod.rs @@ -210,8 +210,7 @@ impl Application for App { } Message::ExportResults => { let file = rfd::FileDialog::new().set_title("Export to File").add_filter("Raw Text", &["txt"]).save_file(); - if file.is_some() { - let file = file.unwrap(); + if let Some(file) = file { let mut file = File::create(file).unwrap(); for result in state.results.iter() { writeln!(file, "{}", result.path).ok(); diff --git a/src/index.rs b/src/index.rs index fd38298..8fb34fe 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,13 +1,13 @@ -use std::collections::{HashSet, HashMap}; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; use std::fs::File; -use std::io::{BufWriter, BufReader, BufRead, Write}; +use std::io::{BufWriter, Write}; use std::sync::mpsc::{channel, Sender}; use std::time::Duration; use walkdir::*; use std::thread; use std::option::Option::None; use crate::vector::FileVector; -use crate::dictionary::Dictionary; use crate::filecache::FileCache; use crate::searchresult::SearchResult; use crate::filecounter::filecount; @@ -19,7 +19,6 @@ use crate::vector; /// or read from a file. #[derive(Clone, Debug)] pub struct Index { - dictionary : Dictionary, filecache : Vec<FileCache>, } @@ -40,7 +39,6 @@ pub enum GenState { impl Index { pub fn empty() -> Self { Self { - dictionary : Dictionary::new(), filecache : Vec::new() } } @@ -62,14 +60,12 @@ impl Index { tx_vec.push(tx); let status_tx = status_tx.clone(); crawler_handles.push(thread::spawn(move || { - let mut dict = Dictionary::new(); let mut filecache : Vec<FileCache> = Vec::new(); loop { let path = rx.recv().unwrap(); if path.is_empty() { return Self { - dictionary : dict, filecache } } @@ -83,7 +79,7 @@ impl Index { } let words : Vec<String> = splitter::split_to_words(content); - let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect()); + let fv = FileVector::from_words(words); filecache.push(FileCache { path, vector : fv @@ -160,90 +156,26 @@ impl Index { } pub fn from_file(path : &String) -> Self { - let index_file = File::open(path).expect("could not open index file"); - let reader = BufReader::new(index_file); - let mut filecache : Vec<FileCache> = Vec::new(); - let mut dict = Dictionary::new(); - - for line in reader.lines() { - let l = line.unwrap(); - if l.starts_with('#') { - dict = Dictionary::from_line(l.strip_prefix('#').unwrap()); - } else { - filecache.push(FileCache::from_line(l)); - } - } + let bytes = std::fs::read(path).expect("could not read index file"); + let filecache : Vec<FileCache> = bincode::deserialize(&bytes).unwrap(); Self { - dictionary : dict, filecache } } - fn merge_into(&mut self, other : Index) { - let mut dict = self.dictionary.clone(); - thread::scope(|s| { - let mut a_hash : HashSet<&FileCache> = HashSet::new(); - let mut diff : Vec<&FileCache> = Vec::new(); - - let converter_handle = s.spawn(|| { - let mut b_id_to_word : HashMap<u64, &String> = HashMap::new(); - - for (value, id) in other.dictionary.iter() { - b_id_to_word.insert(*id, value); - } - b_id_to_word - }); - - let dict_handle = s.spawn(|| { - for (word, _) in other.dictionary.iter() { - dict.set(word); - } - dict - }); - - for file in self.filecache.iter() { - a_hash.insert(file); - } - - for file in other.filecache.iter() { - if !a_hash.contains(file) { - diff.push(file); - } - } - - let b_id_to_word = converter_handle.join().unwrap(); - self.dictionary = dict_handle.join().unwrap(); - - for file in diff { - let mut words : Vec<&String> = Vec::new(); - - for (word_id, i) in file.vector.iter() { - for _ in 0..*i { - words.push(b_id_to_word.get(word_id).unwrap()); - } - } - - self.filecache.push(FileCache { - path : file.path.clone(), - vector: self.dictionary.vectorize_word_list(&words) - }); - } - }); - } - - pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self { + pub fn merge(indexes : Vec<Index>, callback : impl Fn(u8)) -> Self { let max = indexes.len(); + let mut filecache = Vec::new(); - indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len())); - let mut merged_index : Index = indexes.pop().unwrap(); for (i, index) in indexes.into_iter().enumerate() { callback((i * 100 / max) as u8); - merged_index.merge_into(index); + filecache.extend(index.filecache); } + callback(100); - merged_index + Self { filecache } } pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> { @@ -251,13 +183,14 @@ impl Index { let mut opt : FileVector = FileVector::new(); for arg in search_args { - let a = arg.trim_start_matches("+"); - if let Some(value) = self.dictionary.get(&a.to_string()) { - if arg.chars().nth(0).unwrap() == '+' { - opt.insert(*value, 1); - } else { - v.insert(*value, 1); - } + let mut hasher = DefaultHasher::new(); + let a = arg.trim_start_matches('+'); + a.hash(&mut hasher); + let value = hasher.finish(); + if arg.starts_with('+') { + opt.insert(value, 1); + } else { + v.insert(value, 1); } } @@ -276,21 +209,11 @@ impl Index { } pub fn save(&self, path: String) { - thread::scope(|s| { - let dict_list_handle = s.spawn(|| { - self.dictionary.to_list().join(",") - }); + let index_file = File::create(path).expect("could not open output file"); + let mut file = BufWriter::new(index_file); - let index_file = File::create(path).expect("could not open output file"); - let mut file = BufWriter::new(index_file); - - for fc in self.filecache.iter() { - write!(file, "{}, {}\n", fc.path.replace(',', "\0"), fc.vector.stringify()).ok(); - } - - write!(file, "#{}\n", dict_list_handle.join().unwrap().as_str()).ok(); - file.flush().ok(); - }); + file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok(); + file.flush().ok(); } pub fn num_files(&self) -> usize { diff --git a/src/main.rs b/src/main.rs index 729b40d..3f18de9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,4 @@ pub mod vector; -pub mod dictionary; pub mod text; pub mod splitter; pub mod filecache; diff --git a/src/vector.rs b/src/vector.rs index c058490..fa0a139 100644 --- a/src/vector.rs +++ b/src/vector.rs @@ -1,11 +1,14 @@ use std::collections::HashMap; +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; use std::ops::{Deref, DerefMut}; +use serde::{Deserialize, Serialize}; /// Represents the content of a cached file. /// It is stored as a HashMap, because we do not /// have to store the zeros. With that we save a lot /// of storage. -#[derive(Clone, Debug)] +#[derive(Default, Clone, Debug, Deserialize, Serialize)] pub struct FileVector { data : HashMap<u64, u64> } @@ -23,42 +26,30 @@ impl DerefMut for FileVector { } } -impl Default for FileVector { - fn default() -> Self { - Self::new() - } -} - impl FileVector { pub fn new() -> Self { Self { data : HashMap::new() } } - pub fn from_string(hex : &str) -> Self { - let mut data : HashMap<u64, u64> = HashMap::new(); - let data_chunks : Vec<&str> = hex.split(' ').collect(); + pub fn from_words(words: Vec<String>) -> Self { + let mut data = HashMap::new(); - for chunk in data_chunks { - if !chunk.is_empty() { - let n : Vec<&str> = chunk.split(';').collect(); - let i : u64 = u64::from_str_radix(n[0], 16).expect("could not extract index"); - let v : u64 = u64::from_str_radix(n[1], 16).expect("could not extract value"); - data.insert(i, v); + for word in words { + let mut hasher = DefaultHasher::new(); + word.hash(&mut hasher); + let k = hasher.finish(); + match data.entry(k) { + std::collections::hash_map::Entry::Occupied(mut e) => { + e.insert(e.get() + 1); + } + std::collections::hash_map::Entry::Vacant(e) => { + e.insert(1); + } } } Self { data } } - - pub fn stringify(&self) -> String { - let mut hex = String::new(); - - for (i, v) in self.data.iter() { - hex += &format!("{:x};{:x} ", *i, *v); - } - - hex.trim().to_string() - } } pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 { |