From 019d08f3441c9e499977d583bb0f8383aff50d4b Mon Sep 17 00:00:00 2001 From: Nathan Reiner Date: Sun, 9 Jul 2023 13:12:00 +0200 Subject: introduce some minor optimizations --- src/dictionary.rs | 42 +++++++++++++---- src/filecache.rs | 2 +- src/gui/mod.rs | 2 +- src/index.rs | 136 ++++++++++++++++++++++++++++-------------------------- src/main.rs | 16 +++---- src/splitter.rs | 2 +- src/vector.rs | 2 +- 7 files changed, 117 insertions(+), 85 deletions(-) (limited to 'src') diff --git a/src/dictionary.rs b/src/dictionary.rs index e651e71..563621d 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -34,19 +34,29 @@ impl Dictionary { Self { last_index : i - 1, data } } - pub fn set(&mut self, name : String) { - if let std::collections::hash_map::Entry::Vacant(e) = self.data.entry(name) { + pub fn set(&mut self, name : &String) { + if !self.data.contains_key(name) { self.last_index += 1; - e.insert(self.last_index as u64); + self.data.insert(name.clone(), self.last_index as u64); } } - pub fn get(&self, name : String) -> Option<&u64> { - self.data.get(&name) + pub fn set_and_get(&mut self, name : &String) -> u64 { + if !self.data.contains_key(name) { + self.last_index += 1; + self.data.insert(name.clone(), self.last_index as u64); + self.last_index as u64 + } else { + *self.data.get(name).unwrap() + } } - pub fn iter(&self) -> &HashMap { - &self.data + pub fn get(&self, name : &String) -> Option<&u64> { + self.data.get(name) + } + + pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, u64> { + self.data.iter() } pub fn to_list(&self) -> Vec { @@ -61,7 +71,7 @@ impl Dictionary { v } - pub fn vectorize_word_list(&self, words : Vec) -> FileVector { + pub fn vectorize_word_list(&self, words : &Vec<&String>) -> FileVector { let mut fv = FileVector::new(); for word in words { @@ -76,4 +86,20 @@ impl Dictionary { fv } + + pub fn insert_words_and_vectorize_word_list(&mut self, words : &Vec<&String>) -> FileVector { + let mut fv = FileVector::new(); + + for word in words { + let i = self.set_and_get(word); + if !fv.contains_key(&i) { + fv.insert(i, 1); + } else { + let c : u64 = *fv.get(&i).unwrap(); + fv.insert(i, c + 1); + } + } + + fv + } } diff --git a/src/filecache.rs b/src/filecache.rs index a352e58..af97c20 100644 --- a/src/filecache.rs +++ b/src/filecache.rs @@ -26,7 +26,7 @@ impl Hash for FileCache { impl FileCache { pub fn from_line(line : String) -> Self { let ls : Vec = line.split(',').map(|s| s.to_string()).collect(); - let v = FileVector::from_string(ls[1].clone()); + let v = FileVector::from_string(&ls[1]); let p = ls[0].clone().replace('\0', ","); Self { vector : v, diff --git a/src/gui/mod.rs b/src/gui/mod.rs index 389fdb6..52cf8a7 100644 --- a/src/gui/mod.rs +++ b/src/gui/mod.rs @@ -83,7 +83,7 @@ async fn load_file() -> Index { let file = file.unwrap(); let file = file.to_str(); let file = file.unwrap(); - Index::from_file(file.to_string()) + Index::from_file(&file.to_string()) } async fn generate() -> Index { diff --git a/src/index.rs b/src/index.rs index cfb612a..741eb7c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,7 +1,8 @@ use std::collections::{HashSet, HashMap}; use std::fs::File; -use std::io::{Write, BufReader, BufRead}; +use std::io::{BufWriter, BufReader, BufRead, Write}; use std::sync::mpsc::{channel, Sender}; +use std::time::Duration; use walkdir::*; use std::thread; use std::option::Option::None; @@ -50,13 +51,16 @@ impl Index { let mut crawler_handles = Vec::new(); let num_threads = thread::available_parallelism().unwrap().get(); let mut tx_vec : Vec> = Vec::new(); + let mut indexes = Vec::new(); thread::scope(|s| { let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); + let (status_tx, status_rx) = channel(); for _ in 0..num_threads { let (tx, rx) = channel(); tx_vec.push(tx); + let status_tx = status_tx.clone(); crawler_handles.push(thread::spawn(move || { let mut dict = Dictionary::new(); let mut filecache : Vec = Vec::new(); @@ -72,17 +76,14 @@ impl Index { let content : String = text::extract_text(path.as_str()); + let _ = status_tx.send(()); + if content.is_empty() { continue; } let words : Vec = splitter::split_to_words(content); - - for word in words.iter() { - dict.set(word.clone()); - } - - let fv = dict.vectorize_word_list(words.clone()); + let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect()); filecache.push(FileCache { path, vector : fv @@ -128,22 +129,36 @@ impl Index { } } } - }); - let mut indexes = Vec::new(); - let mut i = 0; + let join_handle = s.spawn(|| { + for (i, handle) in crawler_handles.into_iter().enumerate() { + tx_vec[i].send(String::new()).ok(); + indexes.push(handle.join().unwrap()); + } + }); - for handle in crawler_handles { - callback(GenState::Parsing, (i * 100 / num_threads) as u8); - tx_vec[i].send(String::new()).ok(); - indexes.push(handle.join().unwrap()); - i += 1; - } - Index::merge(indexes, |p| { callback(GenState::Merging, p) }) + let mut i = 0; + let mut last_p = 0; + while !join_handle.is_finished() { + if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() { + i += 1; + let p = i * 100 / nof; + if p != last_p { + callback(GenState::Parsing, p as u8); + last_p = p; + } + } + } + + join_handle.join().ok(); + }); + + + Index::merge(indexes.iter().collect(), |p| { callback(GenState::Merging, p) }) } - pub fn from_file(path : String) -> Self { + pub fn from_file(path : &String) -> Self { let index_file = File::open(path).expect("could not open index file"); let reader = BufReader::new(index_file); let mut filecache : Vec = Vec::new(); @@ -164,77 +179,67 @@ impl Index { } } - fn merge_two(a : Index, b : Index) -> Self { - let mut filecache = a.filecache.clone(); - let mut dictionary = Dictionary::default(); - + fn merge_into(&mut self, other : &Index) { + let mut dict = self.dictionary.clone(); thread::scope(|s| { - let mut a_hash : HashSet = HashSet::new(); - let mut diff : Vec = Vec::new(); + let mut a_hash : HashSet<&FileCache> = HashSet::new(); + let mut diff : Vec<&FileCache> = Vec::new(); let converter_handle = s.spawn(|| { - let mut b_id_to_word : HashMap = HashMap::new(); + let mut b_id_to_word : HashMap = HashMap::new(); - for (value, id) in b.dictionary.iter() { - b_id_to_word.insert(id.clone(), value.clone()); + for (value, id) in other.dictionary.iter() { + b_id_to_word.insert(*id, value); } b_id_to_word }); let dict_handle = s.spawn(|| { - let mut dict = a.dictionary.clone(); - for (word, _) in b.dictionary.iter() { - dict.set(word.clone()); + for (word, _) in other.dictionary.iter() { + dict.set(word); } dict }); - for file in a.filecache.iter() { - a_hash.insert(file.clone()); + for file in self.filecache.iter() { + a_hash.insert(file); } - for file in b.filecache.iter() { + for file in other.filecache.iter() { if !a_hash.contains(file) { - diff.push(file.clone()); + diff.push(file); } } let b_id_to_word = converter_handle.join().unwrap(); - dictionary = dict_handle.join().unwrap(); + self.dictionary = dict_handle.join().unwrap(); for file in diff { - let mut words = Vec::new(); + let mut words : Vec<&String> = Vec::new(); for (word_id, i) in file.vector.iter() { for _ in 0..*i { - words.push(b_id_to_word.get(word_id).unwrap().clone()); + words.push(b_id_to_word.get(word_id).unwrap()); } } - filecache.push(FileCache { + self.filecache.push(FileCache { path : file.path.clone(), - vector: dictionary.vectorize_word_list(words) + vector: self.dictionary.vectorize_word_list(&words) }); } }); - - Self { - dictionary, - filecache - } } - pub fn merge(mut indexes : Vec, callback : impl Fn(u8)) -> Self { + pub fn merge(mut indexes : Vec<&Index>, callback : impl Fn(u8)) -> Self { let max = indexes.len(); - let mut i = 0 as usize; indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len())); - let mut merged_index = indexes.pop().unwrap(); + let mut merged_index : Index = indexes.pop().unwrap().clone(); - for index in indexes { + for (i, index) in indexes.into_iter().enumerate() { callback((i * 100 / max) as u8); - i += 1; - merged_index = Index::merge_two(merged_index, index); + merged_index.merge_into(index); } callback(100); merged_index @@ -244,7 +249,7 @@ impl Index { let mut v : FileVector = FileVector::new(); for arg in search_args { - if let Some(value) = self.dictionary.get(arg.to_string()) { + if let Some(value) = self.dictionary.get(&arg) { v.insert(*value, 1); } } @@ -262,19 +267,20 @@ impl Index { results } - pub fn save(&self, output : String) { - let mut index_file = File::create(output).unwrap(); - - for file in self.filecache.iter() { - writeln!( - index_file, - "{}, {}", - file.path .replace(',', "\0"), - file.vector.stringify() - ).ok(); - } - - let dict_list : Vec = self.dictionary.to_list(); - writeln!(index_file, "#{}", dict_list.join(",")).ok(); + pub fn save(&self, path: String) { + thread::scope(|s| { + let dict_list_handle = s.spawn(|| { + self.dictionary.to_list().join(",") + }); + let mut output : String = self.filecache.iter().map(|c| format!("{}, {}\n", c.path.replace(',', "\0"), c.vector.stringify())).collect(); + output += "#"; + output += dict_list_handle.join().unwrap().as_str(); + output += "\n"; + + let index_file = File::create(path).expect("could not open output file"); + let mut file = BufWriter::new(index_file); + file.write_all(output.as_bytes()).expect("could not write"); + file.flush().ok(); + }); } } diff --git a/src/main.rs b/src/main.rs index c5cc5f6..f9d5018 100644 --- a/src/main.rs +++ b/src/main.rs @@ -28,7 +28,7 @@ fn main() { let input = args.get(2).unwrap(); let file = args.get(3).unwrap(); - let _ = Index::generate(input, |t, p| { + Index::generate(input, |t, p| { eprint!("\r\x1b[2K{}% ", p); match t { GenState::Fetching => { eprint!("fetched") } @@ -48,7 +48,7 @@ fn main() { let search = v.join(" "); let searchvec = splitter::split_to_words(search); - let idx = Index::from_file(file); + let idx = Index::from_file(&file); let results = idx.search(searchvec); for result in results { println!("{}", result.path); @@ -61,12 +61,12 @@ fn main() { let merged = args.get(2).unwrap().clone(); let v : Vec = args.get(3..(args.len())).unwrap().into(); - let indexes = v.iter().map(|s| Index::from_file(s.clone())).collect(); - let _ = Index::merge(indexes, - |p| { - eprint!("\r\x1b[2K{}% merged", p); - } - ).save(merged); + let indexes : Vec = v.iter().map(Index::from_file).collect(); + Index::merge(indexes.iter().collect(), + |p| { + eprint!("\r\x1b[2K{}% merged", p); + } + ).save(merged); } } else { let _ = gui::run(); diff --git a/src/splitter.rs b/src/splitter.rs index fbb2b6a..c4015e8 100644 --- a/src/splitter.rs +++ b/src/splitter.rs @@ -12,7 +12,7 @@ pub fn split_to_words(data : String) -> Vec { word.retain(|c| !r#"{}[]#(),".;:?!'%|0123456789/\^"#.contains(c)) } - v.retain(|str| !str.is_empty()); + v.retain(|str| !str.is_empty() && !str.contains("--")); v } diff --git a/src/vector.rs b/src/vector.rs index 87be04b..dfd2d71 100644 --- a/src/vector.rs +++ b/src/vector.rs @@ -34,7 +34,7 @@ impl FileVector { Self { data : HashMap::new() } } - pub fn from_string(hex : String) -> Self { + pub fn from_string(hex : &str) -> Self { let mut data : HashMap = HashMap::new(); let data_chunks : Vec<&str> = hex.split(' ').collect(); -- cgit v1.2.3-70-g09d2