diff options
| author | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-09 13:12:00 +0200 |
|---|---|---|
| committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-09 13:12:00 +0200 |
| commit | 019d08f3441c9e499977d583bb0f8383aff50d4b (patch) | |
| tree | 1720d6386d39366ddc9f7d321e849350c3bd9c7b /src/index.rs | |
| parent | 8d6b55ebed2bda4705525004863055e99e72da83 (diff) | |
introduce some minor optimizations
Diffstat (limited to 'src/index.rs')
| -rw-r--r-- | src/index.rs | 134 |
1 files changed, 70 insertions, 64 deletions
diff --git a/src/index.rs b/src/index.rs index cfb612a..741eb7c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,7 +1,8 @@ use std::collections::{HashSet, HashMap}; use std::fs::File; -use std::io::{Write, BufReader, BufRead}; +use std::io::{BufWriter, BufReader, BufRead, Write}; use std::sync::mpsc::{channel, Sender}; +use std::time::Duration; use walkdir::*; use std::thread; use std::option::Option::None; @@ -50,13 +51,16 @@ impl Index { let mut crawler_handles = Vec::new(); let num_threads = thread::available_parallelism().unwrap().get(); let mut tx_vec : Vec<Sender<String>> = Vec::new(); + let mut indexes = Vec::new(); thread::scope(|s| { let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); + let (status_tx, status_rx) = channel(); for _ in 0..num_threads { let (tx, rx) = channel(); tx_vec.push(tx); + let status_tx = status_tx.clone(); crawler_handles.push(thread::spawn(move || { let mut dict = Dictionary::new(); let mut filecache : Vec<FileCache> = Vec::new(); @@ -72,17 +76,14 @@ impl Index { let content : String = text::extract_text(path.as_str()); + let _ = status_tx.send(()); + if content.is_empty() { continue; } let words : Vec<String> = splitter::split_to_words(content); - - for word in words.iter() { - dict.set(word.clone()); - } - - let fv = dict.vectorize_word_list(words.clone()); + let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect()); filecache.push(FileCache { path, vector : fv @@ -128,22 +129,36 @@ impl Index { } } } - }); - let mut indexes = Vec::new(); - let mut i = 0; + let join_handle = s.spawn(|| { + for (i, handle) in crawler_handles.into_iter().enumerate() { + tx_vec[i].send(String::new()).ok(); + indexes.push(handle.join().unwrap()); + } + }); - for handle in crawler_handles { - callback(GenState::Parsing, (i * 100 / num_threads) as u8); - tx_vec[i].send(String::new()).ok(); - indexes.push(handle.join().unwrap()); - i += 1; - } - Index::merge(indexes, |p| { callback(GenState::Merging, p) }) + let mut i = 0; + let mut last_p = 0; + while !join_handle.is_finished() { + if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() { + i += 1; + let p = i * 100 / nof; + if p != last_p { + callback(GenState::Parsing, p as u8); + last_p = p; + } + } + } + + join_handle.join().ok(); + }); + + + Index::merge(indexes.iter().collect(), |p| { callback(GenState::Merging, p) }) } - pub fn from_file(path : String) -> Self { + pub fn from_file(path : &String) -> Self { let index_file = File::open(path).expect("could not open index file"); let reader = BufReader::new(index_file); let mut filecache : Vec<FileCache> = Vec::new(); @@ -164,77 +179,67 @@ impl Index { } } - fn merge_two(a : Index, b : Index) -> Self { - let mut filecache = a.filecache.clone(); - let mut dictionary = Dictionary::default(); - + fn merge_into(&mut self, other : &Index) { + let mut dict = self.dictionary.clone(); thread::scope(|s| { - let mut a_hash : HashSet<FileCache> = HashSet::new(); - let mut diff : Vec<FileCache> = Vec::new(); + let mut a_hash : HashSet<&FileCache> = HashSet::new(); + let mut diff : Vec<&FileCache> = Vec::new(); let converter_handle = s.spawn(|| { - let mut b_id_to_word : HashMap<u64, String> = HashMap::new(); + let mut b_id_to_word : HashMap<u64, &String> = HashMap::new(); - for (value, id) in b.dictionary.iter() { - b_id_to_word.insert(id.clone(), value.clone()); + for (value, id) in other.dictionary.iter() { + b_id_to_word.insert(*id, value); } b_id_to_word }); let dict_handle = s.spawn(|| { - let mut dict = a.dictionary.clone(); - for (word, _) in b.dictionary.iter() { - dict.set(word.clone()); + for (word, _) in other.dictionary.iter() { + dict.set(word); } dict }); - for file in a.filecache.iter() { - a_hash.insert(file.clone()); + for file in self.filecache.iter() { + a_hash.insert(file); } - for file in b.filecache.iter() { + for file in other.filecache.iter() { if !a_hash.contains(file) { - diff.push(file.clone()); + diff.push(file); } } let b_id_to_word = converter_handle.join().unwrap(); - dictionary = dict_handle.join().unwrap(); + self.dictionary = dict_handle.join().unwrap(); for file in diff { - let mut words = Vec::new(); + let mut words : Vec<&String> = Vec::new(); for (word_id, i) in file.vector.iter() { for _ in 0..*i { - words.push(b_id_to_word.get(word_id).unwrap().clone()); + words.push(b_id_to_word.get(word_id).unwrap()); } } - filecache.push(FileCache { + self.filecache.push(FileCache { path : file.path.clone(), - vector: dictionary.vectorize_word_list(words) + vector: self.dictionary.vectorize_word_list(&words) }); } }); - - Self { - dictionary, - filecache - } } - pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self { + pub fn merge(mut indexes : Vec<&Index>, callback : impl Fn(u8)) -> Self { let max = indexes.len(); - let mut i = 0 as usize; indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len())); - let mut merged_index = indexes.pop().unwrap(); + let mut merged_index : Index = indexes.pop().unwrap().clone(); - for index in indexes { + for (i, index) in indexes.into_iter().enumerate() { callback((i * 100 / max) as u8); - i += 1; - merged_index = Index::merge_two(merged_index, index); + merged_index.merge_into(index); } callback(100); merged_index @@ -244,7 +249,7 @@ impl Index { let mut v : FileVector = FileVector::new(); for arg in search_args { - if let Some(value) = self.dictionary.get(arg.to_string()) { + if let Some(value) = self.dictionary.get(&arg) { v.insert(*value, 1); } } @@ -262,19 +267,20 @@ impl Index { results } - pub fn save(&self, output : String) { - let mut index_file = File::create(output).unwrap(); - - for file in self.filecache.iter() { - writeln!( - index_file, - "{}, {}", - file.path .replace(',', "\0"), - file.vector.stringify() - ).ok(); - } + pub fn save(&self, path: String) { + thread::scope(|s| { + let dict_list_handle = s.spawn(|| { + self.dictionary.to_list().join(",") + }); + let mut output : String = self.filecache.iter().map(|c| format!("{}, {}\n", c.path.replace(',', "\0"), c.vector.stringify())).collect(); + output += "#"; + output += dict_list_handle.join().unwrap().as_str(); + output += "\n"; - let dict_list : Vec<String> = self.dictionary.to_list(); - writeln!(index_file, "#{}", dict_list.join(",")).ok(); + let index_file = File::create(path).expect("could not open output file"); + let mut file = BufWriter::new(index_file); + file.write_all(output.as_bytes()).expect("could not write"); + file.flush().ok(); + }); } } |