diff options
Diffstat (limited to 'src/index.rs')
| -rw-r--r-- | src/index.rs | 237 |
1 files changed, 170 insertions, 67 deletions
diff --git a/src/index.rs b/src/index.rs index 27bb56b..78df3a1 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,6 +1,7 @@ use std::collections::{HashSet, HashMap}; use std::fs::File; use std::io::{Write, BufReader, BufRead}; +use std::sync::mpsc::{channel, Sender}; use walkdir::*; use std::thread; use std::option::Option::None; @@ -27,6 +28,14 @@ impl Default for Index { } } +#[derive(Clone, Debug, Default, Copy)] +pub enum GenState { + #[default] + Fetching, + Parsing, + Merging +} + impl Index { pub fn empty() -> Self { Self { @@ -35,58 +44,99 @@ impl Index { } } - pub fn generate(input_path : &str, callback : impl Fn(u64, u64)) -> Self { - let mut dict = Dictionary::new(); - let mut filecache : Vec<FileCache> = Vec::new(); + pub fn generate(input_path : &str, callback : impl Fn(GenState, u8)) -> Self { let mut nof = 0; let mut counter = 0; + let mut crawler_handles = Vec::new(); + let num_threads = thread::available_parallelism().unwrap().get(); + let mut tx_vec : Vec<Sender<String>> = Vec::new(); thread::scope(|s| { let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); - for entry in WalkDir::new(input_path) - .into_iter() - .filter_map(|e| e.ok()) { - counter += 1; - if entry.path().is_file() { - let content : String = text::extract_text(entry.path().to_str().unwrap()); + for _ in 0..num_threads { + let (tx, rx) = channel(); + tx_vec.push(tx); + crawler_handles.push(thread::spawn(move || { + let mut dict = Dictionary::new(); + let mut filecache : Vec<FileCache> = Vec::new(); - if content.is_empty() { - continue - } + loop { + let path = rx.recv().unwrap(); + if path.is_empty() { + return Self { + dictionary : dict, + filecache + } + } - let words : Vec<String> = splitter::split_to_words(content); + let content : String = text::extract_text(path.as_str()); - for word in words.iter() { - dict.set(word.clone()); - } + if content.is_empty() { + continue; + } - let fv = dict.vectorize_word_list(words.clone()); - filecache.push(FileCache { - path : entry.path().to_str().unwrap().to_string(), - vector : fv - }); + let words : Vec<String> = splitter::split_to_words(content); + for word in words.iter() { + dict.set(word.clone()); + } - } - match nof_handle { - Some(t) => { - nof = t.join().unwrap(); - nof_handle = None; + let fv = dict.vectorize_word_list(words.clone()); + filecache.push(FileCache { + path, + vector : fv + }); } - None => { - callback(counter, nof); + })); + } + + let mut next_crawler = 0; + let mut last_p = u64::MAX; + + for entry in WalkDir::new(input_path) + .into_iter() + .filter_map(|e| e.ok()) { + counter += 1; + if entry.path().is_file() { + tx_vec[next_crawler].send(entry.path().to_str().unwrap().to_string()).ok(); + next_crawler += 1; + if next_crawler == num_threads { + next_crawler = 0; + } + + match nof_handle { + Some(t) => { + nof = t.join().unwrap(); + nof_handle = None; + } + None => { + // Make sure that we only push a update + // if there is a visual change to the number + // because updating the screen takes a lot + // of time. + let p = counter * 100 / nof; + if p != last_p { + callback(GenState::Fetching, p as u8); + last_p = p; + } + } } } } - - callback(nof, nof); }); - Self { - dictionary : dict, - filecache + let mut indexes = Vec::new(); + let mut i = 0; + + for handle in crawler_handles { + callback(GenState::Parsing, (i * 100 / num_threads) as u8); + tx_vec[i].send(String::new()).ok(); + indexes.push(handle.join().unwrap()); + i += 1; } + + Index::merge(indexes, |p| { callback(GenState::Merging, p) }) } pub fn from_file(path : String) -> Self { @@ -95,7 +145,6 @@ impl Index { let mut filecache : Vec<FileCache> = Vec::new(); let mut dict = Dictionary::new(); - for line in reader.lines() { let l = line.unwrap(); if l.starts_with('#') { @@ -111,51 +160,105 @@ impl Index { } } - pub fn merge(a : Index, b : Index) -> Self { - let mut a_hash : HashSet<FileCache> = HashSet::new(); - let mut diff : Vec<FileCache> = Vec::new(); - let mut dict = a.dictionary.clone(); - let mut filecache = a.filecache.clone(); + fn merge_two(first : Index, second : Index) -> thread::JoinHandle<Self> { + thread::spawn(move || { + let (a, b) = if first.filecache.len() < second.filecache.len() { + (second, first) + } else { + (first, second) + }; + let mut filecache = a.filecache.clone(); + let mut dictionary = Dictionary::default(); - for file in a.filecache.iter() { - a_hash.insert(file.clone()); - } + thread::scope(|s| { + let mut a_hash : HashSet<FileCache> = HashSet::new(); + let mut diff : Vec<FileCache> = Vec::new(); - for file in b.filecache.iter() { - if !a_hash.contains(file) { - diff.push(file.clone()); - } - } + let converter_handle = s.spawn(|| { + let mut b_id_to_word : HashMap<u64, String> = HashMap::new(); - for (word, _) in b.dictionary.iter() { - dict.set(word.clone()); - } + for (value, id) in b.dictionary.iter() { + b_id_to_word.insert(id.clone(), value.clone()); + } + b_id_to_word + }); - let mut b_id_to_word : HashMap<u64, String> = HashMap::new(); + let dict_handle = s.spawn(|| { + let mut dict = a.dictionary.clone(); + for (word, _) in b.dictionary.iter() { + dict.set(word.clone()); + } + dict + }); - for (value, id) in b.dictionary.iter() { - b_id_to_word.insert(*id, value.clone()); - } + for file in a.filecache.iter() { + a_hash.insert(file.clone()); + } + + for file in b.filecache.iter() { + if !a_hash.contains(file) { + diff.push(file.clone()); + } + } + + let b_id_to_word = converter_handle.join().unwrap(); + dictionary = dict_handle.join().unwrap(); + + for file in diff { + let mut words = Vec::new(); + + for (word_id, i) in file.vector.iter() { + for _ in 0..*i { + words.push(b_id_to_word.get(word_id).unwrap().clone()); + } + } - for file in diff { - let mut words = Vec::new(); + filecache.push(FileCache { + path : file.path.clone(), + vector: dictionary.vectorize_word_list(words) + }); + } + }); + + Self { + dictionary, + filecache + } + }) + } + + pub fn merge(indexes : Vec<Index>, callback : impl Fn(u8)) -> Self { + let mut idxs : Vec<Index> = indexes.clone(); + let max = (idxs.len() as f32).log2().ceil() as u32; + let mut i = 0 as u32; - for (word_id, i) in file.vector.iter() { - for _ in 0..*i { - words.push(b_id_to_word.get(word_id).unwrap().clone()); + while idxs.len() > 1 { + callback((i * 100 / max) as u8); + i += 1; + let mut idxs_handle = Vec::new(); + let mut processed = Vec::new(); + + for chunk in idxs.chunks(2) { + if chunk.len() == 2 { + let a = chunk[0].clone(); + let b = chunk[1].clone(); + idxs_handle.push(Index::merge_two(a, b)); + } else { + for idx in chunk.iter() { + processed.push(idx.clone()) + } } } - filecache.push(FileCache { - path : file.path.clone(), - vector: dict.vectorize_word_list(words) - }); - } + for idx_handle in idxs_handle { + let idx : Index = idx_handle.join().unwrap(); + processed.push(idx) + } - Self { - dictionary: dict, - filecache + idxs = processed; } + + idxs.get(0).unwrap().clone() } pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> { |