diff options
Diffstat (limited to 'src/index.rs')
| -rw-r--r-- | src/index.rs | 153 |
1 files changed, 52 insertions, 101 deletions
diff --git a/src/index.rs b/src/index.rs index 8fb34fe..cc86f0c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,16 +1,13 @@ -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; +use std::collections::HashMap; +use std::hash::Hash; use std::fs::File; -use std::io::{BufWriter, Write}; -use std::sync::mpsc::{channel, Sender}; -use std::time::Duration; -use walkdir::*; +use std::io::BufWriter; use std::thread; -use std::option::Option::None; -use crate::vector::FileVector; +use walkdir::*; +use hash32::Hasher; +use crate::vector::{FileVector, Indexer, Count}; use crate::filecache::FileCache; use crate::searchresult::SearchResult; -use crate::filecounter::filecount; use crate::text; use crate::splitter; use crate::vector; @@ -44,100 +41,55 @@ impl Index { } pub fn generate(input_path : &str, callback : impl Fn(GenState, u8)) -> Self { - let mut nof = 1; - let mut counter = 0; - let mut crawler_handles = Vec::new(); + let mut nof : usize = 0; let num_threads = thread::available_parallelism().unwrap().get(); - let mut tx_vec : Vec<Sender<String>> = Vec::new(); - let mut indexes = Vec::new(); - - thread::scope(|s| { - let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); - let (status_tx, status_rx) = channel(); + let mut paths = Vec::new(); + let (result_tx, result_rx) = std::sync::mpsc::channel(); - for _ in 0..num_threads { - let (tx, rx) = channel(); - tx_vec.push(tx); - let status_tx = status_tx.clone(); - crawler_handles.push(thread::spawn(move || { - let mut filecache : Vec<FileCache> = Vec::new(); - - loop { - let path = rx.recv().unwrap(); - if path.is_empty() { - return Self { - filecache - } - } + callback(GenState::Fetching, 0); + for entry in WalkDir::new(input_path) + .into_iter() + .filter_map(|e| e.ok()) { + if entry.path().is_file() { + nof += 1; + paths.push(entry.path().to_str().unwrap().to_string()); + } + } + callback(GenState::Fetching, 100); - let content : String = text::extract_text(path.as_str()); + let chunks = paths.chunks(paths.len() / num_threads); + let mut filecache = Vec::with_capacity(nof); - let _ = status_tx.send(()); + thread::scope(|s| { + for chunk in chunks { + let result_tx = result_tx.clone(); + s.spawn(move || { + for path in chunk { + let content : String = text::extract_text(path); if content.is_empty() { + result_tx.send(FileCache { + path: "".to_string(), + vector : FileVector::default() + }).ok(); continue; } let words : Vec<String> = splitter::split_to_words(content); let fv = FileVector::from_words(words); - filecache.push(FileCache { - path, + result_tx.send(FileCache { + path: "".to_string(), vector : fv - }); - } - })); - } - - let mut next_crawler = 0; - let mut last_p = 0; - - for entry in WalkDir::new(input_path) - .into_iter() - .filter_map(|e| e.ok()) { - counter += 1; - if entry.path().is_file() { - tx_vec[next_crawler].send(entry.path().to_str().unwrap().to_string()).ok(); - next_crawler += 1; - if next_crawler == num_threads { - next_crawler = 0; - } - - match nof_handle { - Some(t) => { - if t.is_finished() { - nof = t.join().unwrap(); - nof_handle = None; - } else { - nof_handle = Some(t); - } - } - None => { - // Make sure that we only push a update - // if there is a visual change to the number - // because updating the screen takes a lot - // of time. - let p = counter * 100 / nof; - if p != last_p { - callback(GenState::Fetching, p as u8); - last_p = p; - } - } + }).ok(); } - } + }); } - let join_handle = s.spawn(|| { - for (i, handle) in crawler_handles.into_iter().enumerate() { - tx_vec[i].send(String::new()).ok(); - indexes.push(handle.join().unwrap()); - } - }); - - - let mut i = 0; + let mut i : usize = 0; let mut last_p = 0; - while !join_handle.is_finished() { - if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() { + while i != nof { + if let Ok(result) = result_rx.recv() { + filecache.push(result); i += 1; let p = i * 100 / nof; if p != last_p { @@ -148,11 +100,9 @@ impl Index { } callback(GenState::Parsing, 100); - - join_handle.join().ok(); }); - Index::merge(indexes, |p| { callback(GenState::Merging, p) }) + Self { filecache } } pub fn from_file(path : &String) -> Self { @@ -168,32 +118,35 @@ impl Index { let max = indexes.len(); let mut filecache = Vec::new(); - for (i, index) in indexes.into_iter().enumerate() { callback((i * 100 / max) as u8); filecache.extend(index.filecache); } callback(100); + Self { filecache } } pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> { - let mut v : FileVector = FileVector::new(); - let mut opt : FileVector = FileVector::new(); + let mut v : HashMap<Indexer, Count> = HashMap::new(); + let mut opt : HashMap<Indexer, Count> = HashMap::new(); for arg in search_args { - let mut hasher = DefaultHasher::new(); + let mut hasher = hash32::FnvHasher::default(); let a = arg.trim_start_matches('+'); a.hash(&mut hasher); - let value = hasher.finish(); + let value = hasher.finish32(); if arg.starts_with('+') { - opt.insert(value, 1); + opt.insert(value as Indexer, 1); } else { - v.insert(value, 1); + v.insert(value as Indexer, 1); } } + let v = FileVector::from_hashmap(v); + let opt = FileVector::from_hashmap(opt); + let mut results : Vec<SearchResult> = Vec::new(); for filecache in self.filecache.iter() { @@ -210,10 +163,8 @@ impl Index { pub fn save(&self, path: String) { let index_file = File::create(path).expect("could not open output file"); - let mut file = BufWriter::new(index_file); - - file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok(); - file.flush().ok(); + let file = BufWriter::new(index_file); + bincode::serialize_into(file, &self.filecache).ok(); } pub fn num_files(&self) -> usize { |