use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; use std::fs::File; use std::io::{BufWriter, Write}; use std::sync::mpsc::{channel, Sender}; use std::time::Duration; use walkdir::*; use std::thread; use std::option::Option::None; use crate::vector::FileVector; use crate::filecache::FileCache; use crate::searchresult::SearchResult; use crate::filecounter::filecount; use crate::text; use crate::splitter; use crate::vector; /// Represents a Index which is ether generated /// or read from a file. #[derive(Clone, Debug)] pub struct Index { filecache : Vec, } impl Default for Index { fn default() -> Self { Self::empty() } } #[derive(Clone, Debug, Default, Copy)] pub enum GenState { #[default] Fetching, Parsing, Merging } impl Index { pub fn empty() -> Self { Self { filecache : Vec::new() } } pub fn generate(input_path : &str, callback : impl Fn(GenState, u8)) -> Self { let mut nof = 1; let mut counter = 0; let mut crawler_handles = Vec::new(); let num_threads = thread::available_parallelism().unwrap().get(); let mut tx_vec : Vec> = Vec::new(); let mut indexes = Vec::new(); thread::scope(|s| { let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); let (status_tx, status_rx) = channel(); for _ in 0..num_threads { let (tx, rx) = channel(); tx_vec.push(tx); let status_tx = status_tx.clone(); crawler_handles.push(thread::spawn(move || { let mut filecache : Vec = Vec::new(); loop { let path = rx.recv().unwrap(); if path.is_empty() { return Self { filecache } } let content : String = text::extract_text(path.as_str()); let _ = status_tx.send(()); if content.is_empty() { continue; } let words : Vec = splitter::split_to_words(content); let fv = FileVector::from_words(words); filecache.push(FileCache { path, vector : fv }); } })); } let mut next_crawler = 0; let mut last_p = 0; for entry in WalkDir::new(input_path) .into_iter() .filter_map(|e| e.ok()) { counter += 1; if entry.path().is_file() { tx_vec[next_crawler].send(entry.path().to_str().unwrap().to_string()).ok(); next_crawler += 1; if next_crawler == num_threads { next_crawler = 0; } match nof_handle { Some(t) => { if t.is_finished() { nof = t.join().unwrap(); nof_handle = None; } else { nof_handle = Some(t); } } None => { // Make sure that we only push a update // if there is a visual change to the number // because updating the screen takes a lot // of time. let p = counter * 100 / nof; if p != last_p { callback(GenState::Fetching, p as u8); last_p = p; } } } } } let join_handle = s.spawn(|| { for (i, handle) in crawler_handles.into_iter().enumerate() { tx_vec[i].send(String::new()).ok(); indexes.push(handle.join().unwrap()); } }); let mut i = 0; let mut last_p = 0; while !join_handle.is_finished() { if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() { i += 1; let p = i * 100 / nof; if p != last_p { callback(GenState::Parsing, p as u8); last_p = p; } } } callback(GenState::Parsing, 100); join_handle.join().ok(); }); Index::merge(indexes, |p| { callback(GenState::Merging, p) }) } pub fn from_file(path : &String) -> Self { let bytes = std::fs::read(path).expect("could not read index file"); let filecache : Vec = bincode::deserialize(&bytes).unwrap(); Self { filecache } } pub fn merge(indexes : Vec, callback : impl Fn(u8)) -> Self { let max = indexes.len(); let mut filecache = Vec::new(); for (i, index) in indexes.into_iter().enumerate() { callback((i * 100 / max) as u8); filecache.extend(index.filecache); } callback(100); Self { filecache } } pub fn search(&self, search_args : Vec) -> Vec { let mut v : FileVector = FileVector::new(); let mut opt : FileVector = FileVector::new(); for arg in search_args { let mut hasher = DefaultHasher::new(); let a = arg.trim_start_matches('+'); a.hash(&mut hasher); let value = hasher.finish(); if arg.starts_with('+') { opt.insert(value, 1); } else { v.insert(value, 1); } } let mut results : Vec = Vec::new(); for filecache in self.filecache.iter() { let mut r = SearchResult { priority : 0, path : filecache.path.clone() }; r.priority = vector::match_vector(&v, &filecache.vector); if r.priority > 0 { r.priority += vector::scalar_product(&opt, &filecache.vector); results.push(r); } } results.sort_by(|a, b| b.priority.cmp(&a.priority)); results } pub fn save(&self, path: String) { let index_file = File::create(path).expect("could not open output file"); let mut file = BufWriter::new(index_file); file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok(); file.flush().ok(); } pub fn num_files(&self) -> usize { self.filecache.len() } }