use std::collections::HashMap; use std::hash::Hash; use std::fs::File; use std::io::{BufWriter, BufReader}; use std::thread; use walkdir::*; use hash32::Hasher; use crate::vector::{FileVector, Indexer, Count}; use crate::filecache::FileCache; use crate::searchresult::SearchResult; use crate::text; use crate::splitter; use crate::vector; /// Represents a Index which is ether generated /// or read from a file. #[derive(Clone, Debug)] pub struct Index { pub filecache : Vec, } impl Default for Index { fn default() -> Self { Self::empty() } } #[derive(Clone, Debug, Default, Copy)] pub enum GenState { #[default] Fetching, Parsing, Merging } impl Index { pub fn empty() -> Self { Self { filecache : Vec::new() } } pub fn generate(input_path : &str, callback : impl Fn(GenState, u8)) -> Self { let mut nof : usize = 0; let num_threads = thread::available_parallelism().unwrap().get(); let mut paths = Vec::new(); let (result_tx, result_rx) = std::sync::mpsc::channel(); callback(GenState::Fetching, 0); for entry in WalkDir::new(input_path) .into_iter() .filter_map(|e| e.ok()) { if entry.path().is_file() { nof += 1; paths.push(entry.path().to_str().unwrap().to_string()); } } callback(GenState::Fetching, 100); let chunks = paths.chunks(paths.len() / num_threads); let mut filecache = Vec::with_capacity(nof); thread::scope(|s| { for chunk in chunks { let result_tx = result_tx.clone(); s.spawn(move || { for path in chunk { let content : String = text::extract_text(path); if content.is_empty() { result_tx.send(FileCache { path: "[is_empty]".to_string(), vector : FileVector::default() }).ok(); continue; } let words : Vec = splitter::split_to_words(content); let fv = FileVector::from_words(words); result_tx.send(FileCache { path: path.to_string(), vector : fv }).ok(); } }); } let mut i : usize = 0; let mut last_p = 0; while i != nof { if let Ok(result) = result_rx.recv() { filecache.push(result); i += 1; let p = i * 100 / nof; if p != last_p { callback(GenState::Parsing, p as u8); last_p = p; } } } callback(GenState::Parsing, 100); }); Self { filecache } } pub fn from_file(path : &String) -> Self { let file = File::open(path).expect("could not read index file"); let reader = BufReader::new(file); let filecache : Vec = bincode::deserialize_from(reader).unwrap(); Self { filecache } } pub fn merge(indexes : Vec, callback : impl Fn(u8)) -> Self { let max = indexes.len(); let mut filecache = Vec::new(); for (i, index) in indexes.into_iter().enumerate() { callback((i * 100 / max) as u8); filecache.extend(index.filecache); } callback(100); Self { filecache } } pub fn search(&self, search_args : Vec, callback : impl Fn(u8)) -> Vec { let mut v : HashMap = HashMap::new(); let mut opt : HashMap = HashMap::new(); for arg in search_args { let mut hasher = hash32::FnvHasher::default(); let a = arg.trim_start_matches('+'); a.hash(&mut hasher); let value = hasher.finish32(); if arg.starts_with('+') { opt.insert(value as Indexer, 1); } else { v.insert(value as Indexer, 1); } } let v = FileVector::from_hashmap(v); let opt = FileVector::from_hashmap(opt); let mut results : Vec = Vec::new(); let max = self.filecache.len(); let mut last_p = 0; for (i, filecache) in self.filecache.iter().enumerate() { let mut r = SearchResult { priority : 0, path : filecache.path.clone() }; r.priority = vector::match_vector(&v, &filecache.vector); if r.priority > 0 { r.priority += vector::scalar_product(&opt, &filecache.vector); results.push(r); } let p = i * 100 / max; if last_p < p { callback(p as u8); last_p = p; } } results.sort_by(|a, b| b.priority.cmp(&a.priority)); results } pub fn save(&self, path: String) { let index_file = File::create(path).expect("could not open output file"); let file = BufWriter::new(index_file); bincode::serialize_into(file, &self.filecache).ok(); } pub fn num_files(&self) -> usize { self.filecache.len() } pub fn import(&mut self, index : Index) { self.filecache = index.filecache; } }