use std::collections::{HashSet, HashMap}; use std::fs::File; use std::io::{Write, BufReader, BufRead}; use std::sync::mpsc::{channel, Sender}; use walkdir::*; use std::thread; use std::option::Option::None; use crate::vector::FileVector; use crate::dictionary::Dictionary; use crate::filecache::FileCache; use crate::searchresult::SearchResult; use crate::filecounter::filecount; use crate::text; use crate::splitter; use crate::vector; /// Represents a Index which is ether generated /// or read from a file. #[derive(Clone, Debug)] pub struct Index { dictionary : Dictionary, filecache : Vec, } impl Default for Index { fn default() -> Self { Self::empty() } } #[derive(Clone, Debug, Default, Copy)] pub enum GenState { #[default] Fetching, Parsing, Merging } impl Index { pub fn empty() -> Self { Self { dictionary : Dictionary::new(), filecache : Vec::new() } } pub fn generate(input_path : &str, callback : impl Fn(GenState, u8)) -> Self { let mut nof = 0; let mut counter = 0; let mut crawler_handles = Vec::new(); let num_threads = thread::available_parallelism().unwrap().get(); let mut tx_vec : Vec> = Vec::new(); thread::scope(|s| { let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); for _ in 0..num_threads { let (tx, rx) = channel(); tx_vec.push(tx); crawler_handles.push(thread::spawn(move || { let mut dict = Dictionary::new(); let mut filecache : Vec = Vec::new(); loop { let path = rx.recv().unwrap(); if path.is_empty() { return Self { dictionary : dict, filecache } } let content : String = text::extract_text(path.as_str()); if content.is_empty() { continue; } let words : Vec = splitter::split_to_words(content); for word in words.iter() { dict.set(word.clone()); } let fv = dict.vectorize_word_list(words.clone()); filecache.push(FileCache { path, vector : fv }); } })); } let mut next_crawler = 0; let mut last_p = u64::MAX; for entry in WalkDir::new(input_path) .into_iter() .filter_map(|e| e.ok()) { counter += 1; if entry.path().is_file() { tx_vec[next_crawler].send(entry.path().to_str().unwrap().to_string()).ok(); next_crawler += 1; if next_crawler == num_threads { next_crawler = 0; } match nof_handle { Some(t) => { if t.is_finished() { nof = t.join().unwrap(); nof_handle = None; } else { nof_handle = Some(t); } } None => { // Make sure that we only push a update // if there is a visual change to the number // because updating the screen takes a lot // of time. let p = counter * 100 / nof; if p != last_p { callback(GenState::Fetching, p as u8); last_p = p; } } } } } }); let mut indexes = Vec::new(); let mut i = 0; for handle in crawler_handles { callback(GenState::Parsing, (i * 100 / num_threads) as u8); tx_vec[i].send(String::new()).ok(); indexes.push(handle.join().unwrap()); i += 1; } Index::merge(indexes, |p| { callback(GenState::Merging, p) }) } pub fn from_file(path : String) -> Self { let index_file = File::open(path).expect("could not open index file"); let reader = BufReader::new(index_file); let mut filecache : Vec = Vec::new(); let mut dict = Dictionary::new(); for line in reader.lines() { let l = line.unwrap(); if l.starts_with('#') { dict = Dictionary::from_line(l.strip_prefix('#').unwrap()); } else { filecache.push(FileCache::from_line(l)); } } Self { dictionary : dict, filecache } } fn merge_two(a : Index, b : Index) -> Self { let mut filecache = a.filecache.clone(); let mut dictionary = Dictionary::default(); thread::scope(|s| { let mut a_hash : HashSet = HashSet::new(); let mut diff : Vec = Vec::new(); let converter_handle = s.spawn(|| { let mut b_id_to_word : HashMap = HashMap::new(); for (value, id) in b.dictionary.iter() { b_id_to_word.insert(id.clone(), value.clone()); } b_id_to_word }); let dict_handle = s.spawn(|| { let mut dict = a.dictionary.clone(); for (word, _) in b.dictionary.iter() { dict.set(word.clone()); } dict }); for file in a.filecache.iter() { a_hash.insert(file.clone()); } for file in b.filecache.iter() { if !a_hash.contains(file) { diff.push(file.clone()); } } let b_id_to_word = converter_handle.join().unwrap(); dictionary = dict_handle.join().unwrap(); for file in diff { let mut words = Vec::new(); for (word_id, i) in file.vector.iter() { for _ in 0..*i { words.push(b_id_to_word.get(word_id).unwrap().clone()); } } filecache.push(FileCache { path : file.path.clone(), vector: dictionary.vectorize_word_list(words) }); } }); Self { dictionary, filecache } } pub fn merge(mut indexes : Vec, callback : impl Fn(u8)) -> Self { let max = indexes.len(); let mut i = 0 as usize; indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len())); let mut merged_index = indexes.pop().unwrap(); for index in indexes { callback((i * 100 / max) as u8); i += 1; merged_index = Index::merge_two(merged_index, index); } callback(100); merged_index } pub fn search(&self, search_args : Vec) -> Vec { let mut v : FileVector = FileVector::new(); for arg in search_args { if let Some(value) = self.dictionary.get(arg.to_string()) { v.insert(*value, 1); } } let mut results : Vec = Vec::new(); for filecache in self.filecache.iter() { let mut r = SearchResult { priority : 0, path : filecache.path.clone() }; r.priority = vector::scalar_product(&v, &filecache.vector); if r.priority > 0 { results.push(r); } } results.sort_by(|a, b| b.priority.cmp(&a.priority)); results } pub fn save(&self, output : String) { let mut index_file = File::create(output).unwrap(); for file in self.filecache.iter() { writeln!( index_file, "{}, {}", file.path .replace(',', "\0"), file.vector.stringify() ).ok(); } let dict_list : Vec = self.dictionary.to_list(); writeln!(index_file, "#{}", dict_list.join(",")).ok(); } }