use std::collections::{HashSet, HashMap}; use std::fs::File; use std::io::{Write, BufReader, BufRead}; use walkdir::*; use std::thread; use std::option::Option::None; use crate::vector::FileVector; use crate::dictionary::Dictionary; use crate::filecache::FileCache; use crate::searchresult::SearchResult; use crate::filecounter::filecount; use crate::text; use crate::splitter; use crate::vector; /// Represents a Index which is ether generated /// or read from a file. #[derive(Clone, Debug)] pub struct Index { dictionary : Dictionary, filecache : Vec, } impl Default for Index { fn default() -> Self { Self::empty() } } impl Index { pub fn empty() -> Self { Self { dictionary : Dictionary::new(), filecache : Vec::new() } } pub fn generate(input_path : &str, callback : impl Fn(u64, u64)) -> Self { let mut dict = Dictionary::new(); let mut filecache : Vec = Vec::new(); let mut nof = 0; let mut counter = 0; thread::scope(|s| { let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); for entry in WalkDir::new(input_path) .into_iter() .filter_map(|e| e.ok()) { counter += 1; if entry.path().is_file() { let content : String = text::extract_text(entry.path().to_str().unwrap()); if content.is_empty() { continue } let words : Vec = splitter::split_to_words(content); for word in words.iter() { dict.set(word.clone()); } let fv = dict.vectorize_word_list(words.clone()); filecache.push(FileCache { path : entry.path().to_str().unwrap().to_string(), vector : fv }); } match nof_handle { Some(t) => { nof = t.join().unwrap(); nof_handle = None; } None => { callback(counter, nof); } } } callback(nof, nof); }); Self { dictionary : dict, filecache } } pub fn from_file(path : String) -> Self { let index_file = File::open(path).expect("could not open index file"); let reader = BufReader::new(index_file); let mut filecache : Vec = Vec::new(); let mut dict = Dictionary::new(); for line in reader.lines() { let l = line.unwrap(); if l.starts_with('#') { dict = Dictionary::from_line(l.strip_prefix('#').unwrap()); } else { filecache.push(FileCache::from_line(l)); } } Self { dictionary : dict, filecache } } pub fn merge(a : Index, b : Index) -> Self { let mut a_hash : HashSet = HashSet::new(); let mut diff : Vec = Vec::new(); let mut dict = a.dictionary.clone(); let mut filecache = a.filecache.clone(); for file in a.filecache.iter() { a_hash.insert(file.clone()); } for file in b.filecache.iter() { if !a_hash.contains(file) { diff.push(file.clone()); } } for (word, _) in b.dictionary.iter() { dict.set(word.clone()); } let mut b_id_to_word : HashMap = HashMap::new(); for (value, id) in b.dictionary.iter() { b_id_to_word.insert(*id, value.clone()); } for file in diff { let mut words = Vec::new(); for (word_id, i) in file.vector.iter() { for _ in 0..*i { words.push(b_id_to_word.get(word_id).unwrap().clone()); } } filecache.push(FileCache { path : file.path.clone(), vector: dict.vectorize_word_list(words) }); } Self { dictionary: dict, filecache } } pub fn search(&self, search_args : Vec) -> Vec { let mut v : FileVector = FileVector::new(); for arg in search_args { if let Some(value) = self.dictionary.get(arg.to_string()) { v.insert(*value, 1); } } let mut results : Vec = Vec::new(); for filecache in self.filecache.iter() { let mut r = SearchResult { priority : 0, path : filecache.path.clone() }; r.priority = vector::scalar_product(&v, &filecache.vector); if r.priority > 0 { results.push(r); } } results.sort_by(|a, b| b.priority.cmp(&a.priority)); results } pub fn save(&self, output : String) { let mut index_file = File::create(output).unwrap(); for file in self.filecache.iter() { writeln!( index_file, "{}, {}", file.path .replace(',', "\0"), file.vector.stringify() ).ok(); } let dict_list : Vec = self.dictionary.to_list(); writeln!(index_file, "#{}", dict_list.join(",")).ok(); } }