diff options
Diffstat (limited to 'src/index.rs')
| -rw-r--r-- | src/index.rs | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 0000000..7ca31f1 --- /dev/null +++ b/src/index.rs @@ -0,0 +1,133 @@ +use std::fs::File; +use std::io::{Write, BufReader, BufRead}; +use walkdir::*; +use std::thread; +use std::option::Option::None; +use crate::vector::FileVector; +use crate::dictionary::Dictionary; +use crate::filecache::FileCache; +use crate::searchresult::SearchResult; +use crate::filecounter::filecount; +use crate::text; +use crate::splitter; +use crate::vector; + +pub struct Index { + dictionary : Dictionary, + filecache : Vec<FileCache>, +} + +impl Index { + pub fn generate(input_path : &str, index_path : &str, callback : impl Fn(u64, u64)) -> Self { + let mut index_file = File::create(index_path).unwrap(); + let mut dict = Dictionary::new(); + let mut filecache : Vec<FileCache> = Vec::new(); + let mut nof = 0; + let mut counter = 0; + + thread::scope(|s| { + let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); + + for entry in WalkDir::new(input_path) + .into_iter() + .filter_map(|e| e.ok()) { + counter += 1; + if entry.path().is_file() { + let content : String = text::extract_text(entry.path().to_str().unwrap()); + + if content.is_empty() { + continue + } + + let words : Vec<String> = splitter::split_to_words(content); + + for word in words.iter() { + dict.set(word.clone()); + } + + let fv = dict.vectorize_word_list(words.clone()); + writeln!( + index_file, + "{}, {}", + entry.path() + .to_str() + .unwrap() + .replace(",", "\0"), + fv.to_hex() + ).ok(); + + filecache.push(FileCache { + path : entry.path().to_str().unwrap().to_string(), + vector : fv + }); + + + match nof_handle { + Some(t) => { + nof = t.join().unwrap(); + nof_handle = None; + } + None => { + callback(counter, nof); + } + } + } + } + + callback(nof, nof); + + let dict_list : Vec<String> = dict.to_list(); + writeln!(index_file, "#{}", dict_list.join(",")).ok(); + }); + + Self { + dictionary : dict, + filecache + } + } + + pub fn from_file(path : &str) -> Self { + let index_file = File::open(path).expect("could not open index file"); + let reader = BufReader::new(index_file); + let mut filecache : Vec<FileCache> = Vec::new(); + let mut dict = Dictionary::new(); + + + for line in reader.lines() { + let l = line.unwrap(); + if l.starts_with("#") { + dict = Dictionary::from_line(&l.strip_prefix("#").unwrap()); + } else { + filecache.push(FileCache::from_line(l)); + } + } + + Self { + dictionary : dict, + filecache + } + } + + pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> { + let mut v : FileVector = FileVector::new(); + + for arg in search_args { + match self.dictionary.get(arg.to_string()) { + Some(value) => { v.insert(*value, 1); } + None => {} + } + } + + let mut results : Vec<SearchResult> = Vec::new(); + + for filecache in self.filecache.iter() { + let mut r = SearchResult { priority : 0, path : filecache.path.clone() }; + r.priority = vector::scalar_product(&v, &filecache.vector); + if r.priority > 0 { + results.push(r); + } + } + results.sort_by(|a, b| b.priority.cmp(&a.priority)); + results + } +} |