From 8ff994ab8419a243b051a07feabefdad3a4e1e3b Mon Sep 17 00:00:00 2001 From: Nathan Reiner Date: Thu, 6 Jul 2023 12:48:30 +0200 Subject: add cli --- src/dictionary.rs | 6 +-- src/index.rs | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 127 +++++++++++++-------------------------------------- 3 files changed, 168 insertions(+), 98 deletions(-) create mode 100644 src/index.rs (limited to 'src') diff --git a/src/dictionary.rs b/src/dictionary.rs index a8d9b28..b522c9e 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -31,8 +31,8 @@ impl Dictionary { } } - pub fn get(&self, name : String) -> u64 { - *self.data.get(&name).unwrap() + pub fn get(&self, name : String) -> Option<&u64> { + self.data.get(&name) } pub fn iter(&self) -> &HashMap { @@ -55,7 +55,7 @@ impl Dictionary { let mut fv = FileVector::new(); for word in words { - let i = self.get(word); + let i = *self.get(word).unwrap(); if !fv.contains_key(&i) { fv.insert(i, 1); } else { diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 0000000..7ca31f1 --- /dev/null +++ b/src/index.rs @@ -0,0 +1,133 @@ +use std::fs::File; +use std::io::{Write, BufReader, BufRead}; +use walkdir::*; +use std::thread; +use std::option::Option::None; +use crate::vector::FileVector; +use crate::dictionary::Dictionary; +use crate::filecache::FileCache; +use crate::searchresult::SearchResult; +use crate::filecounter::filecount; +use crate::text; +use crate::splitter; +use crate::vector; + +pub struct Index { + dictionary : Dictionary, + filecache : Vec, +} + +impl Index { + pub fn generate(input_path : &str, index_path : &str, callback : impl Fn(u64, u64)) -> Self { + let mut index_file = File::create(index_path).unwrap(); + let mut dict = Dictionary::new(); + let mut filecache : Vec = Vec::new(); + let mut nof = 0; + let mut counter = 0; + + thread::scope(|s| { + let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); + + for entry in WalkDir::new(input_path) + .into_iter() + .filter_map(|e| e.ok()) { + counter += 1; + if entry.path().is_file() { + let content : String = text::extract_text(entry.path().to_str().unwrap()); + + if content.is_empty() { + continue + } + + let words : Vec = splitter::split_to_words(content); + + for word in words.iter() { + dict.set(word.clone()); + } + + let fv = dict.vectorize_word_list(words.clone()); + writeln!( + index_file, + "{}, {}", + entry.path() + .to_str() + .unwrap() + .replace(",", "\0"), + fv.to_hex() + ).ok(); + + filecache.push(FileCache { + path : entry.path().to_str().unwrap().to_string(), + vector : fv + }); + + + match nof_handle { + Some(t) => { + nof = t.join().unwrap(); + nof_handle = None; + } + None => { + callback(counter, nof); + } + } + } + } + + callback(nof, nof); + + let dict_list : Vec = dict.to_list(); + writeln!(index_file, "#{}", dict_list.join(",")).ok(); + }); + + Self { + dictionary : dict, + filecache + } + } + + pub fn from_file(path : &str) -> Self { + let index_file = File::open(path).expect("could not open index file"); + let reader = BufReader::new(index_file); + let mut filecache : Vec = Vec::new(); + let mut dict = Dictionary::new(); + + + for line in reader.lines() { + let l = line.unwrap(); + if l.starts_with("#") { + dict = Dictionary::from_line(&l.strip_prefix("#").unwrap()); + } else { + filecache.push(FileCache::from_line(l)); + } + } + + Self { + dictionary : dict, + filecache + } + } + + pub fn search(&self, search_args : Vec) -> Vec { + let mut v : FileVector = FileVector::new(); + + for arg in search_args { + match self.dictionary.get(arg.to_string()) { + Some(value) => { v.insert(*value, 1); } + None => {} + } + } + + let mut results : Vec = Vec::new(); + + for filecache in self.filecache.iter() { + let mut r = SearchResult { priority : 0, path : filecache.path.clone() }; + r.priority = vector::scalar_product(&v, &filecache.vector); + if r.priority > 0 { + results.push(r); + } + } + results.sort_by(|a, b| b.priority.cmp(&a.priority)); + results + } +} diff --git a/src/main.rs b/src/main.rs index 2e49fd4..261008b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,110 +6,47 @@ pub mod filecache; pub mod searchresult; pub mod filecounter; pub mod extractors; +pub mod index; -use vector::FileVector; -use dictionary::Dictionary; -use filecache::FileCache; -use searchresult::SearchResult; -use filecounter::filecount; -use std::fs::File; -use std::io::{Write, BufReader, BufRead}; -use walkdir::*; -use std::thread; -use std::option::Option::None; +use index::Index; +use std::io::*; +use std::env; -fn generate_index(input_path : &str, index_path : &str) { - let mut index_file = File::create(index_path).unwrap(); - let mut dict = Dictionary::new(); - let mut nof = 0; - let mut counter = 0; - - thread::scope(|s| { - let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path))); - - for entry in WalkDir::new(input_path).into_iter().filter_map(|e| e.ok()) { - counter += 1; - if entry.path().is_file() { - let content : String = text::extract_text(entry.path().to_str().unwrap()); - - if content.is_empty() { - continue - } - - let words : Vec = splitter::split_to_words(content); - - for word in words.iter() { - dict.set(word.clone()); - } - - let fv = dict.vectorize_word_list(words.clone()); - writeln!(index_file, "{}, {}", entry.path().to_str().unwrap().replace(",", "\0"), fv.to_hex()).ok(); +fn main() { + let args: Vec<_> = env::args().collect(); + if args.len() > 1 { + let cmd = args.get(1).unwrap(); - match nof_handle { - Some(t) => { - nof = t.join().unwrap(); - nof_handle = None; - } - None => { - eprint!("\r\x1b[2K{} of {} files indexed ({}%)", counter, nof, (counter * 100) / nof); - std::io::stdout().flush().ok(); - } - } + if cmd == "-g" { + if args.len() != 4 { + eprintln!("{} -g ", args.get(0).unwrap()); + return; } - } - - eprintln!("\r\x1b[2Kall files indexed (100%)"); - - let dict_list : Vec = dict.to_list(); - writeln!(index_file, "#{}", dict_list.join(",")).ok(); - }); -} - -fn search(index_path : &str, search_args : Vec<&str>) { - let index_file = File::open(index_path).expect("could not open index file"); - let reader = BufReader::new(index_file); - let mut filecaches : Vec = Vec::new(); - let mut dict = Dictionary::new(); - - for line in reader.lines() { - let l = line.unwrap(); - if l.starts_with("#") { - dict = Dictionary::from_line(&l.strip_prefix("#").unwrap()); - } else { - filecaches.push(FileCache::from_line(l)); - } - } - - let mut v : FileVector = FileVector::new(); - - for arg in search_args { - v.insert(dict.get(arg.to_string()), 1); - } + let input = args.get(2).unwrap(); + let file = args.get(3).unwrap(); + let _ = Index::generate(input, file, |counter, nof| { + eprint!("\r\x1b[2K{} of {} files indexed ({}%)", counter, nof, (counter * 100) / nof); + std::io::stdout().flush().ok(); + }); + } else if cmd == "-s" { + if args.len() < 4 { + eprintln!("{} -s ...", args.get(0).unwrap()); + return; + } - let mut results : Vec = Vec::new(); + let file = args.get(2).unwrap().clone(); + let v = args.get(3..(args.len())).unwrap(); + let search = v.join(" "); + let searchvec = splitter::split_to_words(search); - for filecache in filecaches.iter() { - let mut r = SearchResult { priority : 0, path : filecache.path.clone() }; - r.priority = vector::scalar_product(&v, &filecache.vector); - if r.priority > 0 { - results.push(r); + let idx = Index::from_file(file.as_str()); + let results = idx.search(searchvec); + for result in results { + println!("{}", result.path); + } } - } - results.sort_by(|a, b| b.priority.cmp(&a.priority)); - for result in results.iter() { - println!("{}", result.path); } - - println!("{} results", results.len()) - -} - -fn main() { - println!("Generating Index..."); - generate_index("/home/n8", "index.idxs"); - println!("Searching..."); - search("index.idxs", vec!["welt"]); } -- cgit v1.2.3-70-g09d2