about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/dictionary.rs 6
-rw-r--r-- src/index.rs 133
-rw-r--r-- src/main.rs 127
3 files changed, 168 insertions, 98 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs
index a8d9b28..b522c9e 100644
--- a/src/dictionary.rs
+++ b/src/dictionary.rs
@@ -31,8 +31,8 @@ impl Dictionary {
}
}
- pub fn get(&self, name : String) -> u64 {
- *self.data.get(&name).unwrap()
+ pub fn get(&self, name : String) -> Option<&u64> {
+ self.data.get(&name)
}
pub fn iter(&self) -> &HashMap<String, u64> {
@@ -55,7 +55,7 @@ impl Dictionary {
let mut fv = FileVector::new();
for word in words {
- let i = self.get(word);
+ let i = *self.get(word).unwrap();
if !fv.contains_key(&i) {
fv.insert(i, 1);
} else {
diff --git a/src/index.rs b/src/index.rs
new file mode 100644
index 0000000..7ca31f1
--- /dev/null
+++ b/src/index.rs
@@ -0,0 +1,133 @@
+use std::fs::File;
+use std::io::{Write, BufReader, BufRead};
+use walkdir::*;
+use std::thread;
+use std::option::Option::None;
+use crate::vector::FileVector;
+use crate::dictionary::Dictionary;
+use crate::filecache::FileCache;
+use crate::searchresult::SearchResult;
+use crate::filecounter::filecount;
+use crate::text;
+use crate::splitter;
+use crate::vector;
+
+pub struct Index {
+ dictionary : Dictionary,
+ filecache : Vec<FileCache>,
+}
+
+impl Index {
+ pub fn generate(input_path : &str, index_path : &str, callback : impl Fn(u64, u64)) -> Self {
+ let mut index_file = File::create(index_path).unwrap();
+ let mut dict = Dictionary::new();
+ let mut filecache : Vec<FileCache> = Vec::new();
+ let mut nof = 0;
+ let mut counter = 0;
+
+ thread::scope(|s| {
+ let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path)));
+
+ for entry in WalkDir::new(input_path)
+ .into_iter()
+ .filter_map(|e| e.ok()) {
+ counter += 1;
+ if entry.path().is_file() {
+ let content : String = text::extract_text(entry.path().to_str().unwrap());
+
+ if content.is_empty() {
+ continue
+ }
+
+ let words : Vec<String> = splitter::split_to_words(content);
+
+ for word in words.iter() {
+ dict.set(word.clone());
+ }
+
+ let fv = dict.vectorize_word_list(words.clone());
+ writeln!(
+ index_file,
+ "{}, {}",
+ entry.path()
+ .to_str()
+ .unwrap()
+ .replace(",", "\0"),
+ fv.to_hex()
+ ).ok();
+
+ filecache.push(FileCache {
+ path : entry.path().to_str().unwrap().to_string(),
+ vector : fv
+ });
+
+
+ match nof_handle {
+ Some(t) => {
+ nof = t.join().unwrap();
+ nof_handle = None;
+ }
+ None => {
+ callback(counter, nof);
+ }
+ }
+ }
+ }
+
+ callback(nof, nof);
+
+ let dict_list : Vec<String> = dict.to_list();
+ writeln!(index_file, "#{}", dict_list.join(",")).ok();
+ });
+
+ Self {
+ dictionary : dict,
+ filecache
+ }
+ }
+
+ pub fn from_file(path : &str) -> Self {
+ let index_file = File::open(path).expect("could not open index file");
+ let reader = BufReader::new(index_file);
+ let mut filecache : Vec<FileCache> = Vec::new();
+ let mut dict = Dictionary::new();
+
+
+ for line in reader.lines() {
+ let l = line.unwrap();
+ if l.starts_with("#") {
+ dict = Dictionary::from_line(&l.strip_prefix("#").unwrap());
+ } else {
+ filecache.push(FileCache::from_line(l));
+ }
+ }
+
+ Self {
+ dictionary : dict,
+ filecache
+ }
+ }
+
+ pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> {
+ let mut v : FileVector = FileVector::new();
+
+ for arg in search_args {
+ match self.dictionary.get(arg.to_string()) {
+ Some(value) => { v.insert(*value, 1); }
+ None => {}
+ }
+ }
+
+ let mut results : Vec<SearchResult> = Vec::new();
+
+ for filecache in self.filecache.iter() {
+ let mut r = SearchResult { priority : 0, path : filecache.path.clone() };
+ r.priority = vector::scalar_product(&v, &filecache.vector);
+ if r.priority > 0 {
+ results.push(r);
+ }
+ }
+ results.sort_by(|a, b| b.priority.cmp(&a.priority));
+ results
+ }
+}
diff --git a/src/main.rs b/src/main.rs
index 2e49fd4..261008b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,110 +6,47 @@ pub mod filecache;
pub mod searchresult;
pub mod filecounter;
pub mod extractors;
+pub mod index;
-use vector::FileVector;
-use dictionary::Dictionary;
-use filecache::FileCache;
-use searchresult::SearchResult;
-use filecounter::filecount;
-use std::fs::File;
-use std::io::{Write, BufReader, BufRead};
-use walkdir::*;
-use std::thread;
-use std::option::Option::None;
+use index::Index;
+use std::io::*;
+use std::env;
-fn generate_index(input_path : &str, index_path : &str) {
- let mut index_file = File::create(index_path).unwrap();
- let mut dict = Dictionary::new();
- let mut nof = 0;
- let mut counter = 0;
-
- thread::scope(|s| {
- let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path)));
-
- for entry in WalkDir::new(input_path).into_iter().filter_map(|e| e.ok()) {
- counter += 1;
- if entry.path().is_file() {
- let content : String = text::extract_text(entry.path().to_str().unwrap());
-
- if content.is_empty() {
- continue
- }
-
- let words : Vec<String> = splitter::split_to_words(content);
-
- for word in words.iter() {
- dict.set(word.clone());
- }
-
- let fv = dict.vectorize_word_list(words.clone());
- writeln!(index_file, "{}, {}", entry.path().to_str().unwrap().replace(",", "\0"), fv.to_hex()).ok();
+fn main() {
+ let args: Vec<_> = env::args().collect();
+ if args.len() > 1 {
+ let cmd = args.get(1).unwrap();
- match nof_handle {
- Some(t) => {
- nof = t.join().unwrap();
- nof_handle = None;
- }
- None => {
- eprint!("\r\x1b[2K{} of {} files indexed ({}%)", counter, nof, (counter * 100) / nof);
- std::io::stdout().flush().ok();
- }
- }
+ if cmd == "-g" {
+ if args.len() != 4 {
+ eprintln!("{} -g <input> <indexfile>", args.get(0).unwrap());
+ return;
}
- }
-
- eprintln!("\r\x1b[2Kall files indexed (100%)");
-
- let dict_list : Vec<String> = dict.to_list();
- writeln!(index_file, "#{}", dict_list.join(",")).ok();
- });
-}
-
-fn search(index_path : &str, search_args : Vec<&str>) {
- let index_file = File::open(index_path).expect("could not open index file");
- let reader = BufReader::new(index_file);
- let mut filecaches : Vec<FileCache> = Vec::new();
- let mut dict = Dictionary::new();
-
- for line in reader.lines() {
- let l = line.unwrap();
- if l.starts_with("#") {
- dict = Dictionary::from_line(&l.strip_prefix("#").unwrap());
- } else {
- filecaches.push(FileCache::from_line(l));
- }
- }
-
- let mut v : FileVector = FileVector::new();
-
- for arg in search_args {
- v.insert(dict.get(arg.to_string()), 1);
- }
+ let input = args.get(2).unwrap();
+ let file = args.get(3).unwrap();
+ let _ = Index::generate(input, file, |counter, nof| {
+ eprint!("\r\x1b[2K{} of {} files indexed ({}%)", counter, nof, (counter * 100) / nof);
+ std::io::stdout().flush().ok();
+ });
+ } else if cmd == "-s" {
+ if args.len() < 4 {
+ eprintln!("{} -s <indexfile> ...", args.get(0).unwrap());
+ return;
+ }
- let mut results : Vec<SearchResult> = Vec::new();
+ let file = args.get(2).unwrap().clone();
+ let v = args.get(3..(args.len())).unwrap();
+ let search = v.join(" ");
+ let searchvec = splitter::split_to_words(search);
- for filecache in filecaches.iter() {
- let mut r = SearchResult { priority : 0, path : filecache.path.clone() };
- r.priority = vector::scalar_product(&v, &filecache.vector);
- if r.priority > 0 {
- results.push(r);
+ let idx = Index::from_file(file.as_str());
+ let results = idx.search(searchvec);
+ for result in results {
+ println!("{}", result.path);
+ }
}
- }
- results.sort_by(|a, b| b.priority.cmp(&a.priority));
- for result in results.iter() {
- println!("{}", result.path);
}
-
- println!("{} results", results.len())
-
-}
-
-fn main() {
- println!("Generating Index...");
- generate_index("/home/n8", "index.idxs");
- println!("Searching...");
- search("index.idxs", vec!["welt"]);
}