From 4d577650f737daaeb477bbbd5ae2bad4f1121c38 Mon Sep 17 00:00:00 2001
From: Nathan Reiner
Date: Wed, 5 Jul 2023 23:07:26 +0200
Subject: first sketch of indexer

---
 src/dictionary.rs   |  83 ++++++++++++++++++++++++++++++++++++++++
 src/filecache.rs    |  22 +++++++++++
 src/main.rs         | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/searchresult.rs |   7 ++++
 src/splitter.rs     |  14 +++++++
 src/text/mod.rs     |  39 +++++++++++++++++++
 src/text/txt.rs     |   6 +++
 src/vector.rs       |  63 ++++++++++++++++++++++++++++++
 8 files changed, 344 insertions(+)
 create mode 100644 src/dictionary.rs
 create mode 100644 src/filecache.rs
 create mode 100644 src/main.rs
 create mode 100644 src/searchresult.rs
 create mode 100644 src/splitter.rs
 create mode 100644 src/text/mod.rs
 create mode 100644 src/text/txt.rs
 create mode 100644 src/vector.rs

(limited to 'src')

diff --git a/src/dictionary.rs b/src/dictionary.rs
new file mode 100644
index 0000000..a8d9b28
--- /dev/null
+++ b/src/dictionary.rs
@@ -0,0 +1,83 @@
+use std::collections::HashMap;
+use crate::vector::FileVector;
+
+/// Maps every known word to a numeric id.
+///
+/// Ids handed out by `set` start at 1, so id 0 stays unused in a dictionary
+/// built with `new` and shows up as an empty string in `to_list`.
+pub struct Dictionary {
+    /// Highest id currently in use (0 for an empty dictionary).
+    last_index : usize,
+    data : HashMap<String, u64>,
+}
+
+
+impl Dictionary {
+    /// Creates an empty dictionary.
+    pub fn new() -> Self {
+        Self { last_index : 0, data : HashMap::new() }
+    }
+
+    /// Rebuilds a dictionary from a comma-separated word list; the position
+    /// of a word in the list becomes its id.
+    pub fn from_line(line : &str) -> Self {
+        let mut data : HashMap<String, u64> = HashMap::new();
+        let mut last_index : usize = 0;
+
+        for (i, word) in line.split(',').enumerate() {
+            data.insert(word.to_string(), i as u64);
+            last_index = i;
+        }
+
+        Self { last_index, data }
+    }
+
+    /// Registers `name` under the next free id; known words keep their id.
+    pub fn set(&mut self, name : String) {
+        if !self.data.contains_key(&name) {
+            self.last_index += 1;
+            self.data.insert(name, self.last_index as u64);
+        }
+    }
+
+    /// Returns the id of `name`.
+    ///
+    /// # Panics
+    /// Panics if `name` is not in the dictionary. Use `iter().get(..)` for a
+    /// lookup that does not panic.
+    pub fn get(&self, name : String) -> u64 {
+        *self.data.get(&name).expect("word is not in the dictionary")
+    }
+
+    /// Gives read-only access to the underlying word-to-id map.
+    pub fn iter(&self) -> &HashMap<String, u64> {
+        &self.data
+    }
+
+    /// Returns the words ordered by id; ids without a word yield "".
+    pub fn to_list(&self) -> Vec<String> {
+        let mut v = vec![String::new(); self.last_index + 1];
+
+        for (word, id) in self.iter() {
+            v[(*id) as usize] = word.clone();
+        }
+
+        v
+    }
+
+    /// Counts how often each word occurs and returns the counts keyed by id.
+    ///
+    /// # Panics
+    /// Panics if a word is not in the dictionary (see `get`).
+    pub fn vectorize_word_list(&self, words : Vec<String>) -> FileVector {
+        let mut fv = FileVector::new();
+
+        for word in words {
+            let id = self.get(word);
+            // A single map lookup per word instead of contains_key + get + insert.
+            *fv.entry(id).or_insert(0) += 1;
+        }
+
+        fv
+    }
+}
diff --git a/src/filecache.rs b/src/filecache.rs
new file mode 100644
index 0000000..c584c0b
--- /dev/null
+++ b/src/filecache.rs
@@ -0,0 +1,22 @@
+use crate::vector::FileVector;
+
+/// One indexed file: its path and its word-count vector.
+pub struct FileCache {
+    pub vector : FileVector,
+    pub path : String,
+}
+
+impl FileCache {
+    /// Parses one `path,vector` line of the index.
+    ///
+    /// NUL characters in the path field are turned back into commas. A line
+    /// without a comma yields an empty vector instead of panicking.
+    pub fn from_line(line : String) -> Self {
+        // Split at the first comma: the path field has its commas replaced.
+        let (path, hex) = line.split_once(',').unwrap_or((line.as_str(), ""));
+        Self {
+            vector : FileVector::from_hex(hex.to_string()),
+            path : path.replace("\0", ","),
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..f0975ad
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,110 @@
+pub mod vector;
+pub mod dictionary;
+pub mod text;
+pub mod splitter;
+pub mod filecache;
+pub mod searchresult;
+
+use vector::FileVector;
+use dictionary::Dictionary;
+use filecache::FileCache;
+use searchresult::SearchResult;
+use std::fs::File;
+use std::io::{Write, BufReader, BufRead, BufWriter};
+use walkdir::*;
+
+/// Walks `input_path` recursively and writes one `path, vector` line per file
+/// with extractable text to `index_path`, followed by a `#`-prefixed line
+/// that holds the dictionary.
+fn generate_index(input_path : &str, index_path : &str) {
+    let index_file = File::create(index_path).expect("could not create index file");
+    // One line is written per indexed file, so buffer the writes.
+    let mut index_file = BufWriter::new(index_file);
+    let mut dict = Dictionary::new();
+
+    for entry in WalkDir::new(input_path).into_iter().filter_map(|e| e.ok()) {
+        if !entry.path().is_file() {
+            continue;
+        }
+
+        // A path that is not valid UTF-8 cannot be written to the index; skip it.
+        let path = match entry.path().to_str() {
+            Some(path) => path,
+            None => continue,
+        };
+
+        let content : String = text::extract_text(path);
+        if content.is_empty() {
+            continue;
+        }
+
+        let words : Vec<String> = splitter::split_to_words(content);
+
+        for word in words.iter() {
+            dict.set(word.clone());
+        }
+
+        let fv : FileVector = dict.vectorize_word_list(words);
+        // Commas separate the fields of a line, so commas in the path are stored as NUL.
+        writeln!(index_file, "{}, {}", path.replace(",", "\0"), fv.to_hex()).ok();
+    }
+
+    let dict_list : Vec<String> = dict.to_list();
+    writeln!(index_file, "#{}", dict_list.join(",")).ok();
+    index_file.flush().ok();
+}
+
+/// Loads the index at `index_path` and prints the paths of all files that
+/// contain at least one word of `search_args`, best match first.
+fn search(index_path : &str, search_args : Vec<&str>) {
+    let index_file = File::open(index_path).expect("could not open index file");
+    let reader = BufReader::new(index_file);
+    let mut filecaches : Vec<FileCache> = Vec::new();
+    let mut dict = Dictionary::new();
+
+    for line in reader.lines() {
+        let l = line.expect("could not read index file");
+        if let Some(words) = l.strip_prefix('#') {
+            dict = Dictionary::from_line(words);
+        } else {
+            filecaches.push(FileCache::from_line(l));
+        }
+    }
+
+    let mut v : FileVector = FileVector::new();
+
+    for arg in search_args {
+        // Normalize the query like the indexed text so that e.g. "Linux" matches.
+        for word in splitter::split_to_words(arg.to_string()) {
+            // A word that occurs in no indexed file cannot match; skip it
+            // instead of panicking in `Dictionary::get`.
+            if let Some(id) = dict.iter().get(&word) {
+                v.insert(*id, 1);
+            }
+        }
+    }
+
+    let mut results : Vec<SearchResult> = filecaches
+        .iter()
+        .map(|filecache| SearchResult {
+            priority : vector::scalar_product(&v, &filecache.vector),
+            path : filecache.path.clone(),
+        })
+        .filter(|r| r.priority > 0)
+        .collect();
+    results.sort_by(|a, b| b.priority.cmp(&a.priority));
+
+    for result in results.iter() {
+        println!("{}", result.path);
+    }
+
+    println!("{} results", results.len());
+}
+
+/// Builds the index and runs a sample query; paths and query are hard-coded.
+fn main() {
+    println!("Generating Index...");
+    generate_index("/home/n8", "index.idxs");
+    println!("Searching...");
+    search("index.idxs", vec!["one", "difficult", "under", "linux"]);
+}
diff --git a/src/searchresult.rs b/src/searchresult.rs
new file mode 100644
index 0000000..6a0dd30
--- /dev/null
+++ b/src/searchresult.rs
@@ -0,0 +1,7 @@
+
+/// A file that matched a query, together with its score.
+pub struct SearchResult {
+    /// Match score; `search` lists higher scores first.
+    pub priority : u64,
+    pub path : String
+}
diff --git a/src/splitter.rs b/src/splitter.rs
new file mode 100644
index 0000000..64e659f
--- /dev/null
+++ b/src/splitter.rs
@@ -0,0 +1,14 @@
+use std::vec::Vec;
+
+/// Characters that are removed from every word.
+const STRIPPED_CHARS : &str = r#"{}[]#(),".;:?!'%|0123456789/\^"#;
+
+/// Lowercases `data`, splits it on whitespace, strips punctuation and digits
+/// from every word and drops the words that end up empty.
+pub fn split_to_words(data : String) -> Vec<String> {
+    data.to_lowercase()
+        .split_whitespace()
+        .map(|word| word.chars().filter(|c| !STRIPPED_CHARS.contains(*c)).collect::<String>())
+        .filter(|word| !word.is_empty())
+        .collect()
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
new file mode 100644
index 0000000..4e1b9a4
--- /dev/null
+++ b/src/text/mod.rs
@@ -0,0 +1,39 @@
+use lazy_static::lazy_static;
+use std::ffi::OsStr;
+use std::path::Path;
+use std::collections::HashMap;
+
+mod txt;
+
+/// Fallback extractor for file types without a dedicated extractor.
+fn empty_extractor(_ : &str) -> String {
+    "".to_string()
+}
+
+// Expands `ext!(txt)` to `("txt", txt::get_text as ExtFn)`.
+macro_rules! ext {
+    ($f:ident) => {(stringify!($f), $f::get_text as ExtFn)}
+}
+
+/// Signature shared by all extractors: file path in, text content out.
+type ExtFn = fn(&str) -> String;
+
+lazy_static! {
+    /// Extractors keyed by file extension.
+    static ref EXT: HashMap<&'static str, ExtFn> = {
+        HashMap::from([
+            ext!(txt),
+        ])
+    };
+}
+
+/// Returns the text of the file at `path`, choosing the extractor by file
+/// extension. An unknown or missing extension yields an empty string.
+pub fn extract_text(path : &str) -> String {
+    let extension = Path::new(path)
+        .extension()
+        .and_then(OsStr::to_str)
+        .unwrap_or("");
+    let extractor = EXT.get(extension).copied().unwrap_or(empty_extractor as ExtFn);
+    extractor(path)
+}
diff --git a/src/text/txt.rs b/src/text/txt.rs
new file mode 100644
index 0000000..14e7422
--- /dev/null
+++ b/src/text/txt.rs
@@ -0,0 +1,6 @@
+use std::fs;
+
+/// Reads the whole file as UTF-8 text; a file that cannot be read yields "".
+pub fn get_text(path : &str) -> String {
+    fs::read_to_string(path).unwrap_or_default()
+}
diff --git a/src/vector.rs b/src/vector.rs
new file mode 100644
index 0000000..ce1139d
--- /dev/null
+++ b/src/vector.rs
@@ -0,0 +1,63 @@
+use std::collections::HashMap;
+use std::ops::{Deref, DerefMut};
+
+/// Sparse vector of word counts, keyed by dictionary id.
+pub struct FileVector {
+    data : HashMap<u64, u64>
+}
+
+// Deref to the inner map so callers can use the `HashMap` API directly.
+impl Deref for FileVector {
+    type Target = HashMap<u64, u64>;
+    fn deref(&self) -> &Self::Target {
+        &self.data
+    }
+}
+
+impl DerefMut for FileVector {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.data
+    }
+}
+
+impl FileVector {
+    /// Creates an empty vector.
+    pub fn new() -> Self {
+        Self { data : HashMap::new() }
+    }
+
+    /// Parses the format written by `to_hex`: space-separated `index;value`
+    /// pairs in hexadecimal. Empty chunks are ignored.
+    ///
+    /// # Panics
+    /// Panics if a chunk is not a valid `index;value` pair.
+    pub fn from_hex(hex : String) -> Self {
+        let mut data : HashMap<u64, u64> = HashMap::new();
+
+        for chunk in hex.split(' ').filter(|chunk| !chunk.is_empty()) {
+            let (index, value) = chunk.split_once(';').expect("could not split index and value");
+            let i : u64 = u64::from_str_radix(index, 16).expect("could not extract index");
+            let v : u64 = u64::from_str_radix(value, 16).expect("could not extract value");
+            data.insert(i, v);
+        }
+
+        Self { data }
+    }
+
+    /// Serializes the vector as space-separated `index;value` pairs in
+    /// hexadecimal. The order of the pairs is not specified.
+    pub fn to_hex(&self) -> String {
+        self.data
+            .iter()
+            .map(|(i, v)| format!("{:x};{:x}", i, v))
+            .collect::<Vec<String>>()
+            .join(" ")
+    }
+}
+
+/// Sums the products of the values stored under the same index in `a` and `b`.
+pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {
+    a.iter()
+        .map(|(i, x)| x * b.get(i).copied().unwrap_or(0))
+        .sum()
+}
-- 
cgit v1.2.3-70-g09d2