about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-05 23:07:26 +0200
committer Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-05 23:07:26 +0200
commit 4d577650f737daaeb477bbbd5ae2bad4f1121c38 (patch)
tree ac973541e0a2d7751af4ece5f7f639e739f81fcc /src
first sketch of indexer
Diffstat (limited to 'src')
-rw-r--r-- src/dictionary.rs 69
-rw-r--r-- src/filecache.rs 18
-rw-r--r-- src/main.rs 87
-rw-r--r-- src/searchresult.rs 5
-rw-r--r-- src/splitter.rs 16
-rw-r--r-- src/text/mod.rs 31
-rw-r--r-- src/text/txt.rs 5
-rw-r--r-- src/vector.rs 59
8 files changed, 290 insertions, 0 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs
new file mode 100644
index 0000000..a8d9b28
--- /dev/null
+++ b/src/dictionary.rs
@@ -0,0 +1,69 @@
+use std::collections::HashMap;
+use crate::vector::FileVector;
+
+/// Word table that assigns every distinct word a numeric id; the ids are used
+/// as keys of a `FileVector`.
+///
+/// When filled through `new` + `set`, ids start at 1 because `set` increments
+/// `last_index` before inserting. When rebuilt through `from_line`, ids are
+/// the positions of the words in the parsed list (starting at 0).
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct Dictionary {
+    /// Highest id handed out so far (0 for a freshly created dictionary).
+    last_index: usize,
+    /// Maps each known word to its id.
+    data: HashMap<String, u64>,
+}
+
+
+impl Dictionary {
+    /// Creates an empty dictionary.
+    ///
+    /// `last_index` starts at 0 and `set` increments it before inserting, so
+    /// the first word added receives id 1 and id 0 stays unused.
+    pub fn new() -> Self {
+        Self { last_index: 0, data: HashMap::new() }
+    }
+
+    /// Rebuilds a dictionary from a comma-separated word list, giving each
+    /// word its position in the list as id.
+    ///
+    /// If a word occurs more than once, its last position wins.
+    pub fn from_line(line: &str) -> Self {
+        let mut data: HashMap<String, u64> = HashMap::new();
+        let mut last_index: usize = 0;
+
+        // `split` always yields at least one item (possibly the empty string),
+        // so `last_index` ends up as "number of items - 1" without any
+        // subtraction that could underflow.
+        for (id, word) in line.split(',').enumerate() {
+            data.insert(word.to_string(), id as u64);
+            last_index = id;
+        }
+
+        Self { last_index, data }
+    }
+
+    /// Registers `name` under the next free id; does nothing if the word is
+    /// already known.
+    pub fn set(&mut self, name: String) {
+        // The entry API hashes the word once instead of `contains_key` + `insert`.
+        if let std::collections::hash_map::Entry::Vacant(slot) = self.data.entry(name) {
+            self.last_index += 1;
+            slot.insert(self.last_index as u64);
+        }
+    }
+
+    /// Returns the id of `name`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `name` is not in the dictionary. Callers that cannot
+    /// guarantee this can look the word up in the map returned by `iter`.
+    pub fn get(&self, name: String) -> u64 {
+        match self.data.get(&name) {
+            Some(id) => *id,
+            None => panic!("word {:?} is not in the dictionary", name),
+        }
+    }
+
+    /// Gives read-only access to the underlying word -> id map.
+    pub fn iter(&self) -> &HashMap<String, u64> {
+        &self.data
+    }
+
+    /// Returns the words ordered by id, so that `list[id]` is the word with
+    /// that id. Ids without a word (such as the unused id 0 of a dictionary
+    /// built with `set`) are represented by an empty string.
+    pub fn to_list(&self) -> Vec<String> {
+        let mut list = vec![String::new(); self.last_index + 1];
+
+        for (word, id) in &self.data {
+            list[*id as usize] = word.clone();
+        }
+
+        list
+    }
+
+    /// Counts how often each known word occurs in `words` and returns the
+    /// counts keyed by word id.
+    ///
+    /// Words that are not in the dictionary are skipped rather than causing a
+    /// panic.
+    pub fn vectorize_word_list(&self, words: Vec<String>) -> FileVector {
+        let mut fv = FileVector::new();
+
+        for word in words {
+            if let Some(&id) = self.data.get(&word) {
+                // Single lookup instead of contains_key + get + insert.
+                *fv.entry(id).or_insert(0) += 1;
+            }
+        }
+
+        fv
+    }
+}
diff --git a/src/filecache.rs b/src/filecache.rs
new file mode 100644
index 0000000..c584c0b
--- /dev/null
+++ b/src/filecache.rs
@@ -0,0 +1,18 @@
+use crate::vector::FileVector;
+
+/// One indexed file as loaded back from an index line: its path and its
+/// word-count vector.
+pub struct FileCache {
+ /// Vector decoded from the hex field of the index line.
+ pub vector : FileVector,
+ /// Path field of the index line, with `\0` mapped back to `,`.
+ pub path : String,
+}
+
+impl FileCache {
+    /// Parses one index line of the form `<path>,<hex vector>`.
+    ///
+    /// The line is split at its first comma. Commas that belonged to the path
+    /// are stored as `\0` in the index, so they are restored here. A line
+    /// without any comma is treated as a path with an empty vector instead of
+    /// panicking on the missing field.
+    pub fn from_line(line: String) -> Self {
+        let (raw_path, hex) = line.split_once(',').unwrap_or((line.as_str(), ""));
+
+        Self {
+            vector: FileVector::from_hex(hex.to_string()),
+            path: raw_path.replace("\0", ","),
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..f0975ad
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,87 @@
+pub mod vector;
+pub mod dictionary;
+pub mod text;
+pub mod splitter;
+pub mod filecache;
+pub mod searchresult;
+
+use vector::FileVector;
+use dictionary::Dictionary;
+use filecache::FileCache;
+use searchresult::SearchResult;
+use std::fs::File;
+use std::io::{Write, BufReader, BufRead};
+use walkdir::*;
+
+/// Walks `input_path` recursively and writes a search index to `index_path`.
+///
+/// Every file that yields text produces one line `<path>, <hex vector>`
+/// (commas in the path are stored as `\0`). The last line is `#` followed by
+/// the comma-separated dictionary. Files with a non-UTF-8 path are skipped,
+/// and write failures are reported on stderr instead of being dropped.
+///
+/// # Panics
+///
+/// Panics if the index file cannot be created.
+fn generate_index(input_path: &str, index_path: &str) {
+    // Buffer the output so each index line does not become its own write call.
+    let mut index_file = std::io::BufWriter::new(
+        File::create(index_path).expect("could not create index file"),
+    );
+    let mut dict = Dictionary::new();
+
+    for entry in WalkDir::new(input_path).into_iter().filter_map(|e| e.ok()) {
+        if !entry.path().is_file() {
+            continue;
+        }
+
+        // The extractor and the text index both need a `&str` path, so a
+        // non-UTF-8 path is skipped rather than aborting the whole walk.
+        let path = match entry.path().to_str() {
+            Some(path) => path,
+            None => continue,
+        };
+
+        let content: String = text::extract_text(path);
+        if content.is_empty() {
+            continue;
+        }
+
+        let words: Vec<String> = splitter::split_to_words(content);
+
+        // Every word must be registered before the list is vectorized.
+        for word in &words {
+            dict.set(word.clone());
+        }
+
+        let fv: FileVector = dict.vectorize_word_list(words);
+        if let Err(err) = writeln!(index_file, "{}, {}", path.replace(",", "\0"), fv.to_hex()) {
+            eprintln!("could not write index entry for {}: {}", path, err);
+        }
+    }
+
+    let dict_list: Vec<String> = dict.to_list();
+    if let Err(err) = writeln!(index_file, "#{}", dict_list.join(",")) {
+        eprintln!("could not write dictionary line: {}", err);
+    }
+
+    // Flush explicitly: dropping a BufWriter ignores flush errors.
+    if let Err(err) = index_file.flush() {
+        eprintln!("could not flush index file: {}", err);
+    }
+}
+
+/// Loads the index at `index_path`, scores every indexed file against
+/// `search_args` and prints the matching paths, best match first, followed by
+/// a `<n> results` line.
+///
+/// A file's score is the scalar product of its word vector with a query
+/// vector that holds 1 for each search term. Terms that are not in the
+/// dictionary are ignored, and blank index lines are skipped.
+///
+/// # Panics
+///
+/// Panics if the index file cannot be opened or read.
+fn search(index_path: &str, search_args: Vec<&str>) {
+    let index_file = File::open(index_path).expect("could not open index file");
+    let reader = BufReader::new(index_file);
+    let mut filecaches: Vec<FileCache> = Vec::new();
+    let mut dict = Dictionary::new();
+
+    for line in reader.lines() {
+        let l = line.expect("could not read index file");
+        if l.starts_with('#') {
+            // '#' is a single byte, so slicing it off is always valid.
+            dict = Dictionary::from_line(&l[1..]);
+        } else if !l.is_empty() {
+            filecaches.push(FileCache::from_line(l));
+        }
+    }
+
+    let mut v: FileVector = FileVector::new();
+
+    for arg in search_args {
+        // Look the term up in the map directly: `Dictionary::get` panics on
+        // unknown words, and an unknown term simply cannot match any file.
+        if let Some(&id) = dict.iter().get(arg) {
+            v.insert(id, 1);
+        }
+    }
+
+    let mut results: Vec<SearchResult> = filecaches
+        .iter()
+        .filter_map(|filecache| {
+            let priority = vector::scalar_product(&v, &filecache.vector);
+            if priority > 0 {
+                Some(SearchResult { priority, path: filecache.path.clone() })
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    // Highest priority first.
+    results.sort_by(|a, b| b.priority.cmp(&a.priority));
+
+    for result in &results {
+        println!("{}", result.path);
+    }
+
+    println!("{} results", results.len());
+}
+
+/// Entry point: builds the index for a hard-coded directory, then runs a
+/// hard-coded sample query against it.
+fn main() {
+    const INPUT_ROOT: &str = "/home/n8";
+    const INDEX_PATH: &str = "index.idxs";
+
+    println!("Generating Index...");
+    generate_index(INPUT_ROOT, INDEX_PATH);
+
+    println!("Searching...");
+    let query = vec!["one", "difficult", "under", "linux"];
+    search(INDEX_PATH, query);
+}
diff --git a/src/searchresult.rs b/src/searchresult.rs
new file mode 100644
index 0000000..6a0dd30
--- /dev/null
+++ b/src/searchresult.rs
@@ -0,0 +1,5 @@
+
+/// A single search hit: the path of a matching file and its score.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SearchResult {
+    /// Score of the file for the query; results are sorted by it, highest first.
+    pub priority: u64,
+    /// Path of the matching file.
+    pub path: String,
+}
diff --git a/src/splitter.rs b/src/splitter.rs
new file mode 100644
index 0000000..64e659f
--- /dev/null
+++ b/src/splitter.rs
@@ -0,0 +1,16 @@
+use std::vec::Vec;
+
+/// Splits `data` into lowercase words.
+///
+/// The text is lowercased and split on whitespace; brackets, punctuation and
+/// ASCII digits listed in the filter set are removed from every token, and
+/// tokens that end up empty are dropped.
+pub fn split_to_words(data : String) -> Vec<String> {
+    let lowered = data.to_lowercase();
+
+    lowered
+        .split_whitespace()
+        .map(|token| {
+            token
+                .chars()
+                .filter(|c| !r#"{}[]#(),".;:?!'%|0123456789/\^"#.contains(*c))
+                .collect::<String>()
+        })
+        .filter(|word| !word.is_empty())
+        .collect()
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
new file mode 100644
index 0000000..4e1b9a4
--- /dev/null
+++ b/src/text/mod.rs
@@ -0,0 +1,31 @@
+use lazy_static::lazy_static;
+use std::ffi::OsStr;
+use std::path::Path;
+use std::collections::HashMap;
+
+mod txt;
+
+/// Fallback extractor for unsupported file types: ignores the path and
+/// returns an empty string.
+fn empty_extractor(_ : &str) -> String {
+    String::new()
+}
+
+// Expands `ext!(name)` to the tuple `("name", name::get_text as ExtFn)`:
+// the module name as a string key, paired with that module's `get_text`
+// function coerced to the common `ExtFn` pointer type.
+macro_rules! ext {
+ ($f:ident) => {(stringify!($f), $f::get_text as ExtFn)}
+}
+
+/// Signature shared by all text extractors: called with a path, returns the
+/// extracted text.
+type ExtFn = fn(&str) -> String;
+
+// Lookup table from file extension (as returned by `Path::extension`, i.e.
+// without the dot) to the extractor for that file type. Only `txt` is
+// registered here.
+lazy_static! {
+ static ref EXT: HashMap<&'static str, ExtFn> = {
+ HashMap::from([
+ ext!(txt),
+ ])
+ };
+}
+
+
+/// Extracts the text of the file at `path` using the extractor registered for
+/// its extension.
+///
+/// Files without an extension, with an extension that is not valid UTF-8, or
+/// with an extension that has no registered extractor yield an empty string.
+pub fn extract_text(path : &str) -> String {
+    // A non-UTF-8 extension cannot match any key in `EXT`, so it is treated
+    // like a missing extension instead of panicking.
+    let extension = Path::new(path)
+        .extension()
+        .and_then(OsStr::to_str)
+        .unwrap_or("");
+
+    let extractor: ExtFn = EXT
+        .get(extension)
+        .copied()
+        .unwrap_or(empty_extractor as ExtFn);
+
+    extractor(path)
+}
diff --git a/src/text/txt.rs b/src/text/txt.rs
new file mode 100644
index 0000000..14e7422
--- /dev/null
+++ b/src/text/txt.rs
@@ -0,0 +1,5 @@
+use std::fs;
+
+/// Reads the whole file at `path` as UTF-8 text.
+///
+/// Any read error (missing file, invalid UTF-8, ...) results in an empty
+/// string.
+pub fn get_text(path : &str) -> String {
+    match fs::read_to_string(path) {
+        Ok(text) => text,
+        Err(_) => String::new(),
+    }
+}
diff --git a/src/vector.rs b/src/vector.rs
new file mode 100644
index 0000000..ce1139d
--- /dev/null
+++ b/src/vector.rs
@@ -0,0 +1,59 @@
+use std::collections::HashMap;
+use std::ops::{Deref, DerefMut};
+
+/// Sparse vector of `u64` values keyed by `u64` index, stored as a hash map.
+///
+/// `Dictionary::vectorize_word_list` fills it with word-id -> occurrence
+/// count entries.
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct FileVector {
+    /// Index -> value entries; indices that are absent count as 0 in
+    /// `scalar_product`.
+    data: HashMap<u64, u64>,
+}
+
+// Lets a `FileVector` be read like the `HashMap<u64, u64>` it wraps, so
+// read-only map methods such as `get`, `iter` and `contains_key` can be
+// called on it directly.
+impl Deref for FileVector {
+ type Target = HashMap<u64, u64>;
+ fn deref(&self) -> &Self::Target {
+ &self.data
+ }
+}
+
+// Mutable counterpart of the `Deref` impl: exposes `insert` and the other
+// mutating map methods on a `FileVector`.
+impl DerefMut for FileVector {
+ fn deref_mut(&mut self) -> &mut Self::Target {
+ &mut self.data
+ }
+}
+
+impl FileVector {
+    /// Creates an empty vector.
+    pub fn new() -> Self {
+        Self { data: HashMap::new() }
+    }
+
+    /// Decodes the text form produced by `to_hex`: whitespace-separated
+    /// `index;value` pairs, both written in hexadecimal.
+    ///
+    /// Leading, trailing and repeated whitespace is ignored; an empty string
+    /// gives an empty vector.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a pair has no `;` separator or if either half is not a valid
+    /// hexadecimal `u64`.
+    pub fn from_hex(hex : String) -> Self {
+        let mut data: HashMap<u64, u64> = HashMap::new();
+
+        for chunk in hex.split_whitespace() {
+            // `split_once` turns a missing separator into a descriptive panic
+            // rather than an index-out-of-bounds on the second field.
+            let (index, value) = chunk
+                .split_once(';')
+                .expect("could not split chunk into index and value");
+            let i: u64 = u64::from_str_radix(index, 16).expect("could not extract index");
+            let v: u64 = u64::from_str_radix(value, 16).expect("could not extract value");
+            data.insert(i, v);
+        }
+
+        Self { data }
+    }
+
+    /// Encodes the vector as space-separated `index;value` pairs in lowercase
+    /// hexadecimal.
+    ///
+    /// Pairs are emitted in ascending index order so that the same vector
+    /// always produces the same text; `HashMap` iteration order alone is not
+    /// stable between runs.
+    pub fn to_hex(&self) -> String {
+        let mut entries: Vec<(&u64, &u64)> = self.data.iter().collect();
+        entries.sort_unstable();
+
+        entries
+            .iter()
+            .map(|(i, v)| format!("{:x};{:x}", i, v))
+            .collect::<Vec<String>>()
+            .join(" ")
+    }
+}
+
+/// Computes the scalar (dot) product of two sparse vectors.
+///
+/// Only the entries of `a` are visited; an index that is missing from `b`
+/// contributes 0 to the sum.
+pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {
+    a.iter()
+        .map(|(index, weight)| weight * b.get(index).copied().unwrap_or(0))
+        .sum()
+}