first sketch of indexer

author: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-05 23:07:26 +0200
committer: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-05 23:07:26 +0200
commit: 4d577650f737daaeb477bbbd5ae2bad4f1121c38 (patch)
tree: ac973541e0a2d7751af4ece5f7f639e739f81fcc /src
8 files changed, 290 insertions, 0 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs
new file mode 100644
index 0000000..a8d9b28
--- /dev/null
+++ b/src/dictionary.rs
@@ -0,0 +1,69 @@
+use std::collections::HashMap;
+use crate::vector::FileVector;
+
+/// Assigns a numeric id to every word it has seen.
+pub struct Dictionary {
+    last_index : usize, // highest id in use; `new` starts at 0 and `set` pre-increments
+    data : HashMap<String, u64>, // word -> id
+}
+
+impl Dictionary {
+    /// Creates an empty dictionary; `set` hands out ids starting at 1.
+    pub fn new() -> Self {
+        Self { last_index : 0, data : HashMap::new() }
+    }
+
+    /// Rebuilds a dictionary from a comma-separated word list; each word's id
+    /// is its position in the list, which makes this the inverse of `to_list`.
+    pub fn from_line(line : &str) -> Self {
+        let mut data : HashMap<String, u64> = HashMap::new();
+        let mut last_index = 0;
+        // `split` yields at least one item, so `last_index` ends at count - 1.
+        for (id, word) in line.split(',').enumerate() {
+            data.insert(word.to_string(), id as u64);
+            last_index = id;
+        }
+        Self { last_index, data }
+    }
+
+    /// Registers `name` under the next free id; known words keep their id.
+    pub fn set(&mut self, name : String) {
+        // One hash lookup decides between "already known" and "insert".
+        if let std::collections::hash_map::Entry::Vacant(slot) = self.data.entry(name) {
+            self.last_index += 1;
+            slot.insert(self.last_index as u64);
+        }
+    }
+
+    /// Returns the id of `name`.
+    ///
+    /// # Panics
+    /// Panics if `name` has never been added to the dictionary.
+    pub fn get(&self, name : String) -> u64 {
+        *self.data.get(&name).unwrap_or_else(|| panic!("word {:?} is not in the dictionary", name))
+    }
+
+    /// Borrows the underlying word -> id map.
+    pub fn iter(&self) -> &HashMap<String, u64> {
+        &self.data
+    }
+
+    /// Lists all words ordered by id; ids without a word yield an empty string.
+    pub fn to_list(&self) -> Vec<String> {
+        let mut words = vec![String::new(); self.last_index + 1];
+        for (word, id) in &self.data {
+            words[*id as usize] = word.clone();
+        }
+        words
+    }
+
+    /// Counts how often each word of `words` occurs, keyed by word id.
+    /// Panics (via `get`) if a word is not in the dictionary.
+    pub fn vectorize_word_list(&self, words : Vec<String>) -> FileVector {
+        let mut fv = FileVector::new();
+        for word in words {
+            *fv.entry(self.get(word)).or_insert(0) += 1;
+        }
+        fv
+    }
+}
diff --git a/src/filecache.rs b/src/filecache.rs
new file mode 100644
index 0000000..c584c0b
--- /dev/null
+++ b/src/filecache.rs
@@ -0,0 +1,18 @@
+use crate::vector::FileVector;
+
+/// One indexed file: its path and the vector parsed from its index line.
+pub struct FileCache {
+    pub vector : FileVector,
+    pub path : String,
+}
+
+impl FileCache {
+    /// Parses a `<path>,<hex vector>` index line; NUL characters in the path stand
+    /// for commas. A line without a comma yields an empty vector, not a panic.
+    pub fn from_line(line : String) -> Self {
+        // Only the first comma is a separator: the path is stored comma-free.
+        let (raw_path, hex) = line.split_once(',').unwrap_or((line.as_str(), ""));
+        let vector = FileVector::from_hex(hex.to_string());
+        Self { vector, path : raw_path.replace('\0', ",") }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..f0975ad
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,87 @@
+pub mod vector;
+pub mod dictionary;
+pub mod text;
+pub mod splitter;
+pub mod filecache;
+pub mod searchresult;
+
+use vector::FileVector;
+use dictionary::Dictionary;
+use filecache::FileCache;
+use searchresult::SearchResult;
+use std::fs::File;
+use std::io::{Write, BufReader, BufRead};
+use walkdir::*;
+
+/// Walks `input_path` and writes one `<path>, <hex vector>` line per readable
+/// file to `index_path`, followed by a `#`-prefixed line holding the dictionary.
+fn generate_index(input_path : &str, index_path : &str) {
+    // Buffered, so that each index line does not cost its own write syscall.
+    let mut out = std::io::BufWriter::new(File::create(index_path).expect("could not create index file"));
+    let mut dict = Dictionary::new();
+    for entry in WalkDir::new(input_path).into_iter().filter_map(|e| e.ok()) {
+        // Non-files and non-UTF-8 paths are skipped instead of causing a panic.
+        let path = match entry.path().to_str() {
+            Some(p) if entry.path().is_file() => p,
+            _ => continue,
+        };
+        let content : String = text::extract_text(path);
+        if content.is_empty() { continue; }
+        let words : Vec<String> = splitter::split_to_words(content);
+        words.iter().for_each(|word| dict.set(word.clone()));
+        let fv : FileVector = dict.vectorize_word_list(words);
+        writeln!(out, "{}, {}", path.replace(",", "\0"), fv.to_hex())
+            .unwrap_or_else(|e| eprintln!("could not write index entry: {}", e));
+    }
+
+    writeln!(out, "#{}", dict.to_list().join(",")).and_then(|_| out.flush())
+        .unwrap_or_else(|e| eprintln!("could not finish index file: {}", e));
+}
+
+/// Loads the index at `index_path`, scores every indexed file against the words
+/// in `search_args` and prints the matching paths, best match first.
+fn search(index_path : &str, search_args : Vec<&str>) {
+    let index_file = File::open(index_path).expect("could not open index file");
+    let mut filecaches : Vec<FileCache> = Vec::new();
+    let mut dict = Dictionary::new();
+
+    for line in BufReader::new(index_file).lines() {
+        let l = line.expect("could not read index file");
+        // A leading '#' marks the dictionary line; every other line is a file.
+        match l.strip_prefix('#') {
+            Some(words) => dict = Dictionary::from_line(words),
+            None => filecaches.push(FileCache::from_line(l)),
+        }
+    }
+
+    // Query vector with weight 1 per term. A term missing from the dictionary
+    // occurs in no indexed file, so it is skipped instead of panicking in `get`.
+    let mut v : FileVector = FileVector::new();
+    for arg in search_args {
+        if let Some(id) = dict.iter().get(arg) {
+            v.insert(*id, 1);
+        }
+    }
+
+    let mut results : Vec<SearchResult> = filecaches.iter()
+        .map(|fc| SearchResult {
+            priority : vector::scalar_product(&v, &fc.vector),
+            path : fc.path.clone(),
+        })
+        .filter(|r| r.priority > 0)
+        .collect();
+    results.sort_by(|a, b| b.priority.cmp(&a.priority));
+
+    for result in results.iter() {
+        println!("{}", result.path);
+    }
+
+    println!("{} results", results.len());
+}
+/// Sketch driver: indexes the hard-coded directory, then runs one fixed query.
+fn main() {
+    println!("Generating Index...");
+    generate_index("/home/n8", "index.idxs");
+    println!("Searching...");
+    search("index.idxs", vec!["one", "difficult", "under", "linux"]);
+}
diff --git a/src/searchresult.rs b/src/searchresult.rs
new file mode 100644
index 0000000..6a0dd30
--- /dev/null
+++ b/src/searchresult.rs
@@ -0,0 +1,5 @@
+/// One search hit.
+pub struct SearchResult {
+    pub priority : u64, // score; `search` stores the scalar product here and sorts by it
+    pub path : String // path of the matching file
+}
diff --git a/src/splitter.rs b/src/splitter.rs
new file mode 100644
index 0000000..64e659f
--- /dev/null
+++ b/src/splitter.rs
@@ -0,0 +1,16 @@
+use std::vec::Vec;
+
+/// Lowercases `data`, splits it on whitespace and strips punctuation and digits
+/// from every word; words that end up empty are dropped.
+pub fn split_to_words(data : String) -> Vec<String> {
+    // Characters removed from every word.
+    const STRIPPED : &str = r#"{}[]#(),".;:?!'%|0123456789/\^"#;
+
+    data.to_lowercase()
+        .split_whitespace()
+        .map(|raw| {
+            raw.chars().filter(|c| !STRIPPED.contains(*c)).collect::<String>()
+        })
+        .filter(|word| !word.is_empty())
+        .collect()
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
new file mode 100644
index 0000000..4e1b9a4
--- /dev/null
+++ b/src/text/mod.rs
@@ -0,0 +1,31 @@
+use lazy_static::lazy_static;
+use std::ffi::OsStr;
+use std::path::Path;
+use std::collections::HashMap;
+
+mod txt;
+
+/// Fallback extractor for unsupported file types: always yields no text.
+fn empty_extractor(_ : &str) -> String {
+    String::new()
+}
+/// Expands `ext!(m)` to the pair `("m", m::get_text as ExtFn)` for the `EXT` table.
+macro_rules! ext {
+    ($f:ident) => {(stringify!($f), $f::get_text as ExtFn)}
+}
+/// Signature of a text extractor: file path in, extracted text out.
+type ExtFn = fn(&str) -> String;
+
+lazy_static! {
+    /// Extractors keyed by the name passed to `ext!`, which `extract_text`
+    /// looks up by file extension.
+    static ref EXT: HashMap<&'static str, ExtFn> = {
+        HashMap::from([
+            ext!(txt),
+        ])
+    };
+}
+
+/// Returns the text of `path` via the extractor for its extension, else "".
+pub fn extract_text(path : &str) -> String {
+    let ext = Path::new(path).extension().and_then(OsStr::to_str).unwrap_or("");
+    EXT.get(ext).copied().unwrap_or(empty_extractor as ExtFn)(path)
+}
diff --git a/src/text/txt.rs b/src/text/txt.rs
new file mode 100644
index 0000000..14e7422
--- /dev/null
+++ b/src/text/txt.rs
@@ -0,0 +1,5 @@
+use std::fs;
+/// Reads the file at `path` as UTF-8 text; any read error yields an empty string.
+pub fn get_text(path : &str) -> String {
+    fs::read_to_string(path).unwrap_or_else(|_| String::new())
+}
diff --git a/src/vector.rs b/src/vector.rs
new file mode 100644
index 0000000..ce1139d
--- /dev/null
+++ b/src/vector.rs
@@ -0,0 +1,59 @@
+use std::collections::HashMap;
+use std::ops::{Deref, DerefMut};
+
+/// Sparse vector: maps a `u64` index (a word id in the indexer) to a `u64` weight.
+pub struct FileVector {
+    data : HashMap<u64, u64>
+}
+
+// `Deref`/`DerefMut` expose the inner map, so `HashMap` methods work on a `FileVector`.
+impl Deref for FileVector {
+    type Target = HashMap<u64, u64>;
+    fn deref(&self) -> &Self::Target {
+        &self.data
+    }
+}
+
+impl DerefMut for FileVector {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.data
+    }
+}
+
+impl FileVector {
+    /// Creates an empty vector.
+    pub fn new() -> Self {
+        Self { data : HashMap::new() }
+    }
+
+    /// Parses space-separated `index;value` pairs of hexadecimal numbers, the
+    /// format written by `to_hex`. Panics if a pair is malformed.
+    pub fn from_hex(hex : String) -> Self {
+        let data = hex
+            .split(' ')
+            .filter(|chunk| !chunk.is_empty())
+            .map(|chunk| {
+                // A missing field becomes `None` and fails the matching `expect`.
+                let mut fields = chunk.split(';');
+                let i = fields.next().and_then(|s| u64::from_str_radix(s, 16).ok());
+                let v = fields.next().and_then(|s| u64::from_str_radix(s, 16).ok());
+                (i.expect("could not extract index"), v.expect("could not extract value"))
+            })
+            .collect();
+        Self { data }
+    }
+
+    /// Writes the vector as space-separated `index;value` hex pairs (map order).
+    pub fn to_hex(&self) -> String {
+        let pairs : Vec<String> = self.data.iter().map(|(i, v)| format!("{:x};{:x}", i, v)).collect();
+        pairs.join(" ")
+    }
+}
+
+/// Dot product of two sparse vectors: sums `a[i] * b[i]` over the indices of
+/// `a`; an index missing from `b` contributes zero.
+pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {
+    a.iter()
+        .map(|(index, weight)| weight * b.get(index).copied().unwrap_or(0))
+        .sum()
+}
author	Nathan Reiner <nathan@nathanreiner.xyz>	2023-07-05 23:07:26 +0200
committer	Nathan Reiner <nathan@nathanreiner.xyz>	2023-07-05 23:07:26 +0200
commit	4d577650f737daaeb477bbbd5ae2bad4f1121c38 (patch)
tree	ac973541e0a2d7751af4ece5f7f639e739f81fcc /src