aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-05 23:07:26 +0200
committer: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-05 23:07:26 +0200
commit: 4d577650f737daaeb477bbbd5ae2bad4f1121c38 (patch)
tree: ac973541e0a2d7751af4ece5f7f639e739f81fcc
first sketch of indexer
-rw-r--r-- .gitignore 1
-rw-r--r-- Cargo.lock 67
-rw-r--r-- Cargo.toml 10
-rw-r--r-- src/dictionary.rs 69
-rw-r--r-- src/filecache.rs 18
-rw-r--r-- src/main.rs 87
-rw-r--r-- src/searchresult.rs 5
-rw-r--r-- src/splitter.rs 16
-rw-r--r-- src/text/mod.rs 31
-rw-r--r-- src/text/txt.rs 5
-rw-r--r-- src/vector.rs 59
11 files changed, 368 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..7d71608
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,67 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "indexsearch"
+version = "0.1.0"
+dependencies = [
+ "lazy_static",
+ "walkdir",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "walkdir"
+version = "2.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..f09fcf6
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "indexsearch"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+lazy_static = "1.4.0"
+walkdir = "2.3.3"
diff --git a/src/dictionary.rs b/src/dictionary.rs
new file mode 100644
index 0000000..a8d9b28
--- /dev/null
+++ b/src/dictionary.rs
@@ -0,0 +1,69 @@
+use std::collections::HashMap;
+use crate::vector::FileVector;
+
+/// Word table that assigns every distinct word a numeric id.
+///
+/// Ids are handed out sequentially by [`Dictionary::set`]. A dictionary built with
+/// [`Dictionary::new`] starts counting at 1, so id 0 is never assigned there and
+/// [`Dictionary::to_list`] leaves an empty string in slot 0.
+pub struct Dictionary {
+    /// Highest id handed out so far.
+    last_index : usize,
+    /// Maps a word to its id.
+    data : HashMap<String, u64>,
+}
+
+
+impl Default for Dictionary {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Dictionary {
+    /// Creates an empty dictionary.
+    pub fn new() -> Self {
+        Self { last_index : 0, data : HashMap::new() }
+    }
+
+    /// Rebuilds a dictionary from a comma-separated word list.
+    ///
+    /// The position of a word inside `line` becomes its id. If a word occurs more
+    /// than once, the last position wins.
+    pub fn from_line(line : &str) -> Self {
+        let mut data : HashMap<String, u64> = HashMap::new();
+        let mut last_index : usize = 0;
+
+        // `split` always yields at least one item, so `last_index` ends on a valid position
+        for (position, word) in line.split(',').enumerate() {
+            data.insert(word.to_string(), position as u64);
+            last_index = position;
+        }
+
+        Self { last_index, data }
+    }
+
+    /// Registers `name` under the next free id; does nothing if it is already known.
+    pub fn set(&mut self, name : String) {
+        // the entry API hashes `name` once instead of `contains_key` followed by `insert`
+        if let std::collections::hash_map::Entry::Vacant(slot) = self.data.entry(name) {
+            self.last_index += 1;
+            slot.insert(self.last_index as u64);
+        }
+    }
+
+    /// Returns the id of `name`.
+    ///
+    /// # Panics
+    /// Panics if `name` is not part of the dictionary.
+    pub fn get(&self, name : String) -> u64 {
+        *self.data.get(&name).expect("word is not in the dictionary")
+    }
+
+    /// Gives read access to the underlying word -> id map.
+    pub fn iter(&self) -> &HashMap<String, u64> {
+        &self.data
+    }
+
+    /// Returns all words ordered by id; ids without a word hold an empty string.
+    pub fn to_list(&self) -> Vec<String> {
+        let mut list = vec![String::new(); self.last_index + 1];
+
+        for (word, id) in self.iter() {
+            list[*id as usize] = word.clone();
+        }
+
+        list
+    }
+
+    /// Counts how often each word of `words` occurs and returns the counts keyed by word id.
+    ///
+    /// # Panics
+    /// Panics if one of the words is not part of the dictionary (see [`Dictionary::get`]).
+    pub fn vectorize_word_list(&self, words : Vec<String>) -> FileVector {
+        let mut fv = FileVector::new();
+
+        for word in words {
+            *fv.entry(self.get(word)).or_insert(0) += 1;
+        }
+
+        fv
+    }
+}
diff --git a/src/filecache.rs b/src/filecache.rs
new file mode 100644
index 0000000..c584c0b
--- /dev/null
+++ b/src/filecache.rs
@@ -0,0 +1,18 @@
+use crate::vector::FileVector;
+
+/// One file entry of the index: the file's path and its word-count vector.
+pub struct FileCache {
+    /// Word counts parsed from the hex field of the index line.
+    pub vector : FileVector,
+    /// Path of the file, with `\0` placeholders turned back into commas.
+    pub path : String,
+}
+
+impl FileCache {
+    /// Parses one index line of the form `path,hex-vector`.
+    ///
+    /// Only the first two comma-separated fields are read. A line without a second
+    /// field yields an empty vector instead of panicking.
+    pub fn from_line(line : String) -> Self {
+        let mut fields = line.split(',');
+        let raw_path = fields.next().unwrap_or("");
+        let hex = fields.next().unwrap_or("");
+
+        Self {
+            vector : FileVector::from_hex(hex.to_string()),
+            // commas inside the path are stored as NUL bytes so they cannot clash with the separator
+            path : raw_path.replace("\0", ","),
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..f0975ad
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,87 @@
+pub mod vector;
+pub mod dictionary;
+pub mod text;
+pub mod splitter;
+pub mod filecache;
+pub mod searchresult;
+
+use vector::FileVector;
+use dictionary::Dictionary;
+use filecache::FileCache;
+use searchresult::SearchResult;
+use std::fs::File;
+use std::io::{Write, BufReader, BufRead};
+use walkdir::*;
+
+/// Walks `input_path` recursively and writes a search index to `index_path`.
+///
+/// Every file that yields text produces one line `path, hex-vector`, where commas in
+/// the path are stored as `\0`. The last line is `#` followed by the comma-separated
+/// dictionary. Files whose path is not valid UTF-8 are skipped. A failed write is
+/// reported on stderr and stops the indexing run.
+///
+/// # Panics
+/// Panics if the index file cannot be created.
+fn generate_index(input_path : &str, index_path : &str) {
+    // buffered so that each index line does not go to the file on its own
+    let mut index_file = std::io::BufWriter::new(
+        File::create(index_path).expect("could not create index file"),
+    );
+    let mut dict = Dictionary::new();
+
+    for entry in WalkDir::new(input_path).into_iter().filter_map(|e| e.ok()) {
+        if !entry.path().is_file() {
+            continue;
+        }
+
+        // the index is a text file, so a path that is not valid UTF-8 cannot be stored in it
+        let path = match entry.path().to_str() {
+            Some(path) => path,
+            None => continue,
+        };
+
+        let content : String = text::extract_text(path);
+        if content.is_empty() {
+            continue;
+        }
+
+        let words : Vec<String> = splitter::split_to_words(content);
+        for word in words.iter() {
+            dict.set(word.clone());
+        }
+
+        let fv : FileVector = dict.vectorize_word_list(words);
+        if let Err(err) = writeln!(index_file, "{}, {}", path.replace(",", "\0"), fv.to_hex()) {
+            eprintln!("could not write index entry for {}: {}", path, err);
+            return;
+        }
+    }
+
+    let dict_list : Vec<String> = dict.to_list();
+    let finished = writeln!(index_file, "#{}", dict_list.join(","))
+        .and_then(|_| index_file.flush());
+    if let Err(err) = finished {
+        eprintln!("could not write dictionary to {}: {}", index_path, err);
+    }
+}
+
+/// Loads the index at `index_path` and prints every file that contains at least one of
+/// `search_args`, best match first, followed by the number of results.
+///
+/// Lines starting with `#` are read as the dictionary, blank lines are ignored and all
+/// other lines are read as file entries. Search terms that are not in the dictionary
+/// are ignored, since no indexed file can contain them.
+///
+/// # Panics
+/// Panics if the index file cannot be opened or read.
+fn search(index_path : &str, search_args : Vec<&str>) {
+    let index_file = File::open(index_path).expect("could not open index file");
+    let reader = BufReader::new(index_file);
+    let mut filecaches : Vec<FileCache> = Vec::new();
+    let mut dict = Dictionary::new();
+
+    for line in reader.lines() {
+        let l = line.expect("could not read index file");
+        if let Some(words) = l.strip_prefix('#') {
+            dict = Dictionary::from_line(words);
+        } else if !l.is_empty() {
+            filecaches.push(FileCache::from_line(l));
+        }
+    }
+
+    // query vector: every known search term counts once
+    let mut v : FileVector = FileVector::new();
+    for arg in search_args {
+        if let Some(id) = dict.iter().get(arg) {
+            v.insert(*id, 1);
+        }
+    }
+
+    let mut results : Vec<SearchResult> = filecaches
+        .iter()
+        .filter_map(|filecache| {
+            let priority = vector::scalar_product(&v, &filecache.vector);
+            (priority > 0).then(|| SearchResult { priority, path : filecache.path.clone() })
+        })
+        .collect();
+
+    // highest priority first; the stable sort keeps index order among equal priorities
+    results.sort_by(|a, b| b.priority.cmp(&a.priority));
+
+    for result in results.iter() {
+        println!("{}", result.path);
+    }
+
+    println!("{} results", results.len())
+
+}
+
+/// Entry point: rebuilds the index from a fixed directory, then runs a fixed sample query.
+fn main() {
+    let index_path = "index.idxs";
+    let query = vec!["one", "difficult", "under", "linux"];
+
+    println!("Generating Index...");
+    generate_index("/home/n8", index_path);
+
+    println!("Searching...");
+    search(index_path, query);
+}
diff --git a/src/searchresult.rs b/src/searchresult.rs
new file mode 100644
index 0000000..6a0dd30
--- /dev/null
+++ b/src/searchresult.rs
@@ -0,0 +1,5 @@
+
+/// A file that matched a query, together with its ranking score.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SearchResult {
+    /// Ranking score; `search` lists results with a higher value first.
+    pub priority : u64,
+    /// Path of the matching file.
+    pub path : String
+}
diff --git a/src/splitter.rs b/src/splitter.rs
new file mode 100644
index 0000000..64e659f
--- /dev/null
+++ b/src/splitter.rs
@@ -0,0 +1,16 @@
+use std::vec::Vec;
+
+/// Splits `data` into lowercase words.
+///
+/// The text is lowercased and cut at whitespace; digits and the listed punctuation
+/// characters are dropped from every token, and tokens that end up empty are discarded.
+pub fn split_to_words(data : String) -> Vec<String> {
+    // characters that never become part of a word
+    const STRIPPED : &str = r#"{}[]#(),".;:?!'%|0123456789/\^"#;
+
+    data.to_lowercase()
+        .split_whitespace()
+        .map(|token| token.chars().filter(|c| !STRIPPED.contains(*c)).collect::<String>())
+        .filter(|word| !word.is_empty())
+        .collect()
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
new file mode 100644
index 0000000..4e1b9a4
--- /dev/null
+++ b/src/text/mod.rs
@@ -0,0 +1,31 @@
+use lazy_static::lazy_static;
+use std::ffi::OsStr;
+use std::path::Path;
+use std::collections::HashMap;
+
+mod txt;
+
+/// Extractor that ignores its argument and always returns an empty string.
+fn empty_extractor(_ : &str) -> String {
+    String::new()
+}
+
+// Expands a module name `m` into the pair `("m", m::get_text as ExtFn)`.
+// The stringified module name is the key, so it is what `EXT` is looked up by.
+macro_rules! ext {
+ ($f:ident) => {(stringify!($f), $f::get_text as ExtFn)}
+}
+
+// Signature shared by every extractor: takes a `&str` and returns a `String`.
+type ExtFn = fn(&str) -> String;
+
+// Table of extractors, built on first access; `extract_text` looks entries up by
+// file extension. Only the `txt` module is registered here.
+lazy_static! {
+ static ref EXT: HashMap<&'static str, ExtFn> = {
+ HashMap::from([
+ ext!(txt),
+ ])
+ };
+}
+
+
+/// Returns the text content of the file at `path`, chosen by its file extension.
+///
+/// The extension is looked up in `EXT`. A missing, unknown or non-UTF-8 extension
+/// falls back to `empty_extractor`, so such files yield an empty string.
+pub fn extract_text(path : &str) -> String {
+    let extension = Path::new(path)
+        .extension()
+        .and_then(OsStr::to_str)
+        .unwrap_or("");
+    let extractor : ExtFn = EXT.get(extension).copied().unwrap_or(empty_extractor as ExtFn);
+    extractor(path)
+}
diff --git a/src/text/txt.rs b/src/text/txt.rs
new file mode 100644
index 0000000..14e7422
--- /dev/null
+++ b/src/text/txt.rs
@@ -0,0 +1,5 @@
+use std::fs;
+
+/// Reads the file at `path` as UTF-8 text; any read error yields an empty string.
+pub fn get_text(path : &str) -> String {
+    match fs::read_to_string(path) {
+        Ok(content) => content,
+        Err(_) => String::new(),
+    }
+}
diff --git a/src/vector.rs b/src/vector.rs
new file mode 100644
index 0000000..ce1139d
--- /dev/null
+++ b/src/vector.rs
@@ -0,0 +1,59 @@
+use std::collections::HashMap;
+use std::ops::{Deref, DerefMut};
+
+/// Sparse vector of `u64` values keyed by `u64` index.
+///
+/// Dereferences to the underlying `HashMap`, so all map methods are available on it.
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct FileVector {
+    data : HashMap<u64, u64>
+}
+
+impl Deref for FileVector {
+    type Target = HashMap<u64, u64>;
+    fn deref(&self) -> &Self::Target {
+        &self.data
+    }
+}
+
+impl DerefMut for FileVector {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.data
+    }
+}
+
+impl FileVector {
+    /// Creates an empty vector.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Parses the format written by [`FileVector::to_hex`]: space-separated
+    /// `index;value` pairs in hexadecimal. Empty chunks are ignored.
+    ///
+    /// # Panics
+    /// Panics if a chunk has no `;` or if one of its two fields is not valid hexadecimal.
+    pub fn from_hex(hex : String) -> Self {
+        let mut data : HashMap<u64, u64> = HashMap::new();
+
+        for chunk in hex.split(' ').filter(|chunk| !chunk.is_empty()) {
+            let mut fields = chunk.split(';');
+            let index = fields.next().expect("could not extract index");
+            let value = fields.next().expect("could not extract value");
+            data.insert(
+                u64::from_str_radix(index, 16).expect("could not extract index"),
+                u64::from_str_radix(value, 16).expect("could not extract value"),
+            );
+        }
+
+        Self { data }
+    }
+
+    /// Serializes the vector as space-separated `index;value` pairs in hexadecimal.
+    ///
+    /// Entries are written in ascending index order, so equal vectors always produce
+    /// the same string.
+    pub fn to_hex(&self) -> String {
+        let mut entries : Vec<(&u64, &u64)> = self.data.iter().collect();
+        // HashMap iteration order is arbitrary; sort to get reproducible output
+        entries.sort_unstable();
+
+        entries
+            .iter()
+            .map(|(i, v)| format!("{:x};{:x}", i, v))
+            .collect::<Vec<String>>()
+            .join(" ")
+    }
+}
+
+/// Dot product of two sparse vectors: the sum over every index of `a` of its value
+/// times the value `b` holds at the same index, with indices missing from `b` counted as 0.
+pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {
+    a.iter()
+        .map(|(index, weight)| weight * b.get(index).copied().unwrap_or(0))
+        .sum()
+}