about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--  src/dictionary.rs  105
-rw-r--r--  src/filecache.rs    16
-rw-r--r--  src/gui/mod.rs       3
-rw-r--r--  src/index.rs       123
-rw-r--r--  src/main.rs          1
-rw-r--r--  src/vector.rs       43
6 files changed, 43 insertions, 248 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs
deleted file mode 100644
index 3e05b91..0000000
--- a/src/dictionary.rs
+++ /dev/null
@@ -1,105 +0,0 @@
-use std::collections::HashMap;
-use crate::vector::FileVector;
-
-/// The dictionary is used to cache to words ids.
-/// It also provides a function to convert it to
-/// a vector and generate a FileVector from a word list
-/// with the current directory.
-#[derive(Clone, Debug)]
-pub struct Dictionary {
- last_index : usize,
- data : HashMap<String, u64>,
-}
-
-impl Default for Dictionary {
- fn default() -> Self {
- Self::new()
- }
-}
-
-impl Dictionary {
- pub fn new() -> Self {
- Self { last_index : 0, data : HashMap::new() }
- }
-
- pub fn from_line(line : &str) -> Self {
- let mut data : HashMap<String, u64> = HashMap::new();
- let mut i : usize = 0;
-
- for word in line.split(',') {
- data.insert(word.to_string(), i as u64);
- i += 1;
- }
-
- Self { last_index : i - 1, data }
- }
-
- pub fn set(&mut self, name : &String) {
- if !self.data.contains_key(name) {
- self.last_index += 1;
- self.data.insert(name.clone(), self.last_index as u64);
- }
- }
-
- pub fn set_and_get(&mut self, name : &String) -> u64 {
- if !self.data.contains_key(name) {
- self.last_index += 1;
- self.data.insert(name.clone(), self.last_index as u64);
- self.last_index as u64
- } else {
- *self.data.get(name).unwrap()
- }
- }
-
- pub fn get(&self, name : &String) -> Option<&u64> {
- self.data.get(name)
- }
-
- pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, u64> {
- self.data.iter()
- }
-
- pub fn to_list(&self) -> Vec<String> {
- let mut v = Vec::with_capacity(self.last_index + 1);
-
- v.resize(self.last_index + 1, "".to_string());
-
- for (word, id) in self.iter() {
- v[(*id) as usize] = word.clone();
- }
-
- v
- }
-
- pub fn vectorize_word_list(&self, words : &Vec<&String>) -> FileVector {
- let mut fv = FileVector::new();
-
- for word in words {
- let i = *self.get(word).unwrap();
- if !fv.contains_key(&i) {
- fv.insert(i, 1);
- } else {
- let c = *fv.get(&i).unwrap();
- fv.insert(i, c + 1);
- }
- }
-
- fv
- }
-
- pub fn insert_words_and_vectorize_word_list(&mut self, words : &Vec<&String>) -> FileVector {
- let mut fv = FileVector::new();
-
- for word in words {
- let i = self.set_and_get(word);
- if !fv.contains_key(&i) {
- fv.insert(i, 1);
- } else {
- let c = *fv.get(&i).unwrap();
- fv.insert(i, c + 1);
- }
- }
-
- fv
- }
-}
diff --git a/src/filecache.rs b/src/filecache.rs
index af97c20..721d7a4 100644
--- a/src/filecache.rs
+++ b/src/filecache.rs
@@ -1,9 +1,9 @@
use std::hash::{Hasher, Hash};
-
+use serde::{Deserialize, Serialize};
use crate::vector::FileVector;
/// Represents one file which was indexed.
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct FileCache {
pub path : String,
pub vector : FileVector,
@@ -22,15 +22,3 @@ impl Hash for FileCache {
self.path.hash(state);
}
}
-
-impl FileCache {
- pub fn from_line(line : String) -> Self {
- let ls : Vec<String> = line.split(',').map(|s| s.to_string()).collect();
- let v = FileVector::from_string(&ls[1]);
- let p = ls[0].clone().replace('\0', ",");
- Self {
- vector : v,
- path : p
- }
- }
-}
diff --git a/src/gui/mod.rs b/src/gui/mod.rs
index f56dd71..593eb65 100644
--- a/src/gui/mod.rs
+++ b/src/gui/mod.rs
@@ -210,8 +210,7 @@ impl Application for App {
}
Message::ExportResults => {
let file = rfd::FileDialog::new().set_title("Export to File").add_filter("Raw Text", &["txt"]).save_file();
- if file.is_some() {
- let file = file.unwrap();
+ if let Some(file) = file {
let mut file = File::create(file).unwrap();
for result in state.results.iter() {
writeln!(file, "{}", result.path).ok();
diff --git a/src/index.rs b/src/index.rs
index fd38298..8fb34fe 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,13 +1,13 @@
-use std::collections::{HashSet, HashMap};
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
use std::fs::File;
-use std::io::{BufWriter, BufReader, BufRead, Write};
+use std::io::{BufWriter, Write};
use std::sync::mpsc::{channel, Sender};
use std::time::Duration;
use walkdir::*;
use std::thread;
use std::option::Option::None;
use crate::vector::FileVector;
-use crate::dictionary::Dictionary;
use crate::filecache::FileCache;
use crate::searchresult::SearchResult;
use crate::filecounter::filecount;
@@ -19,7 +19,6 @@ use crate::vector;
/// or read from a file.
#[derive(Clone, Debug)]
pub struct Index {
- dictionary : Dictionary,
filecache : Vec<FileCache>,
}
@@ -40,7 +39,6 @@ pub enum GenState {
impl Index {
pub fn empty() -> Self {
Self {
- dictionary : Dictionary::new(),
filecache : Vec::new()
}
}
@@ -62,14 +60,12 @@ impl Index {
tx_vec.push(tx);
let status_tx = status_tx.clone();
crawler_handles.push(thread::spawn(move || {
- let mut dict = Dictionary::new();
let mut filecache : Vec<FileCache> = Vec::new();
loop {
let path = rx.recv().unwrap();
if path.is_empty() {
return Self {
- dictionary : dict,
filecache
}
}
@@ -83,7 +79,7 @@ impl Index {
}
let words : Vec<String> = splitter::split_to_words(content);
- let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect());
+ let fv = FileVector::from_words(words);
filecache.push(FileCache {
path,
vector : fv
@@ -160,90 +156,26 @@ impl Index {
}
pub fn from_file(path : &String) -> Self {
- let index_file = File::open(path).expect("could not open index file");
- let reader = BufReader::new(index_file);
- let mut filecache : Vec<FileCache> = Vec::new();
- let mut dict = Dictionary::new();
-
- for line in reader.lines() {
- let l = line.unwrap();
- if l.starts_with('#') {
- dict = Dictionary::from_line(l.strip_prefix('#').unwrap());
- } else {
- filecache.push(FileCache::from_line(l));
- }
- }
+ let bytes = std::fs::read(path).expect("could not read index file");
+ let filecache : Vec<FileCache> = bincode::deserialize(&bytes).unwrap();
Self {
- dictionary : dict,
filecache
}
}
- fn merge_into(&mut self, other : Index) {
- let mut dict = self.dictionary.clone();
- thread::scope(|s| {
- let mut a_hash : HashSet<&FileCache> = HashSet::new();
- let mut diff : Vec<&FileCache> = Vec::new();
-
- let converter_handle = s.spawn(|| {
- let mut b_id_to_word : HashMap<u64, &String> = HashMap::new();
-
- for (value, id) in other.dictionary.iter() {
- b_id_to_word.insert(*id, value);
- }
- b_id_to_word
- });
-
- let dict_handle = s.spawn(|| {
- for (word, _) in other.dictionary.iter() {
- dict.set(word);
- }
- dict
- });
-
- for file in self.filecache.iter() {
- a_hash.insert(file);
- }
-
- for file in other.filecache.iter() {
- if !a_hash.contains(file) {
- diff.push(file);
- }
- }
-
- let b_id_to_word = converter_handle.join().unwrap();
- self.dictionary = dict_handle.join().unwrap();
-
- for file in diff {
- let mut words : Vec<&String> = Vec::new();
-
- for (word_id, i) in file.vector.iter() {
- for _ in 0..*i {
- words.push(b_id_to_word.get(word_id).unwrap());
- }
- }
-
- self.filecache.push(FileCache {
- path : file.path.clone(),
- vector: self.dictionary.vectorize_word_list(&words)
- });
- }
- });
- }
-
- pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
+ pub fn merge(indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
let max = indexes.len();
+ let mut filecache = Vec::new();
- indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len()));
- let mut merged_index : Index = indexes.pop().unwrap();
for (i, index) in indexes.into_iter().enumerate() {
callback((i * 100 / max) as u8);
- merged_index.merge_into(index);
+ filecache.extend(index.filecache);
}
+
callback(100);
- merged_index
+ Self { filecache }
}
pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> {
@@ -251,13 +183,14 @@ impl Index {
let mut opt : FileVector = FileVector::new();
for arg in search_args {
- let a = arg.trim_start_matches("+");
- if let Some(value) = self.dictionary.get(&a.to_string()) {
- if arg.chars().nth(0).unwrap() == '+' {
- opt.insert(*value, 1);
- } else {
- v.insert(*value, 1);
- }
+ let mut hasher = DefaultHasher::new();
+ let a = arg.trim_start_matches('+');
+ a.hash(&mut hasher);
+ let value = hasher.finish();
+ if arg.starts_with('+') {
+ opt.insert(value, 1);
+ } else {
+ v.insert(value, 1);
}
}
@@ -276,21 +209,11 @@ impl Index {
}
pub fn save(&self, path: String) {
- thread::scope(|s| {
- let dict_list_handle = s.spawn(|| {
- self.dictionary.to_list().join(",")
- });
+ let index_file = File::create(path).expect("could not open output file");
+ let mut file = BufWriter::new(index_file);
- let index_file = File::create(path).expect("could not open output file");
- let mut file = BufWriter::new(index_file);
-
- for fc in self.filecache.iter() {
- write!(file, "{}, {}\n", fc.path.replace(',', "\0"), fc.vector.stringify()).ok();
- }
-
- write!(file, "#{}\n", dict_list_handle.join().unwrap().as_str()).ok();
- file.flush().ok();
- });
+ file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok();
+ file.flush().ok();
}
pub fn num_files(&self) -> usize {
diff --git a/src/main.rs b/src/main.rs
index 729b40d..3f18de9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,4 @@
pub mod vector;
-pub mod dictionary;
pub mod text;
pub mod splitter;
pub mod filecache;
diff --git a/src/vector.rs b/src/vector.rs
index c058490..fa0a139 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -1,11 +1,14 @@
use std::collections::HashMap;
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
use std::ops::{Deref, DerefMut};
+use serde::{Deserialize, Serialize};
/// Represents the content of a cached file.
/// It is stored as a HashMap, because we do not
/// have to store the zeros. With that we save a lot
/// of storage.
-#[derive(Clone, Debug)]
+#[derive(Default, Clone, Debug, Deserialize, Serialize)]
pub struct FileVector {
data : HashMap<u64, u64>
}
@@ -23,42 +26,30 @@ impl DerefMut for FileVector {
}
}
-impl Default for FileVector {
- fn default() -> Self {
- Self::new()
- }
-}
-
impl FileVector {
pub fn new() -> Self {
Self { data : HashMap::new() }
}
- pub fn from_string(hex : &str) -> Self {
- let mut data : HashMap<u64, u64> = HashMap::new();
- let data_chunks : Vec<&str> = hex.split(' ').collect();
+ pub fn from_words(words: Vec<String>) -> Self {
+ let mut data = HashMap::new();
- for chunk in data_chunks {
- if !chunk.is_empty() {
- let n : Vec<&str> = chunk.split(';').collect();
- let i : u64 = u64::from_str_radix(n[0], 16).expect("could not extract index");
- let v : u64 = u64::from_str_radix(n[1], 16).expect("could not extract value");
- data.insert(i, v);
+ for word in words {
+ let mut hasher = DefaultHasher::new();
+ word.hash(&mut hasher);
+ let k = hasher.finish();
+ match data.entry(k) {
+ std::collections::hash_map::Entry::Occupied(mut e) => {
+ e.insert(e.get() + 1);
+ }
+ std::collections::hash_map::Entry::Vacant(e) => {
+ e.insert(1);
+ }
}
}
Self { data }
}
-
- pub fn stringify(&self) -> String {
- let mut hex = String::new();
-
- for (i, v) in self.data.iter() {
- hex += &format!("{:x};{:x} ", *i, *v);
- }
-
- hex.trim().to_string()
- }
}
pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {