6 files changed, 43 insertions, 248 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs
deleted file mode 100644
index 3e05b91..0000000
--- a/src/dictionary.rs
+++ /dev/null
@@ -1,105 +0,0 @@
-use std::collections::HashMap;
-use crate::vector::FileVector;
-
-/// The dictionary is used to cache to words ids.
-/// It also provides a function to convert it to
-/// a vector and generate a FileVector from a word list
-/// with the current directory.
-#[derive(Clone, Debug)]
-pub struct Dictionary {
-    last_index : usize,
-    data : HashMap<String, u64>,
-}
-
-impl Default for Dictionary {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl Dictionary {
-    pub fn new() -> Self {
-        Self { last_index : 0, data : HashMap::new() }
-    }
-
-    pub fn from_line(line : &str) -> Self {
-        let mut data : HashMap<String, u64> = HashMap::new();
-        let mut i : usize = 0;
-
-        for word in line.split(',') {
-            data.insert(word.to_string(), i as u64);
-            i += 1;
-        }
-
-        Self { last_index : i - 1, data }
-    }
-
-    pub fn set(&mut self, name : &String) {
-        if !self.data.contains_key(name) {
-            self.last_index += 1;
-            self.data.insert(name.clone(), self.last_index as u64);
-        }
-    }
-
-    pub fn set_and_get(&mut self, name : &String) -> u64 {
-        if !self.data.contains_key(name) {
-            self.last_index += 1;
-            self.data.insert(name.clone(), self.last_index as u64);
-            self.last_index as u64
-        } else {
-            *self.data.get(name).unwrap()
-        }
-    }
-
-    pub fn get(&self, name : &String) -> Option<&u64> {
-        self.data.get(name)
-    }
-
-    pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, u64> {
-        self.data.iter()
-    }
-
-    pub fn to_list(&self) -> Vec<String> {
-        let mut v = Vec::with_capacity(self.last_index + 1);
-
-        v.resize(self.last_index + 1, "".to_string());
-
-        for (word, id) in self.iter() {
-            v[(*id) as usize] = word.clone();
-        }
-
-        v
-    }
-
-    pub fn vectorize_word_list(&self, words : &Vec<&String>) -> FileVector {
-        let mut fv = FileVector::new();
-
-        for word in words {
-            let i = *self.get(word).unwrap();
-            if !fv.contains_key(&i) {
-                fv.insert(i, 1);
-            } else {
-                let c = *fv.get(&i).unwrap();
-                fv.insert(i, c + 1);
-            }
-        }
-
-        fv
-    }
-
-    pub fn insert_words_and_vectorize_word_list(&mut self, words : &Vec<&String>) -> FileVector {
-        let mut fv = FileVector::new();
-
-        for word in words {
-            let i = self.set_and_get(word);
-            if !fv.contains_key(&i) {
-                fv.insert(i, 1);
-            } else {
-                let c = *fv.get(&i).unwrap();
-                fv.insert(i, c + 1);
-            }
-        }
-
-        fv
-    }
-}
diff --git a/src/filecache.rs b/src/filecache.rs
index af97c20..721d7a4 100644
--- a/src/filecache.rs
+++ b/src/filecache.rs
@@ -1,9 +1,9 @@
 use std::hash::{Hasher, Hash};
-
+use serde::{Deserialize, Serialize};
 use crate::vector::FileVector;
 
 /// Represents one file which was indexed.
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
 pub struct FileCache {
     pub path : String,
     pub vector : FileVector,
@@ -22,15 +22,3 @@ impl Hash for FileCache {
         self.path.hash(state);
     }
 }
-
-impl FileCache {
-    pub fn from_line(line : String) -> Self {
-        let ls : Vec<String> = line.split(',').map(|s| s.to_string()).collect();
-        let v = FileVector::from_string(&ls[1]);
-        let p = ls[0].clone().replace('\0', ",");
-        Self {
-            vector : v,
-            path : p
-        }
-    }
-}
diff --git a/src/gui/mod.rs b/src/gui/mod.rs
index f56dd71..593eb65 100644
--- a/src/gui/mod.rs
+++ b/src/gui/mod.rs
@@ -210,8 +210,7 @@ impl Application for App {
                     }
                     Message::ExportResults => {
                         let file = rfd::FileDialog::new().set_title("Export to File").add_filter("Raw Text", &["txt"]).save_file();
-                        if file.is_some() {
-                            let file = file.unwrap();
+                        if let Some(file) = file {
                             let mut file = File::create(file).unwrap();
                             for result in state.results.iter() {
                                 writeln!(file, "{}", result.path).ok();
diff --git a/src/index.rs b/src/index.rs
index fd38298..8fb34fe 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,13 +1,13 @@
-use std::collections::{HashSet, HashMap};
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
 use std::fs::File;
-use std::io::{BufWriter, BufReader, BufRead, Write};
+use std::io::{BufWriter, Write};
 use std::sync::mpsc::{channel, Sender};
 use std::time::Duration;
 use walkdir::*;
 use std::thread;
 use std::option::Option::None;
 use crate::vector::FileVector;
-use crate::dictionary::Dictionary;
 use crate::filecache::FileCache;
 use crate::searchresult::SearchResult;
 use crate::filecounter::filecount;
@@ -19,7 +19,6 @@ use crate::vector;
 /// or read from a file.
 #[derive(Clone, Debug)]
 pub struct Index {
-    dictionary : Dictionary,
     filecache : Vec<FileCache>,
 }
 
@@ -40,7 +39,6 @@ pub enum GenState {
 impl Index {
     pub fn empty() -> Self {
         Self {
-            dictionary : Dictionary::new(),
             filecache : Vec::new()
         }
     }
@@ -62,14 +60,12 @@ impl Index {
                 tx_vec.push(tx);
                 let status_tx = status_tx.clone();
                 crawler_handles.push(thread::spawn(move || {
-                    let mut dict = Dictionary::new();
                     let mut filecache : Vec<FileCache> = Vec::new();
 
                     loop {
                         let path = rx.recv().unwrap();
                         if path.is_empty() {
                             return Self {
-                                dictionary : dict,
                                 filecache
                             }
                         }
@@ -83,7 +79,7 @@ impl Index {
                         }
 
                         let words : Vec<String> = splitter::split_to_words(content);
-                        let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect());
+                        let fv = FileVector::from_words(words);
                         filecache.push(FileCache {
                             path,
                             vector : fv
@@ -160,90 +156,26 @@ impl Index {
     }
 
     pub fn from_file(path : &String) -> Self {
-        let index_file = File::open(path).expect("could not open index file");
-        let reader = BufReader::new(index_file);
-        let mut filecache : Vec<FileCache> = Vec::new();
-        let mut dict = Dictionary::new();
-
-        for line in reader.lines() {
-            let l = line.unwrap();
-            if l.starts_with('#') {
-                dict = Dictionary::from_line(l.strip_prefix('#').unwrap());
-            } else {
-                filecache.push(FileCache::from_line(l));
-            }
-        }
+        let bytes = std::fs::read(path).expect("could not read index file");
+        let filecache : Vec<FileCache> = bincode::deserialize(&bytes).unwrap();
 
         Self {
-            dictionary : dict,
             filecache
         }
     }
 
-    fn merge_into(&mut self, other : Index) {
-        let mut dict = self.dictionary.clone();
-        thread::scope(|s| {
-            let mut a_hash : HashSet<&FileCache> = HashSet::new();
-            let mut diff : Vec<&FileCache> = Vec::new();
-
-            let converter_handle = s.spawn(|| {
-                let mut b_id_to_word : HashMap<u64, &String> = HashMap::new();
-
-                for (value, id) in other.dictionary.iter() {
-                    b_id_to_word.insert(*id, value);
-                }
-                b_id_to_word
-            });
-
-            let dict_handle = s.spawn(|| {
-                for (word, _) in other.dictionary.iter() {
-                   dict.set(word);
-                }
-                dict
-            });
-
-            for file in self.filecache.iter() {
-                a_hash.insert(file);
-            }
-
-            for file in other.filecache.iter() {
-                if !a_hash.contains(file) {
-                    diff.push(file);
-                }
-            }
-
-            let b_id_to_word = converter_handle.join().unwrap();
-            self.dictionary = dict_handle.join().unwrap();
-
-            for file in diff {
-                let mut words : Vec<&String> = Vec::new();
-
-                for (word_id, i) in file.vector.iter() {
-                    for _ in 0..*i {
-                        words.push(b_id_to_word.get(word_id).unwrap());
-                    }
-                }
-
-                self.filecache.push(FileCache {
-                    path : file.path.clone(),
-                    vector: self.dictionary.vectorize_word_list(&words)
-                });
-            }
-        });
-    }
-
-    pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
+    pub fn merge(indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
         let max = indexes.len();
+        let mut filecache = Vec::new();
 
-        indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len()));
-        let mut merged_index : Index = indexes.pop().unwrap();
 
         for (i, index) in indexes.into_iter().enumerate() {
             callback((i * 100 / max) as u8);
-            merged_index.merge_into(index);
+            filecache.extend(index.filecache);
         }
+
         callback(100);
-        merged_index
+        Self { filecache }
     }
 
     pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> {
@@ -251,13 +183,14 @@ impl Index {
         let mut opt : FileVector = FileVector::new();
 
         for arg in search_args {
-            let a = arg.trim_start_matches("+");
-            if let Some(value) = self.dictionary.get(&a.to_string()) {
-                if arg.chars().nth(0).unwrap() == '+' {
-                    opt.insert(*value, 1);
-                } else {
-                    v.insert(*value, 1);
-                }
+            let mut hasher = DefaultHasher::new();
+            let a = arg.trim_start_matches('+');
+            a.hash(&mut hasher);
+            let value = hasher.finish();
+            if arg.starts_with('+') {
+                opt.insert(value, 1);
+            } else {
+                v.insert(value, 1);
             }
         }
 
@@ -276,21 +209,11 @@ impl Index {
     }
 
     pub fn save(&self, path: String) {
-        thread::scope(|s| {
-            let dict_list_handle = s.spawn(|| {
-                self.dictionary.to_list().join(",")
-            });
+        let index_file = File::create(path).expect("could not open output file");
+        let mut file = BufWriter::new(index_file);
 
-            let index_file = File::create(path).expect("could not open output file");
-            let mut file = BufWriter::new(index_file);
-
-            for fc in self.filecache.iter() {
-                write!(file, "{}, {}\n", fc.path.replace(',', "\0"), fc.vector.stringify()).ok();
-            }
-
-            write!(file, "#{}\n", dict_list_handle.join().unwrap().as_str()).ok();
-            file.flush().ok();
-        });
+        file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok();
+        file.flush().ok();
     }
 
     pub fn num_files(&self) -> usize {
diff --git a/src/main.rs b/src/main.rs
index 729b40d..3f18de9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,4 @@
 pub mod vector;
-pub mod dictionary;
 pub mod text;
 pub mod splitter;
 pub mod filecache;
diff --git a/src/vector.rs b/src/vector.rs
index c058490..fa0a139 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -1,11 +1,14 @@
 use std::collections::HashMap;
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
 use std::ops::{Deref, DerefMut};
+use serde::{Deserialize, Serialize};
 
 /// Represents the content of a cached file.
 /// It is stored as a HashMap, because we do not
 /// have to store the zeros. With that we save a lot
 /// of storage.
-#[derive(Clone, Debug)]
+#[derive(Default, Clone, Debug, Deserialize, Serialize)]
 pub struct FileVector {
     data : HashMap<u64, u64>
 }
@@ -23,42 +26,30 @@ impl DerefMut for FileVector {
     }
 }
 
-impl Default for FileVector {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl FileVector {
     pub fn new() -> Self {
         Self { data : HashMap::new() }
     }
 
-    pub fn from_string(hex : &str) -> Self {
-        let mut data : HashMap<u64, u64> = HashMap::new();
-        let data_chunks : Vec<&str> = hex.split(' ').collect();
+    pub fn from_words(words: Vec<String>) -> Self {
+        let mut data = HashMap::new();
 
-        for chunk in data_chunks {
-            if !chunk.is_empty() {
-                let n : Vec<&str> = chunk.split(';').collect();
-                let i : u64 = u64::from_str_radix(n[0], 16).expect("could not extract index");
-                let v : u64 = u64::from_str_radix(n[1], 16).expect("could not extract value");
-                data.insert(i, v);
+        for word in words {
+            let mut hasher = DefaultHasher::new();
+            word.hash(&mut hasher);
+            let k = hasher.finish();
+            match data.entry(k) {
+                std::collections::hash_map::Entry::Occupied(mut e) => {
+                    e.insert(e.get() + 1);
+                }
+                std::collections::hash_map::Entry::Vacant(e) => {
+                    e.insert(1);
+                }
             }
         }
 
         Self { data }
     }
-
-    pub fn stringify(&self) -> String {
-        let mut hex = String::new();
-
-        for (i, v) in self.data.iter() {
-            hex += &format!("{:x};{:x} ", *i, *v);
-        }
-
-        hex.trim().to_string()
-    }
 }
 
 pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {