From 019d08f3441c9e499977d583bb0f8383aff50d4b Mon Sep 17 00:00:00 2001
From: Nathan Reiner <nathan@nathanreiner.xyz>
Date: Sun, 9 Jul 2023 13:12:00 +0200
Subject: introduce some minor optimizations

---
 src/dictionary.rs |  42 +++++++++++++----
 src/filecache.rs  |   2 +-
 src/gui/mod.rs    |   2 +-
 src/index.rs      | 136 ++++++++++++++++++++++++++++--------------------------
 src/main.rs       |  16 +++----
 src/splitter.rs   |   2 +-
 src/vector.rs     |   2 +-
 7 files changed, 117 insertions(+), 85 deletions(-)

(limited to 'src')
diff --git a/src/dictionary.rs b/src/dictionary.rs
index e651e71..563621d 100644
--- a/src/dictionary.rs
+++ b/src/dictionary.rs
@@ -34,19 +34,29 @@ impl Dictionary {
         Self { last_index : i - 1, data }
     }
 
-    pub fn set(&mut self, name : String) {
-        if let std::collections::hash_map::Entry::Vacant(e) = self.data.entry(name) {
+    pub fn set(&mut self, name : &String) {
+        if !self.data.contains_key(name) {
             self.last_index += 1;
-            e.insert(self.last_index as u64);
+            self.data.insert(name.clone(), self.last_index as u64);
         }
     }
 
-    pub fn get(&self, name : String) -> Option<&u64> {
-        self.data.get(&name)
+    pub fn set_and_get(&mut self, name : &String) -> u64 {
+        if !self.data.contains_key(name) {
+            self.last_index += 1;
+            self.data.insert(name.clone(), self.last_index as u64);
+            self.last_index as u64
+        } else {
+            *self.data.get(name).unwrap()
+        }
     }
 
-    pub fn iter(&self) -> &HashMap<String, u64> {
-        &self.data
+    pub fn get(&self, name : &String) -> Option<&u64> {
+        self.data.get(name)
+    }
+
+    pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, u64> {
+        self.data.iter()
     }
 
     pub fn to_list(&self) -> Vec<String> {
@@ -61,7 +71,7 @@ impl Dictionary {
         v
     }
 
-    pub fn vectorize_word_list(&self, words : Vec<String>) -> FileVector {
+    pub fn vectorize_word_list(&self, words : &Vec<&String>) -> FileVector {
         let mut fv = FileVector::new();
 
         for word in words {
@@ -76,4 +86,20 @@ impl Dictionary {
 
         fv
     }
+
+    pub fn insert_words_and_vectorize_word_list(&mut self, words : &Vec<&String>) -> FileVector {
+        let mut fv = FileVector::new();
+
+        for word in words {
+            let i = self.set_and_get(word);
+            if !fv.contains_key(&i) {
+                fv.insert(i, 1);
+            } else {
+                let c : u64 = *fv.get(&i).unwrap();
+                fv.insert(i, c + 1);
+            }
+        }
+
+        fv
+    }
 }
diff --git a/src/filecache.rs b/src/filecache.rs
index a352e58..af97c20 100644
--- a/src/filecache.rs
+++ b/src/filecache.rs
@@ -26,7 +26,7 @@ impl Hash for FileCache {
 impl FileCache {
     pub fn from_line(line : String) -> Self {
         let ls : Vec<String> = line.split(',').map(|s| s.to_string()).collect();
-        let v = FileVector::from_string(ls[1].clone());
+        let v = FileVector::from_string(&ls[1]);
         let p = ls[0].clone().replace('\0', ",");
         Self {
             vector : v,
diff --git a/src/gui/mod.rs b/src/gui/mod.rs
index 389fdb6..52cf8a7 100644
--- a/src/gui/mod.rs
+++ b/src/gui/mod.rs
@@ -83,7 +83,7 @@ async fn load_file() -> Index {
     let file = file.unwrap();
     let file = file.to_str();
     let file = file.unwrap();
-    Index::from_file(file.to_string())
+    Index::from_file(&file.to_string())
 }
 
 async fn generate() -> Index {
diff --git a/src/index.rs b/src/index.rs
index cfb612a..741eb7c 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,7 +1,8 @@
 use std::collections::{HashSet, HashMap};
 use std::fs::File;
-use std::io::{Write, BufReader, BufRead};
+use std::io::{BufWriter, BufReader, BufRead, Write};
 use std::sync::mpsc::{channel, Sender};
+use std::time::Duration;
 use walkdir::*;
 use std::thread;
 use std::option::Option::None;
@@ -50,13 +51,16 @@ impl Index {
         let mut crawler_handles = Vec::new();
         let num_threads = thread::available_parallelism().unwrap().get();
         let mut tx_vec : Vec<Sender<String>> = Vec::new();
+        let mut indexes = Vec::new();
 
         thread::scope(|s| {
             let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path)));
+            let (status_tx, status_rx) = channel();
 
             for _ in 0..num_threads {
                 let (tx, rx) = channel();
                 tx_vec.push(tx);
+                let status_tx = status_tx.clone();
                 crawler_handles.push(thread::spawn(move || {
                     let mut dict = Dictionary::new();
                     let mut filecache : Vec<FileCache> = Vec::new();
@@ -72,17 +76,14 @@ impl Index {
 
                         let content : String = text::extract_text(path.as_str());
 
+                        let _ = status_tx.send(());
+
                         if content.is_empty() {
                             continue;
                         }
 
                         let words : Vec<String> = splitter::split_to_words(content);
-
-                        for word in words.iter() {
-                            dict.set(word.clone());
-                        }
-
-                        let fv = dict.vectorize_word_list(words.clone());
+                        let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect());
                         filecache.push(FileCache {
                             path,
                             vector : fv
@@ -128,22 +129,36 @@ impl Index {
                     }
                 }
             }
-        });
 
-        let mut indexes = Vec::new();
-        let mut i = 0;
+            let join_handle = s.spawn(|| {
+                for (i, handle) in crawler_handles.into_iter().enumerate() {
+                    tx_vec[i].send(String::new()).ok();
+                    indexes.push(handle.join().unwrap());
+                }
+            });
 
-        for handle in crawler_handles {
-            callback(GenState::Parsing, (i * 100 / num_threads) as u8);
-            tx_vec[i].send(String::new()).ok();
-            indexes.push(handle.join().unwrap());
-            i += 1;
-        }
 
-        Index::merge(indexes, |p| { callback(GenState::Merging, p) })
+            let mut i = 0;
+            let mut last_p = 0;
+            while !join_handle.is_finished() {
+                if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() {
+                    i += 1;
+                    let p = i * 100 / nof;
+                    if p != last_p {
+                        callback(GenState::Parsing, p as u8);
+                        last_p = p;
+                    }
+                }
+            }
+
+            join_handle.join().ok();
+        });
+
+
+        Index::merge(indexes.iter().collect(), |p| { callback(GenState::Merging, p) })
     }
 
-    pub fn from_file(path : String) -> Self {
+    pub fn from_file(path : &String) -> Self {
         let index_file = File::open(path).expect("could not open index file");
         let reader = BufReader::new(index_file);
         let mut filecache : Vec<FileCache> = Vec::new();
@@ -164,77 +179,67 @@ impl Index {
         }
     }
 
-    fn merge_two(a : Index, b : Index) -> Self {
-        let mut filecache = a.filecache.clone();
-        let mut dictionary = Dictionary::default();
-
+    fn merge_into(&mut self, other : &Index) {
+        let mut dict = self.dictionary.clone();
         thread::scope(|s| {
-            let mut a_hash : HashSet<FileCache> = HashSet::new();
-            let mut diff : Vec<FileCache> = Vec::new();
+            let mut a_hash : HashSet<&FileCache> = HashSet::new();
+            let mut diff : Vec<&FileCache> = Vec::new();
 
             let converter_handle = s.spawn(|| {
-                let mut b_id_to_word : HashMap<u64, String> = HashMap::new();
+                let mut b_id_to_word : HashMap<u64, &String> = HashMap::new();
 
-                for (value, id) in b.dictionary.iter() {
-                    b_id_to_word.insert(id.clone(), value.clone());
+                for (value, id) in other.dictionary.iter() {
+                    b_id_to_word.insert(*id, value);
                 }
                 b_id_to_word
             });
 
             let dict_handle = s.spawn(|| {
-                let mut dict = a.dictionary.clone();
-                for (word, _) in b.dictionary.iter() {
-                    dict.set(word.clone());
+                for (word, _) in other.dictionary.iter() {
+                   dict.set(word);
                 }
                 dict
             });
 
-            for file in a.filecache.iter() {
-                a_hash.insert(file.clone());
+            for file in self.filecache.iter() {
+                a_hash.insert(file);
             }
 
-            for file in b.filecache.iter() {
+            for file in other.filecache.iter() {
                 if !a_hash.contains(file) {
-                    diff.push(file.clone());
+                    diff.push(file);
                 }
             }
 
             let b_id_to_word = converter_handle.join().unwrap();
-            dictionary = dict_handle.join().unwrap();
+            self.dictionary = dict_handle.join().unwrap();
 
             for file in diff {
-                let mut words = Vec::new();
+                let mut words : Vec<&String> = Vec::new();
 
                 for (word_id, i) in file.vector.iter() {
                     for _ in 0..*i {
-                        words.push(b_id_to_word.get(word_id).unwrap().clone());
+                        words.push(b_id_to_word.get(word_id).unwrap());
                     }
                 }
 
-                filecache.push(FileCache {
+                self.filecache.push(FileCache {
                     path : file.path.clone(),
-                    vector: dictionary.vectorize_word_list(words)
+                    vector: self.dictionary.vectorize_word_list(&words)
                 });
             }
         });
-
-        Self {
-            dictionary,
-            filecache
-        }
     }
 
-    pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
+    pub fn merge(mut indexes : Vec<&Index>, callback : impl Fn(u8)) -> Self {
         let max = indexes.len();
-        let mut i = 0 as usize;
 
         indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len()));
-        let mut merged_index = indexes.pop().unwrap();
+        let mut merged_index : Index = indexes.pop().unwrap().clone();
 
-        for index in indexes {
+        for (i, index) in indexes.into_iter().enumerate() {
             callback((i * 100 / max) as u8);
-            i += 1;
-            merged_index = Index::merge_two(merged_index, index);
+            merged_index.merge_into(index);
         }
         callback(100);
         merged_index
@@ -244,7 +249,7 @@ impl Index {
         let mut v : FileVector = FileVector::new();
 
         for arg in search_args {
-            if let Some(value) = self.dictionary.get(arg.to_string()) {
+            if let Some(value) = self.dictionary.get(&arg) {
                 v.insert(*value, 1);
             }
         }
@@ -262,19 +267,20 @@ impl Index {
         results
     }
 
-    pub fn save(&self, output : String) {
-        let mut index_file = File::create(output).unwrap();
-
-        for file in self.filecache.iter() {
-            writeln!(
-                index_file,
-                "{}, {}",
-                file.path .replace(',', "\0"),
-                file.vector.stringify()
-                ).ok();
-        }
-
-        let dict_list : Vec<String> = self.dictionary.to_list();
-        writeln!(index_file, "#{}", dict_list.join(",")).ok();
+    pub fn save(&self, path: String) {
+        thread::scope(|s| {
+            let dict_list_handle = s.spawn(|| {
+                self.dictionary.to_list().join(",")
+            });
+            let mut output : String = self.filecache.iter().map(|c| format!("{}, {}\n", c.path.replace(',', "\0"), c.vector.stringify())).collect();
+            output += "#";
+            output += dict_list_handle.join().unwrap().as_str();
+            output += "\n";
+
+            let index_file = File::create(path).expect("could not open output file");
+            let mut file = BufWriter::new(index_file);
+            file.write_all(output.as_bytes()).expect("could not write");
+            file.flush().ok();
+        });
     }
 }
diff --git a/src/main.rs b/src/main.rs
index c5cc5f6..f9d5018 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -28,7 +28,7 @@ fn main() {
 
             let input = args.get(2).unwrap();
             let file = args.get(3).unwrap();
-            let _ = Index::generate(input, |t, p| {
+            Index::generate(input, |t, p| {
                 eprint!("\r\x1b[2K{}% ", p);
                 match t {
                     GenState::Fetching => { eprint!("fetched") }
@@ -48,7 +48,7 @@ fn main() {
             let search = v.join(" ");
             let searchvec = splitter::split_to_words(search);
 
-            let idx = Index::from_file(file);
+            let idx = Index::from_file(&file);
             let results = idx.search(searchvec);
             for result in results {
                 println!("{}", result.path);
@@ -61,12 +61,12 @@ fn main() {
 
             let merged = args.get(2).unwrap().clone();
             let v : Vec<String> = args.get(3..(args.len())).unwrap().into();
-            let indexes = v.iter().map(|s| Index::from_file(s.clone())).collect();
-            let _ = Index::merge(indexes,
-                                 |p| {
-                                     eprint!("\r\x1b[2K{}% merged", p);
-                                 }
-                                 ).save(merged);
+            let indexes : Vec<Index> = v.iter().map(Index::from_file).collect();
+            Index::merge(indexes.iter().collect(),
+                         |p| {
+                             eprint!("\r\x1b[2K{}% merged", p);
+                         }
+                        ).save(merged);
         }
     } else {
         let _ = gui::run();
diff --git a/src/splitter.rs b/src/splitter.rs
index fbb2b6a..c4015e8 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -12,7 +12,7 @@ pub fn split_to_words(data : String) -> Vec<String> {
         word.retain(|c| !r#"{}[]#(),".;:?!'%|0123456789/\^"#.contains(c))
     }
 
-    v.retain(|str| !str.is_empty());
+    v.retain(|str| !str.is_empty() && !str.contains("--"));
 
     v
 }
diff --git a/src/vector.rs b/src/vector.rs
index 87be04b..dfd2d71 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -34,7 +34,7 @@ impl FileVector {
         Self { data : HashMap::new() }
     }
 
-    pub fn from_string(hex : String) -> Self {
+    pub fn from_string(hex : &str) -> Self {
         let mut data : HashMap<u64, u64> = HashMap::new();
         let data_chunks : Vec<&str> = hex.split(' ').collect();
 
-- 
cgit v1.2.3-70-g09d2