aboutsummaryrefslogtreecommitdiff
path: root/src/index.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/index.rs')
-rw-r--r--src/index.rs134
1 files changed, 70 insertions, 64 deletions
diff --git a/src/index.rs b/src/index.rs
index cfb612a..741eb7c 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,7 +1,8 @@
use std::collections::{HashSet, HashMap};
use std::fs::File;
-use std::io::{Write, BufReader, BufRead};
+use std::io::{BufWriter, BufReader, BufRead, Write};
use std::sync::mpsc::{channel, Sender};
+use std::time::Duration;
use walkdir::*;
use std::thread;
use std::option::Option::None;
@@ -50,13 +51,16 @@ impl Index {
let mut crawler_handles = Vec::new();
let num_threads = thread::available_parallelism().unwrap().get();
let mut tx_vec : Vec<Sender<String>> = Vec::new();
+ let mut indexes = Vec::new();
thread::scope(|s| {
let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path)));
+ let (status_tx, status_rx) = channel();
for _ in 0..num_threads {
let (tx, rx) = channel();
tx_vec.push(tx);
+ let status_tx = status_tx.clone();
crawler_handles.push(thread::spawn(move || {
let mut dict = Dictionary::new();
let mut filecache : Vec<FileCache> = Vec::new();
@@ -72,17 +76,14 @@ impl Index {
let content : String = text::extract_text(path.as_str());
+ let _ = status_tx.send(());
+
if content.is_empty() {
continue;
}
let words : Vec<String> = splitter::split_to_words(content);
-
- for word in words.iter() {
- dict.set(word.clone());
- }
-
- let fv = dict.vectorize_word_list(words.clone());
+ let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect());
filecache.push(FileCache {
path,
vector : fv
@@ -128,22 +129,36 @@ impl Index {
}
}
}
- });
- let mut indexes = Vec::new();
- let mut i = 0;
+ let join_handle = s.spawn(|| {
+ for (i, handle) in crawler_handles.into_iter().enumerate() {
+ tx_vec[i].send(String::new()).ok();
+ indexes.push(handle.join().unwrap());
+ }
+ });
- for handle in crawler_handles {
- callback(GenState::Parsing, (i * 100 / num_threads) as u8);
- tx_vec[i].send(String::new()).ok();
- indexes.push(handle.join().unwrap());
- i += 1;
- }
- Index::merge(indexes, |p| { callback(GenState::Merging, p) })
+ let mut i = 0;
+ let mut last_p = 0;
+ while !join_handle.is_finished() {
+ if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() {
+ i += 1;
+ let p = i * 100 / nof;
+ if p != last_p {
+ callback(GenState::Parsing, p as u8);
+ last_p = p;
+ }
+ }
+ }
+
+ join_handle.join().ok();
+ });
+
+
+ Index::merge(indexes.iter().collect(), |p| { callback(GenState::Merging, p) })
}
- pub fn from_file(path : String) -> Self {
+ pub fn from_file(path : &String) -> Self {
let index_file = File::open(path).expect("could not open index file");
let reader = BufReader::new(index_file);
let mut filecache : Vec<FileCache> = Vec::new();
@@ -164,77 +179,67 @@ impl Index {
}
}
- fn merge_two(a : Index, b : Index) -> Self {
- let mut filecache = a.filecache.clone();
- let mut dictionary = Dictionary::default();
-
+ fn merge_into(&mut self, other : &Index) {
+ let mut dict = self.dictionary.clone();
thread::scope(|s| {
- let mut a_hash : HashSet<FileCache> = HashSet::new();
- let mut diff : Vec<FileCache> = Vec::new();
+ let mut a_hash : HashSet<&FileCache> = HashSet::new();
+ let mut diff : Vec<&FileCache> = Vec::new();
let converter_handle = s.spawn(|| {
- let mut b_id_to_word : HashMap<u64, String> = HashMap::new();
+ let mut b_id_to_word : HashMap<u64, &String> = HashMap::new();
- for (value, id) in b.dictionary.iter() {
- b_id_to_word.insert(id.clone(), value.clone());
+ for (value, id) in other.dictionary.iter() {
+ b_id_to_word.insert(*id, value);
}
b_id_to_word
});
let dict_handle = s.spawn(|| {
- let mut dict = a.dictionary.clone();
- for (word, _) in b.dictionary.iter() {
- dict.set(word.clone());
+ for (word, _) in other.dictionary.iter() {
+ dict.set(word);
}
dict
});
- for file in a.filecache.iter() {
- a_hash.insert(file.clone());
+ for file in self.filecache.iter() {
+ a_hash.insert(file);
}
- for file in b.filecache.iter() {
+ for file in other.filecache.iter() {
if !a_hash.contains(file) {
- diff.push(file.clone());
+ diff.push(file);
}
}
let b_id_to_word = converter_handle.join().unwrap();
- dictionary = dict_handle.join().unwrap();
+ self.dictionary = dict_handle.join().unwrap();
for file in diff {
- let mut words = Vec::new();
+ let mut words : Vec<&String> = Vec::new();
for (word_id, i) in file.vector.iter() {
for _ in 0..*i {
- words.push(b_id_to_word.get(word_id).unwrap().clone());
+ words.push(b_id_to_word.get(word_id).unwrap());
}
}
- filecache.push(FileCache {
+ self.filecache.push(FileCache {
path : file.path.clone(),
- vector: dictionary.vectorize_word_list(words)
+ vector: self.dictionary.vectorize_word_list(&words)
});
}
});
-
- Self {
- dictionary,
- filecache
- }
}
- pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
+ pub fn merge(mut indexes : Vec<&Index>, callback : impl Fn(u8)) -> Self {
let max = indexes.len();
- let mut i = 0 as usize;
indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len()));
- let mut merged_index = indexes.pop().unwrap();
+ let mut merged_index : Index = indexes.pop().unwrap().clone();
- for index in indexes {
+ for (i, index) in indexes.into_iter().enumerate() {
callback((i * 100 / max) as u8);
- i += 1;
- merged_index = Index::merge_two(merged_index, index);
+ merged_index.merge_into(index);
}
callback(100);
merged_index
@@ -244,7 +249,7 @@ impl Index {
let mut v : FileVector = FileVector::new();
for arg in search_args {
- if let Some(value) = self.dictionary.get(arg.to_string()) {
+ if let Some(value) = self.dictionary.get(&arg) {
v.insert(*value, 1);
}
}
@@ -262,19 +267,20 @@ impl Index {
results
}
- pub fn save(&self, output : String) {
- let mut index_file = File::create(output).unwrap();
-
- for file in self.filecache.iter() {
- writeln!(
- index_file,
- "{}, {}",
- file.path .replace(',', "\0"),
- file.vector.stringify()
- ).ok();
- }
+ pub fn save(&self, path: String) {
+ thread::scope(|s| {
+ let dict_list_handle = s.spawn(|| {
+ self.dictionary.to_list().join(",")
+ });
+ let mut output : String = self.filecache.iter().map(|c| format!("{}, {}\n", c.path.replace(',', "\0"), c.vector.stringify())).collect();
+ output += "#";
+ output += dict_list_handle.join().unwrap().as_str();
+ output += "\n";
- let dict_list : Vec<String> = self.dictionary.to_list();
- writeln!(index_file, "#{}", dict_list.join(",")).ok();
+ let index_file = File::create(path).expect("could not open output file");
+ let mut file = BufWriter::new(index_file);
+ file.write_all(output.as_bytes()).expect("could not write");
+ file.flush().ok();
+ });
}
}