about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/dictionary.rs | 42
-rw-r--r-- src/filecache.rs | 2
-rw-r--r-- src/gui/mod.rs | 2
-rw-r--r-- src/index.rs | 134
-rw-r--r-- src/main.rs | 16
-rw-r--r-- src/splitter.rs | 2
-rw-r--r-- src/vector.rs | 2
7 files changed, 116 insertions, 84 deletions
diff --git a/src/dictionary.rs b/src/dictionary.rs
index e651e71..563621d 100644
--- a/src/dictionary.rs
+++ b/src/dictionary.rs
@@ -34,19 +34,29 @@ impl Dictionary {
Self { last_index : i - 1, data }
}
- pub fn set(&mut self, name : String) {
- if let std::collections::hash_map::Entry::Vacant(e) = self.data.entry(name) {
+ pub fn set(&mut self, name : &String) {
+ if !self.data.contains_key(name) {
self.last_index += 1;
- e.insert(self.last_index as u64);
+ self.data.insert(name.clone(), self.last_index as u64);
}
}
- pub fn get(&self, name : String) -> Option<&u64> {
- self.data.get(&name)
+ pub fn set_and_get(&mut self, name : &String) -> u64 {
+ if !self.data.contains_key(name) {
+ self.last_index += 1;
+ self.data.insert(name.clone(), self.last_index as u64);
+ self.last_index as u64
+ } else {
+ *self.data.get(name).unwrap()
+ }
}
- pub fn iter(&self) -> &HashMap<String, u64> {
- &self.data
+ pub fn get(&self, name : &String) -> Option<&u64> {
+ self.data.get(name)
+ }
+
+ pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, u64> {
+ self.data.iter()
}
pub fn to_list(&self) -> Vec<String> {
@@ -61,7 +71,7 @@ impl Dictionary {
v
}
- pub fn vectorize_word_list(&self, words : Vec<String>) -> FileVector {
+ pub fn vectorize_word_list(&self, words : &Vec<&String>) -> FileVector {
let mut fv = FileVector::new();
for word in words {
@@ -76,4 +86,20 @@ impl Dictionary {
fv
}
+
+ pub fn insert_words_and_vectorize_word_list(&mut self, words : &Vec<&String>) -> FileVector {
+ let mut fv = FileVector::new();
+
+ for word in words {
+ let i = self.set_and_get(word);
+ if !fv.contains_key(&i) {
+ fv.insert(i, 1);
+ } else {
+ let c : u64 = *fv.get(&i).unwrap();
+ fv.insert(i, c + 1);
+ }
+ }
+
+ fv
+ }
}
diff --git a/src/filecache.rs b/src/filecache.rs
index a352e58..af97c20 100644
--- a/src/filecache.rs
+++ b/src/filecache.rs
@@ -26,7 +26,7 @@ impl Hash for FileCache {
impl FileCache {
pub fn from_line(line : String) -> Self {
let ls : Vec<String> = line.split(',').map(|s| s.to_string()).collect();
- let v = FileVector::from_string(ls[1].clone());
+ let v = FileVector::from_string(&ls[1]);
let p = ls[0].clone().replace('\0', ",");
Self {
vector : v,
diff --git a/src/gui/mod.rs b/src/gui/mod.rs
index 389fdb6..52cf8a7 100644
--- a/src/gui/mod.rs
+++ b/src/gui/mod.rs
@@ -83,7 +83,7 @@ async fn load_file() -> Index {
let file = file.unwrap();
let file = file.to_str();
let file = file.unwrap();
- Index::from_file(file.to_string())
+ Index::from_file(&file.to_string())
}
async fn generate() -> Index {
diff --git a/src/index.rs b/src/index.rs
index cfb612a..741eb7c 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,7 +1,8 @@
use std::collections::{HashSet, HashMap};
use std::fs::File;
-use std::io::{Write, BufReader, BufRead};
+use std::io::{BufWriter, BufReader, BufRead, Write};
use std::sync::mpsc::{channel, Sender};
+use std::time::Duration;
use walkdir::*;
use std::thread;
use std::option::Option::None;
@@ -50,13 +51,16 @@ impl Index {
let mut crawler_handles = Vec::new();
let num_threads = thread::available_parallelism().unwrap().get();
let mut tx_vec : Vec<Sender<String>> = Vec::new();
+ let mut indexes = Vec::new();
thread::scope(|s| {
let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path)));
+ let (status_tx, status_rx) = channel();
for _ in 0..num_threads {
let (tx, rx) = channel();
tx_vec.push(tx);
+ let status_tx = status_tx.clone();
crawler_handles.push(thread::spawn(move || {
let mut dict = Dictionary::new();
let mut filecache : Vec<FileCache> = Vec::new();
@@ -72,17 +76,14 @@ impl Index {
let content : String = text::extract_text(path.as_str());
+ let _ = status_tx.send(());
+
if content.is_empty() {
continue;
}
let words : Vec<String> = splitter::split_to_words(content);
-
- for word in words.iter() {
- dict.set(word.clone());
- }
-
- let fv = dict.vectorize_word_list(words.clone());
+ let fv = dict.insert_words_and_vectorize_word_list(&words.iter().collect());
filecache.push(FileCache {
path,
vector : fv
@@ -128,22 +129,36 @@ impl Index {
}
}
}
- });
- let mut indexes = Vec::new();
- let mut i = 0;
+ let join_handle = s.spawn(|| {
+ for (i, handle) in crawler_handles.into_iter().enumerate() {
+ tx_vec[i].send(String::new()).ok();
+ indexes.push(handle.join().unwrap());
+ }
+ });
- for handle in crawler_handles {
- callback(GenState::Parsing, (i * 100 / num_threads) as u8);
- tx_vec[i].send(String::new()).ok();
- indexes.push(handle.join().unwrap());
- i += 1;
- }
- Index::merge(indexes, |p| { callback(GenState::Merging, p) })
+ let mut i = 0;
+ let mut last_p = 0;
+ while !join_handle.is_finished() {
+ if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() {
+ i += 1;
+ let p = i * 100 / nof;
+ if p != last_p {
+ callback(GenState::Parsing, p as u8);
+ last_p = p;
+ }
+ }
+ }
+
+ join_handle.join().ok();
+ });
+
+
+ Index::merge(indexes.iter().collect(), |p| { callback(GenState::Merging, p) })
}
- pub fn from_file(path : String) -> Self {
+ pub fn from_file(path : &String) -> Self {
let index_file = File::open(path).expect("could not open index file");
let reader = BufReader::new(index_file);
let mut filecache : Vec<FileCache> = Vec::new();
@@ -164,77 +179,67 @@ impl Index {
}
}
- fn merge_two(a : Index, b : Index) -> Self {
- let mut filecache = a.filecache.clone();
- let mut dictionary = Dictionary::default();
-
+ fn merge_into(&mut self, other : &Index) {
+ let mut dict = self.dictionary.clone();
thread::scope(|s| {
- let mut a_hash : HashSet<FileCache> = HashSet::new();
- let mut diff : Vec<FileCache> = Vec::new();
+ let mut a_hash : HashSet<&FileCache> = HashSet::new();
+ let mut diff : Vec<&FileCache> = Vec::new();
let converter_handle = s.spawn(|| {
- let mut b_id_to_word : HashMap<u64, String> = HashMap::new();
+ let mut b_id_to_word : HashMap<u64, &String> = HashMap::new();
- for (value, id) in b.dictionary.iter() {
- b_id_to_word.insert(id.clone(), value.clone());
+ for (value, id) in other.dictionary.iter() {
+ b_id_to_word.insert(*id, value);
}
b_id_to_word
});
let dict_handle = s.spawn(|| {
- let mut dict = a.dictionary.clone();
- for (word, _) in b.dictionary.iter() {
- dict.set(word.clone());
+ for (word, _) in other.dictionary.iter() {
+ dict.set(word);
}
dict
});
- for file in a.filecache.iter() {
- a_hash.insert(file.clone());
+ for file in self.filecache.iter() {
+ a_hash.insert(file);
}
- for file in b.filecache.iter() {
+ for file in other.filecache.iter() {
if !a_hash.contains(file) {
- diff.push(file.clone());
+ diff.push(file);
}
}
let b_id_to_word = converter_handle.join().unwrap();
- dictionary = dict_handle.join().unwrap();
+ self.dictionary = dict_handle.join().unwrap();
for file in diff {
- let mut words = Vec::new();
+ let mut words : Vec<&String> = Vec::new();
for (word_id, i) in file.vector.iter() {
for _ in 0..*i {
- words.push(b_id_to_word.get(word_id).unwrap().clone());
+ words.push(b_id_to_word.get(word_id).unwrap());
}
}
- filecache.push(FileCache {
+ self.filecache.push(FileCache {
path : file.path.clone(),
- vector: dictionary.vectorize_word_list(words)
+ vector: self.dictionary.vectorize_word_list(&words)
});
}
});
-
- Self {
- dictionary,
- filecache
- }
}
- pub fn merge(mut indexes : Vec<Index>, callback : impl Fn(u8)) -> Self {
+ pub fn merge(mut indexes : Vec<&Index>, callback : impl Fn(u8)) -> Self {
let max = indexes.len();
- let mut i = 0 as usize;
indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len()));
- let mut merged_index = indexes.pop().unwrap();
+ let mut merged_index : Index = indexes.pop().unwrap().clone();
- for index in indexes {
+ for (i, index) in indexes.into_iter().enumerate() {
callback((i * 100 / max) as u8);
- i += 1;
- merged_index = Index::merge_two(merged_index, index);
+ merged_index.merge_into(index);
}
callback(100);
merged_index
@@ -244,7 +249,7 @@ impl Index {
let mut v : FileVector = FileVector::new();
for arg in search_args {
- if let Some(value) = self.dictionary.get(arg.to_string()) {
+ if let Some(value) = self.dictionary.get(&arg) {
v.insert(*value, 1);
}
}
@@ -262,19 +267,20 @@ impl Index {
results
}
- pub fn save(&self, output : String) {
- let mut index_file = File::create(output).unwrap();
-
- for file in self.filecache.iter() {
- writeln!(
- index_file,
- "{}, {}",
- file.path .replace(',', "\0"),
- file.vector.stringify()
- ).ok();
- }
+ pub fn save(&self, path: String) {
+ thread::scope(|s| {
+ let dict_list_handle = s.spawn(|| {
+ self.dictionary.to_list().join(",")
+ });
+ let mut output : String = self.filecache.iter().map(|c| format!("{}, {}\n", c.path.replace(',', "\0"), c.vector.stringify())).collect();
+ output += "#";
+ output += dict_list_handle.join().unwrap().as_str();
+ output += "\n";
- let dict_list : Vec<String> = self.dictionary.to_list();
- writeln!(index_file, "#{}", dict_list.join(",")).ok();
+ let index_file = File::create(path).expect("could not open output file");
+ let mut file = BufWriter::new(index_file);
+ file.write_all(output.as_bytes()).expect("could not write");
+ file.flush().ok();
+ });
}
}
diff --git a/src/main.rs b/src/main.rs
index c5cc5f6..f9d5018 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -28,7 +28,7 @@ fn main() {
let input = args.get(2).unwrap();
let file = args.get(3).unwrap();
- let _ = Index::generate(input, |t, p| {
+ Index::generate(input, |t, p| {
eprint!("\r\x1b[2K{}% ", p);
match t {
GenState::Fetching => { eprint!("fetched") }
@@ -48,7 +48,7 @@ fn main() {
let search = v.join(" ");
let searchvec = splitter::split_to_words(search);
- let idx = Index::from_file(file);
+ let idx = Index::from_file(&file);
let results = idx.search(searchvec);
for result in results {
println!("{}", result.path);
@@ -61,12 +61,12 @@ fn main() {
let merged = args.get(2).unwrap().clone();
let v : Vec<String> = args.get(3..(args.len())).unwrap().into();
- let indexes = v.iter().map(|s| Index::from_file(s.clone())).collect();
- let _ = Index::merge(indexes,
- |p| {
- eprint!("\r\x1b[2K{}% merged", p);
- }
- ).save(merged);
+ let indexes : Vec<Index> = v.iter().map(Index::from_file).collect();
+ Index::merge(indexes.iter().collect(),
+ |p| {
+ eprint!("\r\x1b[2K{}% merged", p);
+ }
+ ).save(merged);
}
} else {
let _ = gui::run();
diff --git a/src/splitter.rs b/src/splitter.rs
index fbb2b6a..c4015e8 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -12,7 +12,7 @@ pub fn split_to_words(data : String) -> Vec<String> {
word.retain(|c| !r#"{}[]#(),".;:?!'%|0123456789/\^"#.contains(c))
}
- v.retain(|str| !str.is_empty());
+ v.retain(|str| !str.is_empty() && !str.contains("--"));
v
}
diff --git a/src/vector.rs b/src/vector.rs
index 87be04b..dfd2d71 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -34,7 +34,7 @@ impl FileVector {
Self { data : HashMap::new() }
}
- pub fn from_string(hex : String) -> Self {
+ pub fn from_string(hex : &str) -> Self {
let mut data : HashMap<u64, u64> = HashMap::new();
let data_chunks : Vec<&str> = hex.split(' ').collect();