about summary refs log tree commit diff
path: root/src/index.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/index.rs')
-rw-r--r-- src/index.rs 153
1 files changed, 52 insertions, 101 deletions
diff --git a/src/index.rs b/src/index.rs
index 8fb34fe..cc86f0c 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,16 +1,13 @@
-use std::collections::hash_map::DefaultHasher;
-use std::hash::{Hash, Hasher};
+use std::collections::HashMap;
+use std::hash::Hash;
use std::fs::File;
-use std::io::{BufWriter, Write};
-use std::sync::mpsc::{channel, Sender};
-use std::time::Duration;
-use walkdir::*;
+use std::io::BufWriter;
use std::thread;
-use std::option::Option::None;
-use crate::vector::FileVector;
+use walkdir::*;
+use hash32::Hasher;
+use crate::vector::{FileVector, Indexer, Count};
use crate::filecache::FileCache;
use crate::searchresult::SearchResult;
-use crate::filecounter::filecount;
use crate::text;
use crate::splitter;
use crate::vector;
@@ -44,100 +41,55 @@ impl Index {
}
pub fn generate(input_path : &str, callback : impl Fn(GenState, u8)) -> Self {
- let mut nof = 1;
- let mut counter = 0;
- let mut crawler_handles = Vec::new();
+ let mut nof : usize = 0;
let num_threads = thread::available_parallelism().unwrap().get();
- let mut tx_vec : Vec<Sender<String>> = Vec::new();
- let mut indexes = Vec::new();
-
- thread::scope(|s| {
- let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path)));
- let (status_tx, status_rx) = channel();
+ let mut paths = Vec::new();
+ let (result_tx, result_rx) = std::sync::mpsc::channel();
- for _ in 0..num_threads {
- let (tx, rx) = channel();
- tx_vec.push(tx);
- let status_tx = status_tx.clone();
- crawler_handles.push(thread::spawn(move || {
- let mut filecache : Vec<FileCache> = Vec::new();
-
- loop {
- let path = rx.recv().unwrap();
- if path.is_empty() {
- return Self {
- filecache
- }
- }
+ callback(GenState::Fetching, 0);
+ for entry in WalkDir::new(input_path)
+ .into_iter()
+ .filter_map(|e| e.ok()) {
+ if entry.path().is_file() {
+ nof += 1;
+ paths.push(entry.path().to_str().unwrap().to_string());
+ }
+ }
+ callback(GenState::Fetching, 100);
- let content : String = text::extract_text(path.as_str());
+ let chunks = paths.chunks(paths.len() / num_threads);
+ let mut filecache = Vec::with_capacity(nof);
- let _ = status_tx.send(());
+ thread::scope(|s| {
+ for chunk in chunks {
+ let result_tx = result_tx.clone();
+ s.spawn(move || {
+ for path in chunk {
+ let content : String = text::extract_text(path);
if content.is_empty() {
+ result_tx.send(FileCache {
+ path: "".to_string(),
+ vector : FileVector::default()
+ }).ok();
continue;
}
let words : Vec<String> = splitter::split_to_words(content);
let fv = FileVector::from_words(words);
- filecache.push(FileCache {
- path,
+ result_tx.send(FileCache {
+ path: "".to_string(),
vector : fv
- });
- }
- }));
- }
-
- let mut next_crawler = 0;
- let mut last_p = 0;
-
- for entry in WalkDir::new(input_path)
- .into_iter()
- .filter_map(|e| e.ok()) {
- counter += 1;
- if entry.path().is_file() {
- tx_vec[next_crawler].send(entry.path().to_str().unwrap().to_string()).ok();
- next_crawler += 1;
- if next_crawler == num_threads {
- next_crawler = 0;
- }
-
- match nof_handle {
- Some(t) => {
- if t.is_finished() {
- nof = t.join().unwrap();
- nof_handle = None;
- } else {
- nof_handle = Some(t);
- }
- }
- None => {
- // Make sure that we only push a update
- // if there is a visual change to the number
- // because updating the screen takes a lot
- // of time.
- let p = counter * 100 / nof;
- if p != last_p {
- callback(GenState::Fetching, p as u8);
- last_p = p;
- }
- }
+ }).ok();
}
- }
+ });
}
- let join_handle = s.spawn(|| {
- for (i, handle) in crawler_handles.into_iter().enumerate() {
- tx_vec[i].send(String::new()).ok();
- indexes.push(handle.join().unwrap());
- }
- });
-
-
- let mut i = 0;
+ let mut i : usize = 0;
let mut last_p = 0;
- while !join_handle.is_finished() {
- if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() {
+ while i != nof {
+ if let Ok(result) = result_rx.recv() {
+ filecache.push(result);
i += 1;
let p = i * 100 / nof;
if p != last_p {
@@ -148,11 +100,9 @@ impl Index {
}
callback(GenState::Parsing, 100);
-
- join_handle.join().ok();
});
- Index::merge(indexes, |p| { callback(GenState::Merging, p) })
+ Self { filecache }
}
pub fn from_file(path : &String) -> Self {
@@ -168,32 +118,35 @@ impl Index {
let max = indexes.len();
let mut filecache = Vec::new();
-
for (i, index) in indexes.into_iter().enumerate() {
callback((i * 100 / max) as u8);
filecache.extend(index.filecache);
}
callback(100);
+
Self { filecache }
}
pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> {
- let mut v : FileVector = FileVector::new();
- let mut opt : FileVector = FileVector::new();
+ let mut v : HashMap<Indexer, Count> = HashMap::new();
+ let mut opt : HashMap<Indexer, Count> = HashMap::new();
for arg in search_args {
- let mut hasher = DefaultHasher::new();
+ let mut hasher = hash32::FnvHasher::default();
let a = arg.trim_start_matches('+');
a.hash(&mut hasher);
- let value = hasher.finish();
+ let value = hasher.finish32();
if arg.starts_with('+') {
- opt.insert(value, 1);
+ opt.insert(value as Indexer, 1);
} else {
- v.insert(value, 1);
+ v.insert(value as Indexer, 1);
}
}
+ let v = FileVector::from_hashmap(v);
+ let opt = FileVector::from_hashmap(opt);
+
let mut results : Vec<SearchResult> = Vec::new();
for filecache in self.filecache.iter() {
@@ -210,10 +163,8 @@ impl Index {
pub fn save(&self, path: String) {
let index_file = File::create(path).expect("could not open output file");
- let mut file = BufWriter::new(index_file);
-
- file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok();
- file.flush().ok();
+ let file = BufWriter::new(index_file);
+ bincode::serialize_into(file, &self.filecache).ok();
}
pub fn num_files(&self) -> usize {