about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--	src/index.rs	153
-rw-r--r--	src/main.rs	3
-rw-r--r--	src/searchresult.rs	2
-rw-r--r--	src/vector.rs	67
4 files changed, 105 insertions, 120 deletions
diff --git a/src/index.rs b/src/index.rs
index 8fb34fe..cc86f0c 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -1,16 +1,13 @@
-use std::collections::hash_map::DefaultHasher;
-use std::hash::{Hash, Hasher};
+use std::collections::HashMap;
+use std::hash::Hash;
use std::fs::File;
-use std::io::{BufWriter, Write};
-use std::sync::mpsc::{channel, Sender};
-use std::time::Duration;
-use walkdir::*;
+use std::io::BufWriter;
use std::thread;
-use std::option::Option::None;
-use crate::vector::FileVector;
+use walkdir::*;
+use hash32::Hasher;
+use crate::vector::{FileVector, Indexer, Count};
use crate::filecache::FileCache;
use crate::searchresult::SearchResult;
-use crate::filecounter::filecount;
use crate::text;
use crate::splitter;
use crate::vector;
@@ -44,100 +41,55 @@ impl Index {
}
pub fn generate(input_path : &str, callback : impl Fn(GenState, u8)) -> Self {
- let mut nof = 1;
- let mut counter = 0;
- let mut crawler_handles = Vec::new();
+ let mut nof : usize = 0;
let num_threads = thread::available_parallelism().unwrap().get();
- let mut tx_vec : Vec<Sender<String>> = Vec::new();
- let mut indexes = Vec::new();
-
- thread::scope(|s| {
- let mut nof_handle : Option<_> = Some(s.spawn(|| filecount(input_path)));
- let (status_tx, status_rx) = channel();
+ let mut paths = Vec::new();
+ let (result_tx, result_rx) = std::sync::mpsc::channel();
- for _ in 0..num_threads {
- let (tx, rx) = channel();
- tx_vec.push(tx);
- let status_tx = status_tx.clone();
- crawler_handles.push(thread::spawn(move || {
- let mut filecache : Vec<FileCache> = Vec::new();
-
- loop {
- let path = rx.recv().unwrap();
- if path.is_empty() {
- return Self {
- filecache
- }
- }
+ callback(GenState::Fetching, 0);
+ for entry in WalkDir::new(input_path)
+ .into_iter()
+ .filter_map(|e| e.ok()) {
+ if entry.path().is_file() {
+ nof += 1;
+ paths.push(entry.path().to_str().unwrap().to_string());
+ }
+ }
+ callback(GenState::Fetching, 100);
- let content : String = text::extract_text(path.as_str());
+ let chunks = paths.chunks(paths.len() / num_threads);
+ let mut filecache = Vec::with_capacity(nof);
- let _ = status_tx.send(());
+ thread::scope(|s| {
+ for chunk in chunks {
+ let result_tx = result_tx.clone();
+ s.spawn(move || {
+ for path in chunk {
+ let content : String = text::extract_text(path);
if content.is_empty() {
+ result_tx.send(FileCache {
+ path: "".to_string(),
+ vector : FileVector::default()
+ }).ok();
continue;
}
let words : Vec<String> = splitter::split_to_words(content);
let fv = FileVector::from_words(words);
- filecache.push(FileCache {
- path,
+ result_tx.send(FileCache {
+ path: "".to_string(),
vector : fv
- });
- }
- }));
- }
-
- let mut next_crawler = 0;
- let mut last_p = 0;
-
- for entry in WalkDir::new(input_path)
- .into_iter()
- .filter_map(|e| e.ok()) {
- counter += 1;
- if entry.path().is_file() {
- tx_vec[next_crawler].send(entry.path().to_str().unwrap().to_string()).ok();
- next_crawler += 1;
- if next_crawler == num_threads {
- next_crawler = 0;
- }
-
- match nof_handle {
- Some(t) => {
- if t.is_finished() {
- nof = t.join().unwrap();
- nof_handle = None;
- } else {
- nof_handle = Some(t);
- }
- }
- None => {
- // Make sure that we only push a update
- // if there is a visual change to the number
- // because updating the screen takes a lot
- // of time.
- let p = counter * 100 / nof;
- if p != last_p {
- callback(GenState::Fetching, p as u8);
- last_p = p;
- }
- }
+ }).ok();
}
- }
+ });
}
- let join_handle = s.spawn(|| {
- for (i, handle) in crawler_handles.into_iter().enumerate() {
- tx_vec[i].send(String::new()).ok();
- indexes.push(handle.join().unwrap());
- }
- });
-
-
- let mut i = 0;
+ let mut i : usize = 0;
let mut last_p = 0;
- while !join_handle.is_finished() {
- if status_rx.recv_timeout(Duration::from_millis(20)).is_ok() {
+ while i != nof {
+ if let Ok(result) = result_rx.recv() {
+ filecache.push(result);
i += 1;
let p = i * 100 / nof;
if p != last_p {
@@ -148,11 +100,9 @@ impl Index {
}
callback(GenState::Parsing, 100);
-
- join_handle.join().ok();
});
- Index::merge(indexes, |p| { callback(GenState::Merging, p) })
+ Self { filecache }
}
pub fn from_file(path : &String) -> Self {
@@ -168,32 +118,35 @@ impl Index {
let max = indexes.len();
let mut filecache = Vec::new();
-
for (i, index) in indexes.into_iter().enumerate() {
callback((i * 100 / max) as u8);
filecache.extend(index.filecache);
}
callback(100);
+
Self { filecache }
}
pub fn search(&self, search_args : Vec<String>) -> Vec<SearchResult> {
- let mut v : FileVector = FileVector::new();
- let mut opt : FileVector = FileVector::new();
+ let mut v : HashMap<Indexer, Count> = HashMap::new();
+ let mut opt : HashMap<Indexer, Count> = HashMap::new();
for arg in search_args {
- let mut hasher = DefaultHasher::new();
+ let mut hasher = hash32::FnvHasher::default();
let a = arg.trim_start_matches('+');
a.hash(&mut hasher);
- let value = hasher.finish();
+ let value = hasher.finish32();
if arg.starts_with('+') {
- opt.insert(value, 1);
+ opt.insert(value as Indexer, 1);
} else {
- v.insert(value, 1);
+ v.insert(value as Indexer, 1);
}
}
+ let v = FileVector::from_hashmap(v);
+ let opt = FileVector::from_hashmap(opt);
+
let mut results : Vec<SearchResult> = Vec::new();
for filecache in self.filecache.iter() {
@@ -210,10 +163,8 @@ impl Index {
pub fn save(&self, path: String) {
let index_file = File::create(path).expect("could not open output file");
- let mut file = BufWriter::new(index_file);
-
- file.write_all(&bincode::serialize(&self.filecache).unwrap()).ok();
- file.flush().ok();
+ let file = BufWriter::new(index_file);
+ bincode::serialize_into(file, &self.filecache).ok();
}
pub fn num_files(&self) -> usize {
diff --git a/src/main.rs b/src/main.rs
index 3f18de9..1a8a7c9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,4 @@
+#![cfg_attr(not(debug_assertions), windows_subsystem = "windows")] // hide console window on Windows in release
pub mod vector;
pub mod text;
pub mod splitter;
@@ -68,6 +69,6 @@ fn main() {
).save(merged);
}
} else {
- let _ = gui::run();
+ gui::run();
}
}
diff --git a/src/searchresult.rs b/src/searchresult.rs
index 23e8d02..0630791 100644
--- a/src/searchresult.rs
+++ b/src/searchresult.rs
@@ -2,6 +2,6 @@
/// function.
#[derive(Debug, Clone)]
pub struct SearchResult {
- pub priority : u64,
+ pub priority : u32,
pub path : String
}
diff --git a/src/vector.rs b/src/vector.rs
index fa0a139..c85c50a 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -1,8 +1,11 @@
use std::collections::HashMap;
-use std::collections::hash_map::DefaultHasher;
-use std::hash::{Hash, Hasher};
+use std::hash::Hash;
use std::ops::{Deref, DerefMut};
use serde::{Deserialize, Serialize};
+use hash32::Hasher;
+
+pub type Count = u8;
+pub type Indexer = u32;
/// Represents the content of a cached file.
/// It is stored as a HashMap, because we do not
@@ -10,11 +13,18 @@ use serde::{Deserialize, Serialize};
/// of storage.
#[derive(Default, Clone, Debug, Deserialize, Serialize)]
pub struct FileVector {
- data : HashMap<u64, u64>
+ data : Vec<FileVectorEntry>
+}
+
+#[repr(packed)]
+#[derive(Default, Clone, Debug, Deserialize, Serialize)]
+pub struct FileVectorEntry {
+ index: u32,
+ count: Count,
}
impl Deref for FileVector {
- type Target = HashMap<u64, u64>;
+ type Target = Vec<FileVectorEntry>;
fn deref(&self) -> &Self::Target {
&self.data
}
@@ -28,19 +38,32 @@ impl DerefMut for FileVector {
impl FileVector {
pub fn new() -> Self {
- Self { data : HashMap::new() }
+ Self { data : Vec::new() }
+ }
+
+ pub fn to_hashmap(&self) -> HashMap<Indexer, Count> {
+ let mut map = HashMap::new();
+
+ for e in self.data.iter() {
+ map.insert(e.index, e.count);
+ }
+
+ map
}
pub fn from_words(words: Vec<String>) -> Self {
- let mut data = HashMap::new();
+ let mut data : HashMap<Indexer, Count> = HashMap::new();
for word in words {
- let mut hasher = DefaultHasher::new();
+ let mut hasher = hash32::FnvHasher::default();
word.hash(&mut hasher);
- let k = hasher.finish();
- match data.entry(k) {
+ let k = hasher.finish32();
+ match data.entry(k as Indexer) {
std::collections::hash_map::Entry::Occupied(mut e) => {
- e.insert(e.get() + 1);
+ let i = *e.get();
+ if i == Count::MAX {
+ e.insert((i + 1) as Count);
+ }
}
std::collections::hash_map::Entry::Vacant(e) => {
e.insert(1);
@@ -48,26 +71,36 @@ impl FileVector {
}
}
- Self { data }
+ FileVector::from_hashmap(data)
+ }
+
+ pub fn from_hashmap(map : HashMap<Indexer, Count>) -> Self {
+ Self { data : Vec::from_iter(map.iter().map(|e| {
+ FileVectorEntry { index: *e.0, count: *e.1 }
+ }))}
}
}
-pub fn scalar_product(a : &FileVector, b : &FileVector) -> u64 {
- let mut c = 0;
+pub fn scalar_product(a : &FileVector, b : &FileVector) -> u32 {
+ let a = a.to_hashmap();
+ let b = b.to_hashmap();
+ let mut c : u32 = 0;
for (i, x) in a.iter() {
- c += x * (b.get(i).unwrap_or(&0));
+ c += (x * (b.get(i).unwrap_or(&0))) as u32;
}
c
}
-pub fn match_vector(query : &FileVector, v : &FileVector) -> u64 {
- let mut c = 0;
+pub fn match_vector(query : &FileVector, v : &FileVector) -> u32 {
+ let query = query.to_hashmap();
+ let v = v.to_hashmap();
+ let mut c : u32 = 0;
for (i, x) in query.iter() {
let s = x * (v.get(i).unwrap_or(&0));
if s == 0 {
return 0
} else {
- c += s;
+ c += s as u32;
}
}
c