From 8d6b55ebed2bda4705525004863055e99e72da83 Mon Sep 17 00:00:00 2001 From: Nathan Reiner Date: Sat, 8 Jul 2023 17:29:13 +0200 Subject: optimize merge --- src/index.rs | 141 +++++++++++++++++++++++++---------------------------------- 1 file changed, 60 insertions(+), 81 deletions(-) (limited to 'src') diff --git a/src/index.rs b/src/index.rs index 78df3a1..cfb612a 100644 --- a/src/index.rs +++ b/src/index.rs @@ -107,8 +107,12 @@ impl Index { match nof_handle { Some(t) => { - nof = t.join().unwrap(); - nof_handle = None; + if t.is_finished() { + nof = t.join().unwrap(); + nof_handle = None; + } else { + nof_handle = Some(t); + } } None => { // Make sure that we only push a update @@ -160,105 +164,80 @@ impl Index { } } - fn merge_two(first : Index, second : Index) -> thread::JoinHandle { - thread::spawn(move || { - let (a, b) = if first.filecache.len() < second.filecache.len() { - (second, first) - } else { - (first, second) - }; - let mut filecache = a.filecache.clone(); - let mut dictionary = Dictionary::default(); - - thread::scope(|s| { - let mut a_hash : HashSet = HashSet::new(); - let mut diff : Vec = Vec::new(); + fn merge_two(a : Index, b : Index) -> Self { + let mut filecache = a.filecache.clone(); + let mut dictionary = Dictionary::default(); - let converter_handle = s.spawn(|| { - let mut b_id_to_word : HashMap = HashMap::new(); + thread::scope(|s| { + let mut a_hash : HashSet = HashSet::new(); + let mut diff : Vec = Vec::new(); - for (value, id) in b.dictionary.iter() { - b_id_to_word.insert(id.clone(), value.clone()); - } - b_id_to_word - }); + let converter_handle = s.spawn(|| { + let mut b_id_to_word : HashMap = HashMap::new(); - let dict_handle = s.spawn(|| { - let mut dict = a.dictionary.clone(); - for (word, _) in b.dictionary.iter() { - dict.set(word.clone()); - } - dict - }); + for (value, id) in b.dictionary.iter() { + b_id_to_word.insert(id.clone(), value.clone()); + } + b_id_to_word + }); - for file in a.filecache.iter() { - a_hash.insert(file.clone()); + let dict_handle = s.spawn(|| { + let mut dict = a.dictionary.clone(); + for (word, _) in b.dictionary.iter() { + dict.set(word.clone()); } + dict + }); - for file in b.filecache.iter() { - if !a_hash.contains(file) { - diff.push(file.clone()); - } + for file in a.filecache.iter() { + a_hash.insert(file.clone()); + } + + for file in b.filecache.iter() { + if !a_hash.contains(file) { + diff.push(file.clone()); } + } - let b_id_to_word = converter_handle.join().unwrap(); - dictionary = dict_handle.join().unwrap(); + let b_id_to_word = converter_handle.join().unwrap(); + dictionary = dict_handle.join().unwrap(); - for file in diff { - let mut words = Vec::new(); + for file in diff { + let mut words = Vec::new(); - for (word_id, i) in file.vector.iter() { - for _ in 0..*i { - words.push(b_id_to_word.get(word_id).unwrap().clone()); - } + for (word_id, i) in file.vector.iter() { + for _ in 0..*i { + words.push(b_id_to_word.get(word_id).unwrap().clone()); } - - filecache.push(FileCache { - path : file.path.clone(), - vector: dictionary.vectorize_word_list(words) - }); } - }); - Self { - dictionary, - filecache + filecache.push(FileCache { + path : file.path.clone(), + vector: dictionary.vectorize_word_list(words) + }); } - }) + }); + + Self { + dictionary, + filecache + } } - pub fn merge(indexes : Vec, callback : impl Fn(u8)) -> Self { - let mut idxs : Vec = indexes.clone(); - let max = (idxs.len() as f32).log2().ceil() as u32; - let mut i = 0 as u32; + pub fn merge(mut indexes : Vec, callback : impl Fn(u8)) -> Self { + let max = indexes.len(); + let mut i = 0 as usize; - while idxs.len() > 1 { + indexes.sort_by(|a, b| a.filecache.len().cmp(&b.filecache.len())); + let mut merged_index = indexes.pop().unwrap(); + + for index in indexes { callback((i * 100 / max) as u8); i += 1; - let mut idxs_handle = Vec::new(); - let mut processed = Vec::new(); - - for chunk in idxs.chunks(2) { - if chunk.len() == 2 { - let a = chunk[0].clone(); - let b = chunk[1].clone(); - idxs_handle.push(Index::merge_two(a, b)); - } else { - for idx in chunk.iter() { - processed.push(idx.clone()) - } - } - } - - for idx_handle in idxs_handle { - let idx : Index = idx_handle.join().unwrap(); - processed.push(idx) - } - - idxs = processed; + merged_index = Index::merge_two(merged_index, index); } - - idxs.get(0).unwrap().clone() + callback(100); + merged_index } pub fn search(&self, search_args : Vec) -> Vec { -- cgit v1.2.3-70-g09d2