aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorNathan Reiner <nathan@nathanreiner.xyz>2023-07-06 11:51:21 +0200
committerNathan Reiner <nathan@nathanreiner.xyz>2023-07-06 11:51:21 +0200
commite1770cf3b0fd5eff3e69a8ec28c15018084eae73 (patch)
tree0fc6289cd8b56f654a760d1ee7d748d160bcc251 /src
parent3ca9adc0c5e138271dacab7691dac77da0ba0f21 (diff)
add extractors for docx, pptx, pdf, etc.
Diffstat (limited to 'src')
-rw-r--r--src/extractors/mod.rs1
-rw-r--r--src/extractors/pdf.rs143
-rw-r--r--src/main.rs1
-rw-r--r--src/text/docx.rs9
-rw-r--r--src/text/mod.rs12
-rw-r--r--src/text/odp.rs9
-rw-r--r--src/text/odt.rs9
-rw-r--r--src/text/pdf.rs5
-rw-r--r--src/text/pptx.rs9
-rw-r--r--src/text/xlsx.rs9
10 files changed, 207 insertions, 0 deletions
diff --git a/src/extractors/mod.rs b/src/extractors/mod.rs
new file mode 100644
index 0000000..0f137f1
--- /dev/null
+++ b/src/extractors/mod.rs
@@ -0,0 +1 @@
+pub mod pdf;
diff --git a/src/extractors/pdf.rs b/src/extractors/pdf.rs
new file mode 100644
index 0000000..c08c75c
--- /dev/null
+++ b/src/extractors/pdf.rs
@@ -0,0 +1,143 @@
+// MIT License
+//
+// Copyright (c) 2016 Junfeng Liu
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+use std::collections::BTreeMap;
+use std::fmt::Debug;
+use std::io::{Error, ErrorKind};
+use std::path::Path;
+
+use lopdf::{Document, Object};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use serde::{Deserialize, Serialize};
+use serde_json;
+
+/// Object type names that `filter_func` discards while loading a document.
+///
+/// An object is dropped when its `type_name()` equals one of these entries;
+/// presumably none of them carries text worth extracting (TODO confirm).
+static IGNORE: &[&str] = &[
+    "Length", "BBox", "FormType", "Matrix",
+    "Resources", "Type", "XObject", "Subtype",
+    "Filter", "ColorSpace", "Width", "Height",
+    "BitsPerComponent", "Length1", "Length2", "Length3",
+    "PTEX.FileName", "PTEX.PageNumber", "PTEX.InfoDict", "FontDescriptor",
+    "ExtGState", "Font", "MediaBox", "Annot",
+];
+
+/// Result of extracting one document; `pdf2text` serializes it to JSON.
+#[derive(Debug, Serialize, Deserialize)]
+struct PdfText {
+    /// Extracted lines of each page, keyed by page number.
+    text: BTreeMap<u32, Vec<String>>,
+    /// Messages of the pages whose text extraction failed.
+    errors: Vec<String>,
+}
+
+/// Load-time object filter handed to `Document::load_filtered`.
+///
+/// Returns `None` (drop the object) when its type name is listed in `IGNORE`,
+/// or when it is a dictionary that is empty once the keys below are removed.
+/// Otherwise returns the id together with a copy of the (possibly trimmed)
+/// object.
+fn filter_func(object_id: (u32, u16), object: &mut Object) -> Option<((u32, u16), Object)> {
+    // Dictionary entries stripped from every dictionary that is kept.
+    const STRIPPED_KEYS: [&[u8]; 9] = [
+        b"Font",
+        b"Resources",
+        b"Producer",
+        b"ModDate",
+        b"Creator",
+        b"ProcSet",
+        b"XObject",
+        b"MediaBox",
+        b"Annots",
+    ];
+    // Objects without a type name compare as "" and are never ignored here.
+    let type_name = object.type_name().unwrap_or_default();
+    if IGNORE.contains(&type_name) {
+        return None;
+    }
+    if let Ok(dict) = object.as_dict_mut() {
+        for key in STRIPPED_KEYS {
+            dict.remove(key);
+        }
+        if dict.is_empty() {
+            return None;
+        }
+    }
+    Some((object_id, object.to_owned()))
+}
+
+/// Loads the PDF at `path`, filtering its objects through `filter_func`.
+///
+/// A loader failure is converted into an `std::io::Error` of kind `Other`
+/// carrying the loader's message.
+fn load_pdf<P: AsRef<Path>>(path: P) -> Result<Document, Error> {
+    match Document::load_filtered(path, filter_func) {
+        Ok(document) => Ok(document),
+        Err(cause) => Err(Error::new(ErrorKind::Other, cause.to_string())),
+    }
+}
+
+/// Extracts the text of every page of `doc`, processing pages in parallel.
+///
+/// Each page's text is split on `'\n'` and every line is right-trimmed.
+/// A page that fails to extract does not abort the run: its message is
+/// collected in `PdfText::errors`, so this function always returns `Ok`.
+fn get_pdf_text(doc: &Document) -> Result<PdfText, Error> {
+    let outcomes: Vec<Result<(u32, Vec<String>), Error>> = doc
+        .get_pages()
+        .into_par_iter()
+        .map(|(page_num, page_id): (u32, (u32, u16))| {
+            match doc.extract_text(&[page_num]) {
+                Ok(raw) => {
+                    let lines: Vec<String> = raw
+                        .split('\n')
+                        .map(|line| line.trim_end().to_string())
+                        .collect();
+                    Ok((page_num, lines))
+                }
+                Err(e) => Err(Error::new(
+                    ErrorKind::Other,
+                    format!("Failed to extract text from page {page_num} id={page_id:?}: {e:?}"),
+                )),
+            }
+        })
+        .collect();
+    // Sort the per-page outcomes into successes and failure messages.
+    let mut text = BTreeMap::new();
+    let mut errors = Vec::new();
+    for outcome in outcomes {
+        match outcome {
+            Ok((page_num, lines)) => {
+                text.insert(page_num, lines);
+            }
+            Err(failure) => errors.push(failure.to_string()),
+        }
+    }
+    Ok(PdfText { text, errors })
+}
+
+/// Extracts the text of the PDF at `path` and returns it as a JSON string.
+///
+/// The JSON is the serialized `PdfText`: the lines of every page keyed by
+/// page number, plus the messages of pages that failed to extract. An
+/// encrypted document yields an empty string.
+///
+/// # Errors
+///
+/// Returns an error when the document cannot be loaded or the result cannot
+/// be serialized. Per-page extraction failures do not fail the call; they are
+/// listed in the JSON and the first ten are printed to stderr.
+pub fn pdf2text(path: &str) -> Result<String, Error> {
+    let doc = load_pdf(path)?;
+    if doc.is_encrypted() {
+        return Ok("".to_string());
+    }
+    let text = get_pdf_text(&doc)?;
+    if !text.errors.is_empty() {
+        eprintln!("{path:?} has {} errors:", text.errors.len());
+        // Show at most ten errors. `take` stops early when fewer exist, whereas
+        // slicing with `[..10]` panics for a list shorter than ten.
+        for error in text.errors.iter().take(10) {
+            eprintln!("{error:?}");
+        }
+    }
+    serde_json::to_string(&text).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))
+}
diff --git a/src/main.rs b/src/main.rs
index 3cae322..2e49fd4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,6 +5,7 @@ pub mod splitter;
pub mod filecache;
pub mod searchresult;
pub mod filecounter;
+pub mod extractors;
use vector::FileVector;
use dictionary::Dictionary;
diff --git a/src/text/docx.rs b/src/text/docx.rs
new file mode 100644
index 0000000..7b4a80e
--- /dev/null
+++ b/src/text/docx.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Docx, MsDoc};
+
+/// Returns the text content of the `.docx` file at `path`.
+///
+/// Extraction is best-effort: a file that cannot be opened yields an empty
+/// string instead of a panic, and a failed read returns whatever
+/// `read_to_string` left in the buffer.
+pub fn get_text(path: &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Docx::open(path) {
+        // Read errors are ignored on purpose; see the best-effort note above.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
index 4e1b9a4..dd969af 100644
--- a/src/text/mod.rs
+++ b/src/text/mod.rs
@@ -4,6 +4,12 @@ use std::path::Path;
use std::collections::HashMap;
mod txt;
+mod docx;
+mod xlsx;
+mod pptx;
+mod odt;
+mod odp;
+mod pdf;
fn empty_extractor(_ : &str) -> String {
"".to_string()
@@ -19,6 +25,12 @@ lazy_static! {
static ref EXT: HashMap<&'static str, ExtFn> = {
HashMap::from([
ext!(txt),
+ ext!(docx),
+ ext!(xlsx),
+ ext!(pptx),
+ ext!(odt),
+ ext!(odp),
+ ext!(pdf),
])
};
}
diff --git a/src/text/odp.rs b/src/text/odp.rs
new file mode 100644
index 0000000..eaed196
--- /dev/null
+++ b/src/text/odp.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Odp, doc::OpenOfficeDoc};
+
+/// Returns the text content of the `.odp` file at `path`.
+///
+/// Extraction is best-effort: a file that cannot be opened yields an empty
+/// string instead of a panic, and a failed read returns whatever
+/// `read_to_string` left in the buffer.
+pub fn get_text(path: &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Odp::open(path) {
+        // Read errors are ignored on purpose; see the best-effort note above.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/odt.rs b/src/text/odt.rs
new file mode 100644
index 0000000..10b5342
--- /dev/null
+++ b/src/text/odt.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Odt, doc::OpenOfficeDoc};
+
+/// Returns the text content of the `.odt` file at `path`.
+///
+/// Extraction is best-effort: a file that cannot be opened yields an empty
+/// string instead of a panic, and a failed read returns whatever
+/// `read_to_string` left in the buffer.
+pub fn get_text(path: &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Odt::open(path) {
+        // Read errors are ignored on purpose; see the best-effort note above.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/pdf.rs b/src/text/pdf.rs
new file mode 100644
index 0000000..efa441f
--- /dev/null
+++ b/src/text/pdf.rs
@@ -0,0 +1,5 @@
+use crate::extractors::pdf;
+
+/// Returns the JSON text produced by `pdf::pdf2text` for the PDF at `path`,
+/// or an empty string when extraction fails.
+pub fn get_text(path : &str) -> String {
+    match pdf::pdf2text(path) {
+        Ok(text) => text,
+        Err(_) => String::new(),
+    }
+}
diff --git a/src/text/pptx.rs b/src/text/pptx.rs
new file mode 100644
index 0000000..7dac77e
--- /dev/null
+++ b/src/text/pptx.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Pptx, MsDoc};
+
+/// Returns the text content of the `.pptx` file at `path`.
+///
+/// Extraction is best-effort: a file that cannot be opened yields an empty
+/// string instead of a panic, and a failed read returns whatever
+/// `read_to_string` left in the buffer.
+pub fn get_text(path: &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Pptx::open(path) {
+        // Read errors are ignored on purpose; see the best-effort note above.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/xlsx.rs b/src/text/xlsx.rs
new file mode 100644
index 0000000..a438e96
--- /dev/null
+++ b/src/text/xlsx.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Xlsx, MsDoc};
+
+/// Returns the text content of the `.xlsx` file at `path`.
+///
+/// Extraction is best-effort: a file that cannot be opened yields an empty
+/// string instead of a panic, and a failed read returns whatever
+/// `read_to_string` left in the buffer.
+pub fn get_text(path: &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Xlsx::open(path) {
+        // Read errors are ignored on purpose; see the best-effort note above.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}