add extractors for docx, pptx, pdf, etc.

author: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-06 11:51:21 +0200
committer: Nathan Reiner <nathan@nathanreiner.xyz> 2023-07-06 11:51:21 +0200
commit: e1770cf3b0fd5eff3e69a8ec28c15018084eae73 (patch)
tree: 0fc6289cd8b56f654a760d1ee7d748d160bcc251 /src/extractors
parent: 3ca9adc0c5e138271dacab7691dac77da0ba0f21 (diff)
2 files changed, 144 insertions, 0 deletions
diff --git a/src/extractors/mod.rs b/src/extractors/mod.rs
new file mode 100644
index 0000000..0f137f1
--- /dev/null
+++ b/src/extractors/mod.rs
@@ -0,0 +1 @@
+pub mod pdf;
diff --git a/src/extractors/pdf.rs b/src/extractors/pdf.rs
new file mode 100644
index 0000000..c08c75c
--- /dev/null
+++ b/src/extractors/pdf.rs
@@ -0,0 +1,143 @@
+// MIT License
+// 
+// Copyright (c) 2016 Junfeng Liu
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+use std::collections::BTreeMap;
+use std::fmt::Debug;
+use std::io::{Error, ErrorKind};
+use std::path::Path;
+
+use lopdf::{Document, Object};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use serde::{Deserialize, Serialize};
+use serde_json;
+
+/// Object type names rejected by `filter_func`: any object whose
+/// `type_name()` appears in this list is dropped while the document is loaded.
+static IGNORE: &[&str] = &[
+    "Length",
+    "BBox",
+    "FormType",
+    "Matrix",
+    "Resources",
+    "Type",
+    "XObject",
+    "Subtype",
+    "Filter",
+    "ColorSpace",
+    "Width",
+    "Height",
+    "BitsPerComponent",
+    "Length1",
+    "Length2",
+    "Length3",
+    "PTEX.FileName",
+    "PTEX.PageNumber",
+    "PTEX.InfoDict",
+    "FontDescriptor",
+    "ExtGState",
+    "Font",
+    "MediaBox",
+    "Annot",
+];
+
+/// Text extracted from a PDF together with the errors hit along the way.
+///
+/// `pdf2text` serializes this struct to JSON.
+#[derive(Debug, Deserialize, Serialize)]
+struct PdfText {
+    /// Lines of text for each page (split on `'\n'`, trailing whitespace trimmed).
+    text: BTreeMap<u32, Vec<String>>, // Key is page number
+    /// One message per page whose text extraction failed.
+    errors: Vec<String>,
+}
+
+/// Load-time object filter handed to `Document::load_filtered`.
+///
+/// Returns `None` for objects whose type name is listed in `IGNORE`, and for
+/// dictionaries that are empty once the unwanted keys have been stripped.
+/// Every other object is returned as a copy paired with its unchanged id.
+fn filter_func(object_id: (u32, u16), object: &mut Object) -> Option<((u32, u16), Object)> {
+    // Dictionary entries stripped (in this order) from every dictionary object.
+    let stripped_keys: [&[u8]; 9] = [
+        b"Font",
+        b"Resources",
+        b"Producer",
+        b"ModDate",
+        b"Creator",
+        b"ProcSet",
+        b"XObject",
+        b"MediaBox",
+        b"Annots",
+    ];
+    let type_name = object.type_name().unwrap_or_default();
+    if IGNORE.contains(&type_name) {
+        return None;
+    }
+    if let Ok(dict) = object.as_dict_mut() {
+        for key in stripped_keys.iter().copied() {
+            dict.remove(key);
+        }
+        // Nothing left worth keeping once the noise is gone.
+        if dict.is_empty() {
+            return None;
+        }
+    }
+    Some((object_id, object.clone()))
+}
+
+/// Loads the PDF at `path`, running every object through `filter_func`.
+///
+/// A loader failure is converted into an `ErrorKind::Other` I/O error that
+/// carries the loader's message.
+fn load_pdf<P: AsRef<Path>>(path: P) -> Result<Document, Error> {
+    match Document::load_filtered(path, filter_func) {
+        Ok(document) => Ok(document),
+        Err(cause) => Err(Error::new(ErrorKind::Other, cause.to_string())),
+    }
+}
+
+/// Extracts the text of every page of `doc`, processing pages in parallel.
+///
+/// Pages that extract successfully are stored under their page number as a
+/// list of lines; pages that fail contribute a message to `errors` instead.
+/// A failing page never aborts the run, so this currently always returns `Ok`.
+fn get_pdf_text(doc: &Document) -> Result<PdfText, Error> {
+    // Per-page worker: extract the raw text, then split it into right-trimmed lines.
+    let extract_page = |(page_num, page_id): (u32, (u32, u16))| -> Result<(u32, Vec<String>), Error> {
+        match doc.extract_text(&[page_num]) {
+            Ok(raw) => {
+                let lines = raw
+                    .split('\n')
+                    .map(|line| line.trim_end().to_string())
+                    .collect::<Vec<String>>();
+                Ok((page_num, lines))
+            }
+            Err(e) => Err(Error::new(
+                ErrorKind::Other,
+                format!("Failed to extract text from page {page_num} id={page_id:?}: {e:?}"),
+            )),
+        }
+    };
+    let outcomes: Vec<Result<(u32, Vec<String>), Error>> =
+        doc.get_pages().into_par_iter().map(extract_page).collect();
+    // Sort the outcomes into extracted pages and failure messages.
+    let mut text = BTreeMap::new();
+    let mut errors = Vec::new();
+    for outcome in outcomes {
+        match outcome {
+            Ok((page_num, lines)) => {
+                text.insert(page_num, lines);
+            }
+            Err(failure) => errors.push(failure.to_string()),
+        }
+    }
+    Ok(PdfText { text, errors })
+}
+
+/// Extracts the text of the PDF at `path` and returns it as a JSON string.
+///
+/// The JSON is the serialized `PdfText`: a `text` map from page number to that
+/// page's lines, and an `errors` list with one message per page that failed.
+/// An encrypted document yields an empty string rather than an error.
+///
+/// # Errors
+///
+/// Returns an error if the document cannot be loaded or the result cannot be
+/// serialized. Per-page extraction failures are not fatal: they are reported
+/// on stderr (at most the first 10) and included in the `errors` list.
+pub fn pdf2text(path: &str) -> Result<String, Error> {
+    let doc = load_pdf(path)?;
+    if doc.is_encrypted() {
+        return Ok(String::new());
+    }
+    let text = get_pdf_text(&doc)?;
+    if !text.errors.is_empty() {
+        eprintln!("{path:?} has {} errors:", text.errors.len());
+        // `take(10)` caps the output without panicking when fewer than 10
+        // errors were collected, which slicing with `[..10]` would do.
+        for error in text.errors.iter().take(10) {
+            eprintln!("{error:?}");
+        }
+    }
+    // Hand serialization failures to the caller instead of panicking.
+    serde_json::to_string(&text).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))
+}
author	Nathan Reiner <nathan@nathanreiner.xyz>	2023-07-06 11:51:21 +0200
committer	Nathan Reiner <nathan@nathanreiner.xyz>	2023-07-06 11:51:21 +0200
commit	e1770cf3b0fd5eff3e69a8ec28c15018084eae73 (patch)
tree	0fc6289cd8b56f654a760d1ee7d748d160bcc251 /src/extractors
parent	3ca9adc0c5e138271dacab7691dac77da0ba0f21 (diff)