aboutsummaryrefslogtreecommitdiff
path: root/src/extractors/pdf.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/extractors/pdf.rs')
-rw-r--r-- src/extractors/pdf.rs 143
1 files changed, 143 insertions, 0 deletions
diff --git a/src/extractors/pdf.rs b/src/extractors/pdf.rs
new file mode 100644
index 0000000..c08c75c
--- /dev/null
+++ b/src/extractors/pdf.rs
@@ -0,0 +1,143 @@
+// MIT License
+//
+// Copyright (c) 2016 Junfeng Liu
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+use std::collections::BTreeMap;
+use std::fmt::Debug;
+use std::io::{Error, ErrorKind};
+use std::path::Path;
+
+use lopdf::{Document, Object};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use serde::{Deserialize, Serialize};
+use serde_json;
+
+/// Type names that `filter_func` rejects.
+///
+/// `filter_func` compares each object's `type_name()` against this list and
+/// returns `None` for any object whose type name appears here.
+static IGNORE: &[&str] = &[
+ "Length",
+ "BBox",
+ "FormType",
+ "Matrix",
+ "Resources",
+ "Type",
+ "XObject",
+ "Subtype",
+ "Filter",
+ "ColorSpace",
+ "Width",
+ "Height",
+ "BitsPerComponent",
+ "Length1",
+ "Length2",
+ "Length3",
+ "PTEX.FileName",
+ "PTEX.PageNumber",
+ "PTEX.InfoDict",
+ "FontDescriptor",
+ "ExtGState",
+ "Font",
+ "MediaBox",
+ "Annot",
+];
+
+/// Result of extracting text from a PDF document, page by page.
+///
+/// `get_pdf_text` fills this in and `pdf2text` serializes it to a JSON string.
+#[derive(Debug, Deserialize, Serialize)]
+struct PdfText {
+ /// Lines of text extracted from each page, keyed by page number.
+ text: BTreeMap<u32, Vec<String>>, // Key is page number
+ /// One message per page whose text extraction failed.
+ errors: Vec<String>,
+}
+
+/// Object filter handed to `Document::load_filtered` by `load_pdf`.
+///
+/// Returns `None` for an object whose type name is listed in `IGNORE`, and for
+/// a dictionary that has no entries left once the keys listed below have been
+/// removed from it. Every other object is returned as a clone paired with its
+/// `object_id`; dictionaries are returned with those keys already removed.
+fn filter_func(object_id: (u32, u16), object: &mut Object) -> Option<((u32, u16), Object)> {
+    // Keys removed from every dictionary object before deciding whether to keep it.
+    let stripped_keys: [&[u8]; 9] = [
+        b"Font",
+        b"Resources",
+        b"Producer",
+        b"ModDate",
+        b"Creator",
+        b"ProcSet",
+        b"XObject",
+        b"MediaBox",
+        b"Annots",
+    ];
+
+    // An object without a usable type name falls back to the default (empty) name.
+    let type_name = object.type_name().unwrap_or_default();
+    if IGNORE.contains(&type_name) {
+        return None;
+    }
+
+    // Non-dictionary objects are never considered emptied.
+    let emptied = match object.as_dict_mut() {
+        Ok(dict) => {
+            for &key in stripped_keys.iter() {
+                dict.remove(key);
+            }
+            dict.is_empty()
+        }
+        Err(_) => false,
+    };
+
+    if emptied {
+        None
+    } else {
+        Some((object_id, object.clone()))
+    }
+}
+
+/// Loads the PDF at `path`, passing every object through `filter_func`.
+///
+/// # Errors
+///
+/// A loading failure is returned as an `ErrorKind::Other` I/O error whose
+/// message is the string form of the underlying error.
+fn load_pdf<P: AsRef<Path>>(path: P) -> Result<Document, Error> {
+    match Document::load_filtered(path, filter_func) {
+        Ok(document) => Ok(document),
+        Err(cause) => Err(Error::new(ErrorKind::Other, cause.to_string())),
+    }
+}
+
+/// Extracts the text of every page of `doc`, processing pages in parallel.
+///
+/// Each page's text is split on `'\n'` and every line has its trailing
+/// whitespace trimmed. A page that fails to extract does not abort the run:
+/// its error message is recorded in `PdfText::errors` and the remaining pages
+/// are still processed, so this function itself always returns `Ok`.
+fn get_pdf_text(doc: &Document) -> Result<PdfText, Error> {
+    // Per-page worker: yields the page number with its lines, or a descriptive error.
+    let extract_page =
+        |(page_num, page_id): (u32, (u32, u16))| -> Result<(u32, Vec<String>), Error> {
+            match doc.extract_text(&[page_num]) {
+                Ok(raw) => {
+                    let lines: Vec<String> = raw
+                        .split('\n')
+                        .map(|line| line.trim_end().to_string())
+                        .collect();
+                    Ok((page_num, lines))
+                }
+                Err(e) => Err(Error::new(
+                    ErrorKind::Other,
+                    format!("Failed to extract text from page {page_num} id={page_id:?}: {e:?}"),
+                )),
+            }
+        };
+
+    let outcomes: Vec<Result<(u32, Vec<String>), Error>> = doc
+        .get_pages()
+        .into_par_iter()
+        .map(extract_page)
+        .collect();
+
+    // Sort the per-page outcomes into extracted text and error messages.
+    let mut text = BTreeMap::new();
+    let mut errors = Vec::new();
+    for outcome in outcomes {
+        match outcome {
+            Ok((page_num, lines)) => {
+                text.insert(page_num, lines);
+            }
+            Err(failure) => errors.push(failure.to_string()),
+        }
+    }
+
+    Ok(PdfText { text, errors })
+}
+
+/// Extracts the text of the PDF at `path` and returns it as a JSON string.
+///
+/// The JSON is the serialized `PdfText`: a `text` map from page number to that
+/// page's lines, plus an `errors` list with one message per page that could not
+/// be extracted. An encrypted document yields an empty string instead of JSON.
+/// When pages failed, the error count and at most the first ten messages are
+/// also written to stderr.
+///
+/// # Errors
+///
+/// Returns an error if the document cannot be loaded or if the extracted text
+/// cannot be serialized to JSON.
+pub fn pdf2text(path: &str) -> Result<String, Error> {
+    // Upper bound on how many per-page errors are echoed to stderr.
+    const MAX_REPORTED_ERRORS: usize = 10;
+
+    let doc = load_pdf(path)?;
+    if doc.is_encrypted() {
+        // Encrypted documents are not processed; callers receive an empty string.
+        return Ok(String::new());
+    }
+    let text = get_pdf_text(&doc)?;
+    if !text.errors.is_empty() {
+        eprintln!("{path:?} has {} errors:", text.errors.len());
+        // `take` rather than slicing `[..10]`, which panics when fewer than ten
+        // errors were collected.
+        for error in text.errors.iter().take(MAX_REPORTED_ERRORS) {
+            eprintln!("{error:?}");
+        }
+    }
+    // Report a serialization failure to the caller instead of panicking.
+    serde_json::to_string(&text).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))
+}