diff options
| author | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-06 11:51:21 +0200 |
|---|---|---|
| committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-06 11:51:21 +0200 |
| commit | e1770cf3b0fd5eff3e69a8ec28c15018084eae73 (patch) | |
| tree | 0fc6289cd8b56f654a760d1ee7d748d160bcc251 /src/extractors | |
| parent | 3ca9adc0c5e138271dacab7691dac77da0ba0f21 (diff) | |
add extractors for docx, pptx, pdf, etc.
Diffstat (limited to 'src/extractors')
| -rw-r--r-- | src/extractors/mod.rs | 1 | ||||
| -rw-r--r-- | src/extractors/pdf.rs | 143 |
2 files changed, 144 insertions, 0 deletions
diff --git a/src/extractors/mod.rs b/src/extractors/mod.rs new file mode 100644 index 0000000..0f137f1 --- /dev/null +++ b/src/extractors/mod.rs @@ -0,0 +1 @@ +pub mod pdf; diff --git a/src/extractors/pdf.rs b/src/extractors/pdf.rs new file mode 100644 index 0000000..c08c75c --- /dev/null +++ b/src/extractors/pdf.rs @@ -0,0 +1,143 @@ +// MIT License +// +// Copyright (c) 2016 Junfeng Liu +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +use std::collections::BTreeMap; +use std::fmt::Debug; +use std::io::{Error, ErrorKind}; +use std::path::Path; + +use lopdf::{Document, Object}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use serde::{Deserialize, Serialize}; +use serde_json; + +static IGNORE: &[&str] = &[ + "Length", + "BBox", + "FormType", + "Matrix", + "Resources", + "Type", + "XObject", + "Subtype", + "Filter", + "ColorSpace", + "Width", + "Height", + "BitsPerComponent", + "Length1", + "Length2", + "Length3", + "PTEX.FileName", + "PTEX.PageNumber", + "PTEX.InfoDict", + "FontDescriptor", + "ExtGState", + "Font", + "MediaBox", + "Annot", +]; + +#[derive(Debug, Deserialize, Serialize)] +struct PdfText { + text: BTreeMap<u32, Vec<String>>, // Key is page number + errors: Vec<String>, +} + +fn filter_func(object_id: (u32, u16), object: &mut Object) -> Option<((u32, u16), Object)> { + if IGNORE.contains(&object.type_name().unwrap_or_default()) { + return None; + } + if let Ok(d) = object.as_dict_mut() { + d.remove(b"Font"); + d.remove(b"Resources"); + d.remove(b"Producer"); + d.remove(b"ModDate"); + d.remove(b"Creator"); + d.remove(b"ProcSet"); + d.remove(b"XObject"); + d.remove(b"MediaBox"); + d.remove(b"Annots"); + if d.is_empty() { + return None; + } + } + Some((object_id, object.to_owned())) +} + +fn load_pdf<P: AsRef<Path>>(path: P) -> Result<Document, Error> { + Document::load_filtered(path, filter_func).map_err(|e| Error::new(ErrorKind::Other, e.to_string())) +} + +fn get_pdf_text(doc: &Document) -> Result<PdfText, Error> { + let mut pdf_text: PdfText = PdfText { + text: BTreeMap::new(), + errors: Vec::new(), + }; + let pages: Vec<Result<(u32, Vec<String>), Error>> = doc + .get_pages() + .into_par_iter() + .map( + |(page_num, page_id): (u32, (u32, u16))| -> Result<(u32, Vec<String>), Error> { + let text = doc.extract_text(&[page_num]).map_err(|e| { + Error::new( + ErrorKind::Other, + format!("Failed to extract text from page {page_num} id={page_id:?}: {e:?}"), + ) + })?; + Ok(( + page_num, + text.split('\n') + .map(|s| s.trim_end().to_string()) + .collect::<Vec<String>>(), + )) + }, + ) + .collect(); + for page in pages { + match page { + Ok((page_num, lines)) => { + pdf_text.text.insert(page_num, lines); + } + Err(e) => { + pdf_text.errors.push(e.to_string()); + } + } + } + Ok(pdf_text) +} + +pub fn pdf2text(path: &str) -> Result<String, Error> { + let doc = load_pdf(&path)?; + if doc.is_encrypted() { + return Ok("".to_string()); + } + let text = get_pdf_text(&doc)?; + if !text.errors.is_empty() { + eprintln!("{path:?} has {} errors:", text.errors.len()); + for error in &text.errors[..10] { + eprintln!("{error:?}"); + } + } + let data = serde_json::to_string(&text).unwrap(); + Ok(data) +} |