//
// MIT License
//
// Copyright (c) 2016 Junfeng Liu
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// // // For more information see `https://github.com/J-F-Liu/lopdf/blob/master/examples/extract_text.rs` // use std::collections::BTreeMap; use std::fmt::Debug; use std::io::{Error, ErrorKind}; use std::path::Path; use lopdf::{Document, Object}; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use serde::{Deserialize, Serialize}; use serde_json; static IGNORE: &[&str] = &[ "Length", "BBox", "FormType", "Matrix", "Resources", "Type", "XObject", "Subtype", "Filter", "ColorSpace", "Width", "Height", "BitsPerComponent", "Length1", "Length2", "Length3", "PTEX.FileName", "PTEX.PageNumber", "PTEX.InfoDict", "FontDescriptor", "ExtGState", "Font", "MediaBox", "Annot", ]; #[derive(Debug, Deserialize, Serialize)] struct PdfText { text: BTreeMap>, // Key is page number errors: Vec, } fn filter_func(object_id: (u32, u16), object: &mut Object) -> Option<((u32, u16), Object)> { if IGNORE.contains(&object.type_name().unwrap_or_default()) { return None; } if let Ok(d) = object.as_dict_mut() { d.remove(b"Font"); d.remove(b"Resources"); d.remove(b"Producer"); d.remove(b"ModDate"); d.remove(b"Creator"); d.remove(b"ProcSet"); d.remove(b"XObject"); d.remove(b"MediaBox"); d.remove(b"Annots"); if d.is_empty() { return None; } } Some((object_id, object.to_owned())) } fn load_pdf>(path: P) -> Result { Document::load_filtered(path, filter_func).map_err(|e| Error::new(ErrorKind::Other, e.to_string())) } fn get_pdf_text(doc: &Document) -> Result { let mut pdf_text: PdfText = PdfText { text: BTreeMap::new(), errors: Vec::new(), }; let pages: Vec), Error>> = doc .get_pages() .into_par_iter() .map( |(page_num, page_id): (u32, (u32, u16))| -> Result<(u32, Vec), Error> { let text = doc.extract_text(&[page_num]).map_err(|e| { Error::new( ErrorKind::Other, format!("Failed to extract text from page {page_num} id={page_id:?}: {e:?}"), ) })?; Ok(( page_num, text.split('\n') .map(|s| s.trim_end().to_string()) .collect::>(), )) }, ) .collect(); for page in pages { match page { Ok((page_num, 
lines)) => { pdf_text.text.insert(page_num, lines); } Err(e) => { pdf_text.errors.push(e.to_string()); } } } Ok(pdf_text) } pub fn pdf2text(path: &str) -> Result { let doc = load_pdf(path)?; if doc.is_encrypted() { return Ok("".to_string()); } let text = get_pdf_text(&doc)?; if !text.errors.is_empty() { eprintln!("{path:?} has {} errors:", text.errors.len()); for error in &text.errors[..10] { eprintln!("{error:?}"); } } let data = serde_json::to_string(&text).unwrap(); Ok(data) }