diff options
| author | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-06 11:51:21 +0200 |
|---|---|---|
| committer | Nathan Reiner <nathan@nathanreiner.xyz> | 2023-07-06 11:51:21 +0200 |
| commit | e1770cf3b0fd5eff3e69a8ec28c15018084eae73 (patch) | |
| tree | 0fc6289cd8b56f654a760d1ee7d748d160bcc251 /src | |
| parent | 3ca9adc0c5e138271dacab7691dac77da0ba0f21 (diff) | |
add extractors for docx, pptx, pdf, etc.
Diffstat (limited to 'src')
| -rw-r--r-- | src/extractors/mod.rs | 1 | ||||
| -rw-r--r-- | src/extractors/pdf.rs | 143 | ||||
| -rw-r--r-- | src/main.rs | 1 | ||||
| -rw-r--r-- | src/text/docx.rs | 9 | ||||
| -rw-r--r-- | src/text/mod.rs | 12 | ||||
| -rw-r--r-- | src/text/odp.rs | 9 | ||||
| -rw-r--r-- | src/text/odt.rs | 9 | ||||
| -rw-r--r-- | src/text/pdf.rs | 5 | ||||
| -rw-r--r-- | src/text/pptx.rs | 9 | ||||
| -rw-r--r-- | src/text/xlsx.rs | 9 |
10 files changed, 207 insertions, 0 deletions
diff --git a/src/extractors/mod.rs b/src/extractors/mod.rs
new file mode 100644
index 0000000..0f137f1
--- /dev/null
+++ b/src/extractors/mod.rs
@@ -0,0 +1 @@
+pub mod pdf;
diff --git a/src/extractors/pdf.rs b/src/extractors/pdf.rs
new file mode 100644
index 0000000..c08c75c
--- /dev/null
+++ b/src/extractors/pdf.rs
@@ -0,0 +1,170 @@
+// MIT License
+//
+// Copyright (c) 2016 Junfeng Liu
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+use std::collections::BTreeMap;
+use std::fmt::Debug;
+use std::io::{Error, ErrorKind};
+use std::path::Path;
+
+use lopdf::{Document, Object};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use serde::{Deserialize, Serialize};
+use serde_json;
+
+/// PDF object type names that `filter_func` rejects while a document is loaded.
+static IGNORE: &[&str] = &[
+    "Length",
+    "BBox",
+    "FormType",
+    "Matrix",
+    "Resources",
+    "Type",
+    "XObject",
+    "Subtype",
+    "Filter",
+    "ColorSpace",
+    "Width",
+    "Height",
+    "BitsPerComponent",
+    "Length1",
+    "Length2",
+    "Length3",
+    "PTEX.FileName",
+    "PTEX.PageNumber",
+    "PTEX.InfoDict",
+    "FontDescriptor",
+    "ExtGState",
+    "Font",
+    "MediaBox",
+    "Annot",
+];
+
+/// Text extracted from a PDF; `pdf2text` serializes this structure to JSON.
+#[derive(Debug, Deserialize, Serialize)]
+struct PdfText {
+    /// Lines of text per page; the key is the page number.
+    text: BTreeMap<u32, Vec<String>>,
+    /// Messages for the pages whose text could not be extracted.
+    errors: Vec<String>,
+}
+
+/// Load-time object filter passed to `Document::load_filtered`.
+///
+/// Returns `None` for objects whose type name is listed in `IGNORE` and for
+/// dictionaries left empty once the keys removed below are gone; every other
+/// object is returned as an owned copy together with its id.
+fn filter_func(object_id: (u32, u16), object: &mut Object) -> Option<((u32, u16), Object)> {
+    if IGNORE.contains(&object.type_name().unwrap_or_default()) {
+        return None;
+    }
+    if let Ok(d) = object.as_dict_mut() {
+        d.remove(b"Font");
+        d.remove(b"Resources");
+        d.remove(b"Producer");
+        d.remove(b"ModDate");
+        d.remove(b"Creator");
+        d.remove(b"ProcSet");
+        d.remove(b"XObject");
+        d.remove(b"MediaBox");
+        d.remove(b"Annots");
+        if d.is_empty() {
+            return None;
+        }
+    }
+    Some((object_id, object.to_owned()))
+}
+
+/// Loads the PDF at `path` through `filter_func`, converting a load failure
+/// into an `io::Error` of kind `Other` that carries the original message.
+fn load_pdf<P: AsRef<Path>>(path: P) -> Result<Document, Error> {
+    Document::load_filtered(path, filter_func).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))
+}
+
+/// Extracts the text of every page of `doc`, processing pages in parallel.
+///
+/// Each page's text is split on `'\n'` and every line has its trailing
+/// whitespace trimmed. A page that fails is recorded in `errors` instead of
+/// aborting, so the value returned by this function is always `Ok`.
+fn get_pdf_text(doc: &Document) -> Result<PdfText, Error> {
+    let mut pdf_text: PdfText = PdfText {
+        text: BTreeMap::new(),
+        errors: Vec::new(),
+    };
+    let pages: Vec<Result<(u32, Vec<String>), Error>> = doc
+        .get_pages()
+        .into_par_iter()
+        .map(
+            |(page_num, page_id): (u32, (u32, u16))| -> Result<(u32, Vec<String>), Error> {
+                let text = doc.extract_text(&[page_num]).map_err(|e| {
+                    Error::new(
+                        ErrorKind::Other,
+                        format!("Failed to extract text from page {page_num} id={page_id:?}: {e:?}"),
+                    )
+                })?;
+                Ok((
+                    page_num,
+                    text.split('\n')
+                        .map(|s| s.trim_end().to_string())
+                        .collect::<Vec<String>>(),
+                ))
+            },
+        )
+        .collect();
+    for page in pages {
+        match page {
+            Ok((page_num, lines)) => {
+                pdf_text.text.insert(page_num, lines);
+            }
+            Err(e) => {
+                pdf_text.errors.push(e.to_string());
+            }
+        }
+    }
+    Ok(pdf_text)
+}
+
+/// Extracts the text of the PDF at `path` and returns it as a JSON string.
+///
+/// The JSON is the serialized form of `PdfText`. An encrypted document yields
+/// an empty string. When pages failed to extract, the failure count and at
+/// most the first ten messages are written to stderr.
+///
+/// # Errors
+///
+/// Returns an error when the document cannot be loaded or the extracted text
+/// cannot be serialized.
+pub fn pdf2text(path: &str) -> Result<String, Error> {
+    let doc = load_pdf(path)?;
+    if doc.is_encrypted() {
+        return Ok(String::new());
+    }
+    let text = get_pdf_text(&doc)?;
+    if !text.errors.is_empty() {
+        eprintln!("{path:?} has {} errors:", text.errors.len());
+        // `take` caps the output at ten messages without panicking when fewer
+        // than ten pages failed, which slicing with `[..10]` would do.
+        for error in text.errors.iter().take(10) {
+            eprintln!("{error:?}");
+        }
+    }
+    serde_json::to_string(&text).map_err(|e| Error::new(ErrorKind::Other, e.to_string()))
+}
diff --git a/src/main.rs b/src/main.rs
index 3cae322..2e49fd4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,6 +5,7 @@ pub mod splitter;
 pub mod filecache;
 pub mod searchresult;
 pub mod filecounter;
+pub mod extractors;
 
 use vector::FileVector;
 use dictionary::Dictionary;
diff --git a/src/text/docx.rs b/src/text/docx.rs
new file mode 100644
index 0000000..7b4a80e
--- /dev/null
+++ b/src/text/docx.rs
@@ -0,0 +1,13 @@
+use std::io::Read;
+use dotext::{Docx, MsDoc};
+
+/// Returns the text of the docx file at `path`, or an empty string when the
+/// file cannot be opened.
+pub fn get_text(path : &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Docx::open(path) {
+        // Best effort: a read error is ignored and `content` is returned as is.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
index 4e1b9a4..dd969af 100644
--- a/src/text/mod.rs
+++ b/src/text/mod.rs
@@ -4,6 +4,12 @@ use std::path::Path;
 use std::collections::HashMap;
 
 mod txt;
+mod docx;
+mod xlsx;
+mod pptx;
+mod odt;
+mod odp;
+mod pdf;
 
 fn empty_extractor(_ : &str) -> String {
     "".to_string()
@@ -19,6 +25,12 @@ lazy_static! {
     static ref EXT: HashMap<&'static str, ExtFn> = {
         HashMap::from([
             ext!(txt),
+            ext!(docx),
+            ext!(xlsx),
+            ext!(pptx),
+            ext!(odt),
+            ext!(odp),
+            ext!(pdf),
         ])
     };
 }
diff --git a/src/text/odp.rs b/src/text/odp.rs
new file mode 100644
index 0000000..eaed196
--- /dev/null
+++ b/src/text/odp.rs
@@ -0,0 +1,13 @@
+use std::io::Read;
+use dotext::{Odp, doc::OpenOfficeDoc};
+
+/// Returns the text of the odp file at `path`, or an empty string when the
+/// file cannot be opened.
+pub fn get_text(path : &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Odp::open(path) {
+        // Best effort: a read error is ignored and `content` is returned as is.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/odt.rs b/src/text/odt.rs
new file mode 100644
index 0000000..10b5342
--- /dev/null
+++ b/src/text/odt.rs
@@ -0,0 +1,13 @@
+use std::io::Read;
+use dotext::{Odt, doc::OpenOfficeDoc};
+
+/// Returns the text of the odt file at `path`, or an empty string when the
+/// file cannot be opened.
+pub fn get_text(path : &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Odt::open(path) {
+        // Best effort: a read error is ignored and `content` is returned as is.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/pdf.rs b/src/text/pdf.rs
new file mode 100644
index 0000000..efa441f
--- /dev/null
+++ b/src/text/pdf.rs
@@ -0,0 +1,7 @@
+use crate::extractors::pdf;
+
+/// Returns the JSON produced by `pdf::pdf2text` for the file at `path`, or an
+/// empty string when extraction fails.
+pub fn get_text(path : &str) -> String {
+    pdf::pdf2text(path).unwrap_or_default()
+}
diff --git a/src/text/pptx.rs b/src/text/pptx.rs
new file mode 100644
index 0000000..7dac77e
--- /dev/null
+++ b/src/text/pptx.rs
@@ -0,0 +1,13 @@
+use std::io::Read;
+use dotext::{Pptx, MsDoc};
+
+/// Returns the text of the pptx file at `path`, or an empty string when the
+/// file cannot be opened.
+pub fn get_text(path : &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Pptx::open(path) {
+        // Best effort: a read error is ignored and `content` is returned as is.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}
diff --git a/src/text/xlsx.rs b/src/text/xlsx.rs
new file mode 100644
index 0000000..a438e96
--- /dev/null
+++ b/src/text/xlsx.rs
@@ -0,0 +1,13 @@
+use std::io::Read;
+use dotext::{Xlsx, MsDoc};
+
+/// Returns the text of the xlsx file at `path`, or an empty string when the
+/// file cannot be opened.
+pub fn get_text(path : &str) -> String {
+    let mut content = String::new();
+    if let Ok(mut file) = Xlsx::open(path) {
+        // Best effort: a read error is ignored and `content` is returned as is.
+        let _ = file.read_to_string(&mut content);
+    }
+    content
+}