diff options
Diffstat (limited to 'src/text')
| -rw-r--r-- | src/text/docx.rs | 9 | ||||
| -rw-r--r-- | src/text/mod.rs | 12 | ||||
| -rw-r--r-- | src/text/odp.rs | 9 | ||||
| -rw-r--r-- | src/text/odt.rs | 9 | ||||
| -rw-r--r-- | src/text/pdf.rs | 5 | ||||
| -rw-r--r-- | src/text/pptx.rs | 9 | ||||
| -rw-r--r-- | src/text/xlsx.rs | 9 |
7 files changed, 62 insertions, 0 deletions
diff --git a/src/text/docx.rs b/src/text/docx.rs
new file mode 100644
index 0000000..7b4a80e
--- /dev/null
+++ b/src/text/docx.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Docx, MsDoc};
+
+pub fn get_text(path : &str) -> String {
+    let mut file = Docx::open(path).unwrap();
+    let mut content = String::new();
+    let _ = file.read_to_string(&mut content);
+    content
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
index 4e1b9a4..dd969af 100644
--- a/src/text/mod.rs
+++ b/src/text/mod.rs
@@ -4,6 +4,12 @@ use std::path::Path;
 use std::collections::HashMap;
 
 mod txt;
+mod docx;
+mod xlsx;
+mod pptx;
+mod odt;
+mod odp;
+mod pdf;
 
 fn empty_extractor(_ : &str) -> String {
     "".to_string()
@@ -19,6 +25,12 @@ lazy_static! {
     static ref EXT: HashMap<&'static str, ExtFn> = {
         HashMap::from([
             ext!(txt),
+            ext!(docx),
+            ext!(xlsx),
+            ext!(pptx),
+            ext!(odt),
+            ext!(odp),
+            ext!(pdf),
         ])
     };
 }
diff --git a/src/text/odp.rs b/src/text/odp.rs
new file mode 100644
index 0000000..eaed196
--- /dev/null
+++ b/src/text/odp.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Odp, doc::OpenOfficeDoc};
+
+pub fn get_text(path : &str) -> String {
+    let mut file = Odp::open(path).unwrap();
+    let mut content = String::new();
+    let _ = file.read_to_string(&mut content);
+    content
+}
diff --git a/src/text/odt.rs b/src/text/odt.rs
new file mode 100644
index 0000000..10b5342
--- /dev/null
+++ b/src/text/odt.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Odt, doc::OpenOfficeDoc};
+
+pub fn get_text(path : &str) -> String {
+    let mut file = Odt::open(path).unwrap();
+    let mut content = String::new();
+    let _ = file.read_to_string(&mut content);
+    content
+}
diff --git a/src/text/pdf.rs b/src/text/pdf.rs
new file mode 100644
index 0000000..efa441f
--- /dev/null
+++ b/src/text/pdf.rs
@@ -0,0 +1,5 @@
+use crate::extractors::pdf;
+
+pub fn get_text(path : &str) -> String {
+    pdf::pdf2text(path).ok().unwrap_or_else(|| "".to_string())
+}
diff --git a/src/text/pptx.rs b/src/text/pptx.rs
new file mode 100644
index 0000000..7dac77e
--- /dev/null
+++ b/src/text/pptx.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Pptx, MsDoc};
+
+pub fn get_text(path : &str) -> String {
+    let mut file = Pptx::open(path).unwrap();
+    let mut content = String::new();
+    let _ = file.read_to_string(&mut content);
+    content
+}
diff --git a/src/text/xlsx.rs b/src/text/xlsx.rs
new file mode 100644
index 0000000..a438e96
--- /dev/null
+++ b/src/text/xlsx.rs
@@ -0,0 +1,9 @@
+use std::io::Read;
+use dotext::{Xlsx, MsDoc};
+
+pub fn get_text(path : &str) -> String {
+    let mut file = Xlsx::open(path).unwrap();
+    let mut content = String::new();
+    let _ = file.read_to_string(&mut content);
+    content
+}