diff --git a/exampledir/test/excel.xlsx b/exampledir/test/excel.xlsx new file mode 100644 index 0000000..1d3fb36 Binary files /dev/null and b/exampledir/test/excel.xlsx differ diff --git a/src/adapters.rs b/src/adapters.rs index f1f36f9..f47a19d 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -11,7 +11,9 @@ pub mod zip; use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; use anyhow::{format_err, Context, Result}; use async_trait::async_trait; +use custom::strs; use custom::CustomAdapterConfig; +use custom::CustomIdentifiers; use custom::BUILTIN_SPAWNING_ADAPTERS; use log::*; use tokio::io::AsyncRead; @@ -38,7 +40,7 @@ pub struct AdapterMeta { pub fast_matchers: Vec, /// list of matchers when we have mime type detection active (interpreted as ORed) /// warning: this *overrides* the fast matchers - pub slow_matchers: Option>, + pub slow_matchers: Vec, /// if true, slow_matchers is merged with fast matchers if accurate is enabled /// for example, in sqlite you want this disabled since the db extension can mean other things and the mime type matching is very accurate for sqlite. 
/// but for tar you want it enabled, since the tar extension is very accurate but the tar mime matcher can have false negatives @@ -48,39 +50,63 @@ pub struct AdapterMeta { } impl AdapterMeta { // todo: this is pretty ugly - pub fn get_matchers<'a>( - &'a self, - slow: bool, - ) -> Box> + 'a> { + pub fn get_matchers(&self, slow: bool) -> Box> + '_> { match ( slow, self.keep_fast_matchers_if_accurate, &self.slow_matchers, + &self.fast_matchers, ) { - (true, false, Some(ref sm)) => Box::new(sm.iter().map(Cow::Borrowed)), - (true, true, Some(ref sm)) => Box::new( + (true, false, sm, _) => Box::new(sm.iter().map(Cow::Borrowed)), + (true, true, sm, fm) => Box::new( sm.iter().map(Cow::Borrowed).chain( - self.fast_matchers - .iter() - .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), + fm.iter() + .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))) + .collect::>(), ), ), - // don't have slow matchers or slow matching disabled - (true, _, None) | (false, _, _) => Box::new( - self.fast_matchers - .iter() - .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), - ), + // slow matching disabled + (false, _, _, fm) => { + Box::new(fm.iter().map(|e| Cow::Owned(FileMatcher::Fast(e.clone())))) + } } } } -pub trait GetMetadata { - fn metadata(&self) -> &AdapterMeta; +pub trait Adapter { + fn name(&self) -> String; + fn version(&self) -> i32; + fn description(&self) -> String; + fn recurses(&self) -> bool; + fn disabled_by_default(&self) -> bool; + fn keep_fast_matchers_if_accurate(&self) -> bool; + fn extensions(&self) -> Vec; + fn mimetypes(&self) -> Vec; + + fn metadata(&self) -> AdapterMeta { + return AdapterMeta { + name: self.name(), + version: self.version(), + description: self.description(), + recurses: true, + fast_matchers: self + .extensions() + .iter() + .map(|s| FastFileMatcher::FileExtension(s.to_string())) + .collect(), + slow_matchers: self + .mimetypes() + .iter() + .map(|mimetype| FileMatcher::MimeType(mimetype.to_string())) + .collect(), + disabled_by_default: 
self.disabled_by_default(), + keep_fast_matchers_if_accurate: self.keep_fast_matchers_if_accurate(), + }; + } } #[async_trait] -pub trait FileAdapter: GetMetadata + Send + Sync { +pub trait FileAdapter: Adapter + Send + Sync { /// adapt a file. /// /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher @@ -109,7 +135,110 @@ pub struct AdaptInfo { /// (enabledAdapters, disabledAdapters) type AdaptersTuple = (Vec>, Vec>); -pub fn get_all_adapters(custom_adapters: Option>) -> AdaptersTuple { +pub fn get_all_adapters( + custom_identifiers: Option, + custom_adapters: Option>, +) -> AdaptersTuple { + // decompress + let mut bz2_extensions = strs(decompress::EXTENSIONS_BZ2); + let mut bz2_mimetypes = strs(decompress::MIMETYPES_BZ2); + let mut gz_extensions = strs(decompress::EXTENSIONS_GZ); + let mut gz_mimetypes = strs(decompress::MIMETYPES_GZ); + let mut xz_extensions = strs(decompress::EXTENSIONS_XZ); + let mut xz_mimetypes = strs(decompress::MIMETYPES_XZ); + let mut zst_extensions = strs(decompress::EXTENSIONS_ZST); + let mut zst_mimetypes = strs(decompress::MIMETYPES_ZST); + + let mut ffmpeg_extensions = strs(ffmpeg::EXTENSIONS); + let mut ffmpeg_mimetypes = strs(ffmpeg::MIMETYPES); + + let mut mbox_extensions = strs(mbox::EXTENSIONS); + let mut mbox_mimetypes = strs(mbox::MIMETYPES); + + let mut sqlite_extensions = strs(sqlite::EXTENSIONS); + let mut sqlite_mimetypes = strs(sqlite::MIMETYPES); + + let mut tar_extensions = strs(tar::EXTENSIONS); + let mut tar_mimetypes = strs(tar::MIMETYPES); + + let mut zip_extensions = strs(zip::EXTENSIONS); + let mut zip_mimetypes = strs(zip::MIMETYPES); + + if let Some(identifiers) = custom_identifiers { + if let Some(identifier) = identifiers.bz2 { + if let Some(extensions) = identifier.extensions { + bz2_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + bz2_mimetypes = mimetypes; + } + } + if let Some(identifier) = 
identifiers.gz { + if let Some(extensions) = identifier.extensions { + gz_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + gz_mimetypes = mimetypes; + } + } + if let Some(identifier) = identifiers.xz { + if let Some(extensions) = identifier.extensions { + xz_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + xz_mimetypes = mimetypes; + } + } + if let Some(identifier) = identifiers.zst { + if let Some(extensions) = identifier.extensions { + zst_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + zst_mimetypes = mimetypes; + } + } + if let Some(identifier) = identifiers.ffmpeg { + if let Some(extensions) = identifier.extensions { + ffmpeg_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + ffmpeg_mimetypes = mimetypes; + } + } + if let Some(identifier) = identifiers.mbox { + if let Some(extensions) = identifier.extensions { + mbox_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + mbox_mimetypes = mimetypes; + } + } + if let Some(identifier) = identifiers.sqlite { + if let Some(extensions) = identifier.extensions { + sqlite_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + sqlite_mimetypes = mimetypes; + } + } + if let Some(identifier) = identifiers.tar { + if let Some(extensions) = identifier.extensions { + tar_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + tar_mimetypes = mimetypes; + } + } + if let Some(identifier) = identifiers.zip { + if let Some(extensions) = identifier.extensions { + zip_extensions = extensions; + } + if let Some(mimetypes) = identifier.mimetypes { + zip_mimetypes = mimetypes; + } + } + } + // order in descending priority let mut adapters: Vec> = vec![]; if let Some(custom_adapters) = custom_adapters { @@ -120,12 +249,36 @@ pub fn get_all_adapters(custom_adapters: Option>) -> Ad let internal_adapters: Vec> = vec![ 
Arc::new(PostprocPageBreaks::default()), - Arc::new(ffmpeg::FFmpegAdapter::new()), - Arc::new(zip::ZipAdapter::new()), - Arc::new(decompress::DecompressAdapter::new()), - Arc::new(mbox::MboxAdapter::new()), - Arc::new(tar::TarAdapter::new()), - Arc::new(sqlite::SqliteAdapter::new()), + Arc::new(ffmpeg::FFmpegAdapter { + extensions: ffmpeg_extensions, + mimetypes: ffmpeg_mimetypes, + }), + Arc::new(zip::ZipAdapter { + extensions: zip_extensions, + mimetypes: zip_mimetypes, + }), + Arc::new(decompress::DecompressAdapter { + extensions_gz: gz_extensions, + extensions_bz2: bz2_extensions, + extensions_xz: xz_extensions, + extensions_zst: zst_extensions, + mimetypes_gz: gz_mimetypes, + mimetypes_bz2: bz2_mimetypes, + mimetypes_xz: xz_mimetypes, + mimetypes_zst: zst_mimetypes, + }), + Arc::new(mbox::MboxAdapter { + extensions: mbox_extensions, + mimetypes: mbox_mimetypes, + }), + Arc::new(sqlite::SqliteAdapter { + extensions: sqlite_extensions, + mimetypes: sqlite_mimetypes, + }), + Arc::new(tar::TarAdapter { + extensions: tar_extensions, + mimetypes: tar_mimetypes, + }), ]; adapters.extend( BUILTIN_SPAWNING_ADAPTERS @@ -148,10 +301,12 @@ pub fn get_all_adapters(custom_adapters: Option>) -> Ad * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) */ pub fn get_adapters_filtered>( + custom_identifiers: Option, custom_adapters: Option>, adapter_names: &[T], ) -> Result>> { - let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters); + let (def_enabled_adapters, def_disabled_adapters) = + get_all_adapters(custom_identifiers, custom_adapters); let adapters = if !adapter_names.is_empty() { let adapters_map: HashMap<_, _> = def_enabled_adapters .iter() diff --git a/src/adapters/custom.rs b/src/adapters/custom.rs index 3ae0e34..899cb60 100644 --- a/src/adapters/custom.rs +++ b/src/adapters/custom.rs @@ -1,12 +1,8 @@ use super::*; -use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; 
+use super::{AdaptInfo, Adapter, FileAdapter}; use crate::adapted_iter::one_file; -use crate::{ - adapted_iter::AdaptedFilesIterBox, - expand::expand_str_ez, - matching::{FastFileMatcher, FileMatcher}, -}; +use crate::{adapted_iter::AdaptedFilesIterBox, expand::expand_str_ez, matching::FileMatcher}; use crate::{join_handle_to_stream, to_io_err}; use anyhow::Result; use async_stream::stream; @@ -23,6 +19,42 @@ use tokio::process::Command; use tokio_util::io::StreamReader; // mostly the same as AdapterMeta + SpawningFileAdapter + +#[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] +pub struct CustomIdentifier { + /// The file extensions this adapter supports, for example `["gz", "tgz"]`. + pub extensions: Option>, + /// If not null and --rga-accurate is enabled, mimetype matching is used instead of file name matching. + pub mimetypes: Option>, +} + +#[derive(Debug, Deserialize, Serialize, JsonSchema, PartialEq, Clone)] +pub struct CustomIdentifiers { + /// The identifiers to process as bz2 archives. + pub bz2: Option, + /// The identifiers to process as gz archives. + pub gz: Option, + /// The identifiers to process as xz archives. + pub xz: Option, + /// The identifiers to process as zst archives. + pub zst: Option, + + /// The identifiers to process via ffmpeg. + pub ffmpeg: Option, + + /// The identifiers to process as mbox files. + pub mbox: Option, + + /// The identifiers to process as SQLite files. + pub sqlite: Option, + + /// The identifiers to process as tar files. + pub tar: Option, + + /// The identifiers to process as zip archives. + pub zip: Option, +} + #[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] pub struct CustomAdapterConfig { /// the unique identifier and name of this adapter. Must only include a-z, 0-9, _ @@ -36,7 +68,7 @@ pub struct CustomAdapterConfig { /// the file extensions this adapter supports. 
For example ["epub", "mobi"] pub extensions: Vec, /// if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching - pub mimetypes: Option>, + pub mimetypes: Vec, /// if --rga-accurate, only match by mime types, ignore extensions completely pub match_only_by_mime: Option, /// the name or path of the binary to run @@ -56,7 +88,7 @@ pub struct CustomAdapterConfig { pub output_path_hint: Option, } -fn strs(arr: &[&str]) -> Vec { +pub fn strs(arr: &[&str]) -> Vec { arr.iter().map(ToString::to_string).collect() } @@ -103,7 +135,7 @@ lazy_static! { version: 3, extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb", "html", "htm"]), binary: "pandoc".to_string(), - mimetypes: None, + mimetypes: Vec::new(), // simpler markdown (with more information loss but plainer text) //.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") args: strs(&[ @@ -123,8 +155,7 @@ lazy_static! 
{ .to_owned(), extensions: strs(&["pdf"]), - mimetypes: Some(strs(&["application/pdf"])), - + mimetypes: strs(&["application/pdf"]), binary: "pdftotext".to_string(), args: strs(&["-", "-"]), disabled_by_default: None, @@ -183,16 +214,46 @@ pub fn pipe_output( } pub struct CustomSpawningFileAdapter { + name: String, + version: i32, + description: String, + recurses: bool, + disabled_by_default: bool, + keep_fast_matchers_if_accurate: bool, + extensions: Vec, + mimetypes: Vec, binary: String, args: Vec, - meta: AdapterMeta, output_path_hint: Option, } -impl GetMetadata for CustomSpawningFileAdapter { - fn metadata(&self) -> &AdapterMeta { - &self.meta + +impl Adapter for CustomSpawningFileAdapter { + fn name(&self) -> String { + self.name.clone() + } + fn version(&self) -> i32 { + self.version + } + fn description(&self) -> String { + self.description.clone() + } + fn recurses(&self) -> bool { + self.recurses + } + fn disabled_by_default(&self) -> bool { + self.disabled_by_default + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + self.keep_fast_matchers_if_accurate + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } + fn arg_replacer(arg: &str, filepath_hint: &Path) -> Result { expand_str_ez(arg, |s| match s { "input_virtual_path" => Ok(filepath_hint.to_string_lossy()), @@ -265,33 +326,22 @@ impl FileAdapter for CustomSpawningFileAdapter { impl CustomAdapterConfig { pub fn to_adapter(&self) -> CustomSpawningFileAdapter { CustomSpawningFileAdapter { + name: self.name.clone(), + version: self.version, + description: format!( + "{}\nRuns: {} {}", + self.description, + self.binary, + self.args.join(" ") + ), + recurses: false, + disabled_by_default: self.disabled_by_default.unwrap_or(false), + keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), + extensions: self.extensions.clone(), + mimetypes: self.mimetypes.clone(), binary: self.binary.clone(), args: 
self.args.clone(), output_path_hint: self.output_path_hint.clone(), - meta: AdapterMeta { - name: self.name.clone(), - version: self.version, - description: format!( - "{}\nRuns: {} {}", - self.description, - self.binary, - self.args.join(" ") - ), - recurses: true, - fast_matchers: self - .extensions - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: self.mimetypes.as_ref().map(|mimetypes| { - mimetypes - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - }), - keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), - disabled_by_default: self.disabled_by_default.unwrap_or(false), - }, } } } @@ -344,7 +394,7 @@ PREFIX:Page 1: disabled_by_default: None, version: 1, extensions: vec!["txt".to_string()], - mimetypes: None, + mimetypes: Vec::new(), match_only_by_mime: None, binary: "sed".to_string(), args: vec!["s/e/u/g".to_string()], diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs index f4b96a7..185f456 100644 --- a/src/adapters/decompress.rs +++ b/src/adapters/decompress.rs @@ -3,51 +3,133 @@ use crate::adapted_iter::one_file; use super::*; use anyhow::Result; -use lazy_static::lazy_static; +use std::path::{Path, PathBuf}; +use std::str::FromStr; use tokio::io::BufReader; -use std::path::{Path, PathBuf}; +pub const EXTENSIONS_GZ: &[&str] = &["als", "gz", "tgz"]; +pub const EXTENSIONS_BZ2: &[&str] = &["bz2", "tbz", "tbz2"]; +pub const EXTENSIONS_XZ: &[&str] = &["xz"]; +pub const EXTENSIONS_ZST: &[&str] = &["zst"]; -static EXTENSIONS: &[&str] = &["als", "bz2", "gz", "tbz", "tbz2", "tgz", "xz", "zst"]; -static MIME_TYPES: &[&str] = &[ - "application/gzip", - "application/x-bzip", - "application/x-xz", - "application/zstd", -]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "decompress".to_owned(), - version: 1, - description: - "Reads compressed file as a stream and runs a different extractor on the contents." 
- .to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some( - MIME_TYPES - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - ), - disabled_by_default: false, - keep_fast_matchers_if_accurate: true - }; +#[derive(Debug, PartialEq, Eq)] +struct DecompressError; + +#[derive(Debug, PartialEq)] +enum Extension { + Gz, + Bz2, + Xz, + Zst, +} +impl FromStr for Extension { + type Err = DecompressError; + + fn from_str(ext: &str) -> Result { + if EXTENSIONS_GZ.contains(&ext) { + Ok(Extension::Gz) + } else if EXTENSIONS_BZ2.contains(&ext) { + Ok(Extension::Bz2) + } else if EXTENSIONS_XZ.contains(&ext) { + Ok(Extension::Xz) + } else if EXTENSIONS_ZST.contains(&ext) { + Ok(Extension::Zst) + } else { + Err(DecompressError) + } + } } -#[derive(Default)] -pub struct DecompressAdapter; -impl DecompressAdapter { - pub fn new() -> DecompressAdapter { - DecompressAdapter +pub const MIMETYPES_GZ: &[&str] = &["application/gzip"]; +pub const MIMETYPES_BZ2: &[&str] = &["application/x-bzip"]; +pub const MIMETYPES_XZ: &[&str] = &["application/x-xz"]; +pub const MIMETYPES_ZST: &[&str] = &["application/zstd"]; + +#[derive(Debug, PartialEq)] +enum Mime { + Gz, + Bz2, + Xz, + Zst, +} +impl FromStr for Mime { + type Err = DecompressError; + + fn from_str(ext: &str) -> Result { + if MIMETYPES_GZ.contains(&ext) { + Ok(Mime::Gz) + } else if MIMETYPES_BZ2.contains(&ext) { + Ok(Mime::Bz2) + } else if MIMETYPES_XZ.contains(&ext) { + Ok(Mime::Xz) + } else if MIMETYPES_ZST.contains(&ext) { + Ok(Mime::Zst) + } else { + Err(DecompressError) + } } } -impl GetMetadata for DecompressAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +#[derive(Default)] +pub struct DecompressAdapter { + pub extensions_gz: Vec, + pub extensions_bz2: Vec, + pub extensions_xz: Vec, + pub extensions_zst: Vec, + pub mimetypes_gz: Vec, + pub mimetypes_bz2: Vec, + pub mimetypes_xz: Vec, + pub 
mimetypes_zst: Vec, +} + +impl Adapter for DecompressAdapter { + fn name(&self) -> String { + String::from("decompress") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Reads compressed file as a stream and runs a different extractor on the contents.", + ) + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + let mut extensions: Vec = Vec::new(); + for exts in [ + &self.extensions_gz, + &self.extensions_bz2, + &self.extensions_xz, + &self.extensions_zst, + ] { + for ext in exts { + extensions.push(ext.to_string()) + } + } + extensions + } + fn mimetypes(&self) -> Vec { + let mut mimetypes: Vec = Vec::new(); + for mimes in [ + &self.mimetypes_gz, + &self.mimetypes_bz2, + &self.mimetypes_xz, + &self.mimetypes_zst, + ] { + for mime in mimes { + mimetypes.push(mime.to_string()) + } + } + mimetypes } } @@ -61,19 +143,19 @@ fn decompress_any(reason: &FileMatcher, inp: ReadBox) -> Result { let zst = |inp: ReadBox| Box::pin(bufread::ZstdDecoder::new(BufReader::new(inp))); Ok(match reason { - Fast(FileExtension(ext)) => match ext.as_ref() { - "als" | "gz" | "tgz" => gz(inp), - "bz2" | "tbz" | "tbz2" => bz2(inp), - "zst" => zst(inp), - "xz" => xz(inp), - ext => Err(format_err!("don't know how to decompress {}", ext))?, + Fast(FileExtension(ext)) => match Extension::from_str(ext) { + Ok(Extension::Gz) => gz(inp), + Ok(Extension::Bz2) => bz2(inp), + Ok(Extension::Zst) => zst(inp), + Ok(Extension::Xz) => xz(inp), + Err(_) => Err(format_err!("don't know how to decompress {}", ext))?, }, - MimeType(mime) => match mime.as_ref() { - "application/gzip" => gz(inp), - "application/x-bzip" => bz2(inp), - "application/x-xz" => xz(inp), - "application/zstd" => zst(inp), - mime => Err(format_err!("don't know how to decompress mime {}", mime))?, + MimeType(mime) => match Mime::from_str(mime) { + 
Ok(Mime::Gz) => gz(inp), + Ok(Mime::Bz2) => bz2(inp), + Ok(Mime::Xz) => xz(inp), + Ok(Mime::Zst) => zst(inp), + Err(_) => Err(format_err!("don't know how to decompress mime {}", mime))?, }, }) } @@ -137,7 +219,7 @@ mod tests { #[tokio::test] async fn gz() -> Result<()> { - let adapter = DecompressAdapter; + let adapter = DecompressAdapter::default(); let filepath = test_data_dir().join("hello.gz"); @@ -150,7 +232,7 @@ mod tests { #[tokio::test] async fn pdf_gz() -> Result<()> { - let adapter = DecompressAdapter; + let adapter = DecompressAdapter::default(); let filepath = test_data_dir().join("short.pdf.gz"); diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index 32298fe..6e23a2a 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -2,7 +2,6 @@ use super::*; use super::{custom::map_exe_error, writing::async_writeln}; use anyhow::*; use async_trait::async_trait; -use lazy_static::lazy_static; use regex::Regex; use serde::{Deserialize, Serialize}; use std::process::Stdio; @@ -10,41 +9,45 @@ use tokio::io::AsyncWrite; use tokio::io::{AsyncBufReadExt, BufReader}; use tokio::process::Command; use writing::WritingFileAdapter; -// todo: + // maybe todo: read list of extensions from // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null // but really, the probability of getting useful information from a .flv is low -static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; +pub const EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; +pub const MIMETYPES: &[&str] = &[]; -lazy_static! 
{ - static ref METADATA: AdapterMeta = AdapterMeta { - name: "ffmpeg".to_owned(), - version: 1, - description: - "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata" - .to_owned(), - recurses: false, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: None, - disabled_by_default: false, - keep_fast_matchers_if_accurate: true - }; +#[derive(Clone)] +pub struct FFmpegAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct FFmpegAdapter; - -impl FFmpegAdapter { - pub fn new() -> FFmpegAdapter { - FFmpegAdapter +impl Adapter for FFmpegAdapter { + fn name(&self) -> String { + String::from("ffmpeg") } -} -impl GetMetadata for FFmpegAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata.", + ) + } + fn recurses(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } diff --git a/src/adapters/mbox.rs b/src/adapters/mbox.rs index ee39d0d..d6bf31f 100644 --- a/src/adapters/mbox.rs +++ b/src/adapters/mbox.rs @@ -9,42 +9,45 @@ use tokio::io::AsyncReadExt; use std::{collections::VecDeque, io::Cursor}; -static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; -static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"]; +pub const EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; +pub const MIMETYPES: &[&str] = &["application/mbox", "message/rfc822"]; + lazy_static! 
{ - static ref METADATA: AdapterMeta = AdapterMeta { - name: "mail".to_owned(), - version: 1, - description: - "Reads mailbox/mail files and runs extractors on the contents and attachments." - .to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some( - MIME_TYPES - .iter() - .map(|s| FileMatcher::MimeType(s.to_string())) - .collect() - ), - disabled_by_default: true, - keep_fast_matchers_if_accurate: true - }; static ref FROM_REGEX: Regex = Regex::new("\r?\nFrom [^\n]+\n").unwrap(); } + #[derive(Default)] -pub struct MboxAdapter; +pub struct MboxAdapter { + pub extensions: Vec, + pub mimetypes: Vec, +} -impl MboxAdapter { - pub fn new() -> MboxAdapter { - MboxAdapter +impl Adapter for MboxAdapter { + fn name(&self) -> String { + String::from("mail") } -} -impl GetMetadata for MboxAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Reads mailbox/mail files and runs extractors on the contents and attachments.", + ) + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + true + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } @@ -138,7 +141,7 @@ mod tests { #[tokio::test] async fn mail_simple() -> Result<()> { - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("github_email.eml"); @@ -171,7 +174,7 @@ mod tests { #[tokio::test] async fn mbox_simple() -> Result<()> { - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let filepath = test_data_dir().join("test.mbx"); @@ -197,7 +200,7 @@ mod tests { async fn mbox_attachment() -> Result<()> { init_logging(); - let adapter = MboxAdapter; + let adapter = MboxAdapter::default(); let 
filepath = test_data_dir().join("mail_with_attachment.mbox"); diff --git a/src/adapters/postproc.rs b/src/adapters/postproc.rs index 45ec2a7..5339d10 100644 --- a/src/adapters/postproc.rs +++ b/src/adapters/postproc.rs @@ -1,4 +1,4 @@ -//trait RunFnAdapter: GetMetadata {} +//trait RunFnAdapter: Adapter {} //impl FileAdapter for T where T: RunFnAdapter {} @@ -19,30 +19,38 @@ use tokio_util::io::StreamReader; use crate::adapted_iter::one_file; use crate::adapted_iter::AdaptedFilesIterBox; -use crate::matching::FastFileMatcher; -use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; fn add_newline(ar: impl AsyncRead + Send) -> impl AsyncRead + Send { ar.chain(Cursor::new(&[b'\n'])) } pub struct PostprocPrefix {} -impl GetMetadata for PostprocPrefix { - fn metadata(&self) -> &super::AdapterMeta { - lazy_static::lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "postprocprefix".to_owned(), - version: 1, - description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(), - recurses: false, - fast_matchers: vec![], - slow_matchers: None, - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; - } - &METADATA +impl Adapter for PostprocPrefix { + fn name(&self) -> String { + String::from("postprocprefix") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Adds the line prefix to each line (e.g. 
the filename within a zip)") + } + fn recurses(&self) -> bool { + false + } + fn mimetypes(&self) -> Vec { + [].into() + } + fn extensions(&self) -> Vec { + [].into() + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false } } #[async_trait] @@ -155,21 +163,30 @@ pub fn postproc_prefix(line_prefix: &str, inp: impl AsyncRead + Send) -> impl As #[derive(Default)] pub struct PostprocPageBreaks {} -impl GetMetadata for PostprocPageBreaks { - fn metadata(&self) -> &super::AdapterMeta { - lazy_static::lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "postprocpagebreaks".to_owned(), - version: 1, - description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.".to_owned(), - recurses: false, - fast_matchers: vec![FastFileMatcher::FileExtension("asciipagebreaks".to_string())], - slow_matchers: None, - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; - } - &METADATA +impl Adapter for PostprocPageBreaks { + fn name(&self) -> String { + String::from("postprocpagebreaks") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.") + } + fn recurses(&self) -> bool { + false + } + fn extensions(&self) -> Vec { + vec![String::from("asciipagebreaks")] + } + fn mimetypes(&self) -> Vec { + [].into() + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true } } #[async_trait] diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index 0e8c1b9..ddaf487 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -1,7 +1,6 @@ use super::{writing::WritingFileAdapter, *}; use anyhow::Result; use 
async_trait::async_trait; -use lazy_static::lazy_static; use log::*; use rusqlite::types::ValueRef; use rusqlite::*; @@ -10,39 +9,50 @@ use tokio::io::AsyncWrite; use tokio_util::io::SyncIoBridge; -static EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; +pub const EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; +pub const MIMETYPES: &[&str] = &["application/x-sqlite3"]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "sqlite".to_owned(), - version: 1, - description: - "Uses sqlite bindings to convert sqlite databases into a simple plain text format" - .to_owned(), - recurses: false, // set to true if we decide to make sqlite blobs searchable (gz blob in db is kinda common I think) - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![FileMatcher::MimeType( - "application/x-sqlite3".to_owned() - )]), - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; +#[derive(Clone)] +pub struct SqliteAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct SqliteAdapter; - -impl SqliteAdapter { - pub fn new() -> SqliteAdapter { - SqliteAdapter +impl Default for SqliteAdapter { + fn default() -> SqliteAdapter { + SqliteAdapter { + extensions: EXTENSIONS.iter().map(|&s| s.to_string()).collect(), + mimetypes: MIMETYPES.iter().map(|&s| s.to_string()).collect(), + } } } -impl GetMetadata for SqliteAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +impl Adapter for SqliteAdapter { + fn name(&self) -> String { + String::from("sqlite") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from( + "Uses sqlite bindings to convert sqlite databases into a simple plain text format", + ) + } + fn recurses(&self) -> bool { + false + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn 
extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index 144bd20..e4fcf68 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -1,48 +1,56 @@ -use crate::{ - adapted_iter::AdaptedFilesIterBox, - adapters::AdapterMeta, - matching::{FastFileMatcher, FileMatcher}, - print_bytes, -}; +use crate::{adapted_iter::AdaptedFilesIterBox, matching::FileMatcher, print_bytes}; use anyhow::*; use async_stream::stream; use async_trait::async_trait; -use lazy_static::lazy_static; use log::*; use std::path::PathBuf; use tokio_stream::StreamExt; -use super::{AdaptInfo, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; -static EXTENSIONS: &[&str] = &["tar"]; +pub const EXTENSIONS: &[&str] = &["tar"]; +pub const MIMETYPES: &[&str] = &[]; -lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "tar".to_owned(), - version: 1, - description: "Reads a tar file as a stream and recurses down into its contents".to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: None, - keep_fast_matchers_if_accurate: true, - disabled_by_default: false - }; +#[derive(Clone)] +pub struct TarAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct TarAdapter; -impl TarAdapter { - pub fn new() -> TarAdapter { - TarAdapter +impl Default for TarAdapter { + fn default() -> TarAdapter { + TarAdapter { + extensions: EXTENSIONS.iter().map(|&s| s.to_string()).collect(), + mimetypes: MIMETYPES.iter().map(|&s| s.to_string()).collect(), + } } } -impl GetMetadata for TarAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +impl Adapter for TarAdapter { + fn name(&self) -> String { + String::from("tar") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Reads a 
tar file as a stream and recurses down into its contents") + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + true + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } @@ -108,7 +116,7 @@ mod tests { let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); - let adapter = TarAdapter::new(); + let adapter = TarAdapter::default(); let r = loop_adapt(&adapter, d, a).await.context("adapt")?; let o = adapted_to_vec(r).await.context("adapted_to_vec")?; assert_eq!( diff --git a/src/adapters/writing.rs b/src/adapters/writing.rs index b17152a..1ed13d0 100644 --- a/src/adapters/writing.rs +++ b/src/adapters/writing.rs @@ -2,13 +2,13 @@ use std::pin::Pin; use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err}; -use super::{AdaptInfo, FileAdapter, GetMetadata}; +use super::{AdaptInfo, Adapter, FileAdapter}; use anyhow::{Context, Result}; use async_trait::async_trait; use tokio::io::{AsyncReadExt, AsyncWrite}; #[async_trait] -pub trait WritingFileAdapter: GetMetadata + Send + Sync + Clone { +pub trait WritingFileAdapter: Adapter + Send + Sync + Clone { async fn adapt_write( a: super::AdaptInfo, detection_reason: &crate::matching::FileMatcher, diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index 8c30407..d616528 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -2,39 +2,50 @@ use super::*; use crate::print_bytes; use anyhow::*; use async_stream::stream; -use lazy_static::lazy_static; use log::*; -// TODO: allow users to configure file extensions instead of hard coding the list -// https://github.com/phiresky/ripgrep-all/pull/208#issuecomment-2173241243 -static EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; +pub const EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; +pub const MIMETYPES: &[&str] = &["application/zip"]; 
-lazy_static! { - static ref METADATA: AdapterMeta = AdapterMeta { - name: "zip".to_owned(), - version: 1, - description: "Reads a zip file as a stream and recurses down into its contents".to_owned(), - recurses: true, - fast_matchers: EXTENSIONS - .iter() - .map(|s| FastFileMatcher::FileExtension(s.to_string())) - .collect(), - slow_matchers: Some(vec![FileMatcher::MimeType("application/zip".to_owned())]), - keep_fast_matchers_if_accurate: false, - disabled_by_default: false - }; +#[derive(Debug, Clone)] +pub struct ZipAdapter { + pub extensions: Vec, + pub mimetypes: Vec, } -#[derive(Default, Clone)] -pub struct ZipAdapter; -impl ZipAdapter { - pub fn new() -> ZipAdapter { - ZipAdapter +impl Default for ZipAdapter { + fn default() -> ZipAdapter { + ZipAdapter { + extensions: EXTENSIONS.iter().map(|&s| s.to_string()).collect(), + mimetypes: MIMETYPES.iter().map(|&s| s.to_string()).collect(), + } } } -impl GetMetadata for ZipAdapter { - fn metadata(&self) -> &AdapterMeta { - &METADATA + +impl Adapter for ZipAdapter { + fn name(&self) -> String { + String::from("zip") + } + fn version(&self) -> i32 { + 1 + } + fn description(&self) -> String { + String::from("Reads a zip file as a stream and recurses down into its contents") + } + fn recurses(&self) -> bool { + true + } + fn disabled_by_default(&self) -> bool { + false + } + fn keep_fast_matchers_if_accurate(&self) -> bool { + false + } + fn extensions(&self) -> Vec { + self.extensions.clone() + } + fn mimetypes(&self) -> Vec { + self.mimetypes.clone() } } @@ -225,7 +236,7 @@ mod test { async fn only_seek_zip_fs() -> Result<()> { let zip = test_data_dir().join("only-seek-zip.zip"); let (a, d) = simple_fs_adapt_info(&zip).await?; - let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?; + let _v = adapted_to_vec(loop_adapt(&ZipAdapter::default(), d, a).await?).await?; // assert_eq!(String::from_utf8(v)?, ""); Ok(()) @@ -242,7 +253,7 @@ mod test { #[tokio::test] async fn recurse() -> Result<()> { 
let zipfile = create_zip("outer.txt", "outer text file", true).await?; - let adapter = ZipAdapter::new(); + let adapter = ZipAdapter::default(); let (a, d) = simple_adapt_info( &PathBuf::from("outer.zip"), @@ -257,4 +268,25 @@ mod test { Ok(()) } + + #[tokio::test] + async fn search_xlsx_with_extension_config() -> Result<()> { + let zip = test_data_dir().join("excel.xlsx"); + let (a, d) = simple_fs_adapt_info(&zip).await?; + let v = adapted_to_vec( + loop_adapt( + &ZipAdapter { + extensions: vec![String::from("xlsx")], + mimetypes: Vec::new(), + }, + d, + a, + ) + .await?, + ) + .await?; + assert_eq!(String::from_utf8(v[..18].to_vec())?, "PREFIX:_rels/.rels"); // first filename in the spreadsheet archive + + Ok(()) + } } diff --git a/src/bin/rga.rs b/src/bin/rga.rs index c3ed99d..42ea906 100644 --- a/src/bin/rga.rs +++ b/src/bin/rga.rs @@ -12,7 +12,8 @@ use std::process::Command; use std::time::Instant; fn list_adapters(args: RgaConfig) -> Result<()> { - let (enabled_adapters, disabled_adapters) = get_all_adapters(args.custom_adapters); + let (enabled_adapters, disabled_adapters) = + get_all_adapters(args.custom_identifiers, args.custom_adapters); println!("Adapters:\n"); let print = |adapter: std::sync::Arc| { @@ -27,8 +28,6 @@ fn list_adapters(args: RgaConfig) -> Result<()> { .join(", "); let slow_matchers = meta .slow_matchers - .as_ref() - .unwrap_or(&vec![]) .iter() .filter_map(|m| match m { FileMatcher::MimeType(x) => Some(x.to_string()), @@ -87,14 +86,18 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - let adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; + let adapters = get_adapters_filtered( + config.custom_identifiers.clone(), + config.custom_adapters.clone(), + &config.adapters, + )?; let pre_glob = if !config.accurate { let extensions = adapters .iter() - .flat_map(|a| &a.metadata().fast_matchers) - .flat_map(|m| match m { - FastFileMatcher::FileExtension(ext) => vec![ext.clone(), ext.to_ascii_uppercase()], + 
.flat_map(|a| a.metadata().fast_matchers) + .map(|matcher| match matcher { + FastFileMatcher::FileExtension(_) => matcher.to_string(), }) .collect::>() .join(","); diff --git a/src/config.rs b/src/config.rs index 768ce12..ac2c802 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,4 +1,7 @@ -use crate::{adapters::custom::CustomAdapterConfig, project_dirs}; +use crate::{ + adapters::custom::{CustomAdapterConfig, CustomIdentifiers}, + project_dirs, +}; use anyhow::{Context, Result}; use derive_more::FromStr; use log::*; @@ -170,6 +173,14 @@ pub struct RgaConfig { #[serde(default, skip_serializing_if = "is_default")] #[structopt(skip)] pub custom_adapters: Option>, + + ////////////////////////////////////////// + //////////////////////////// Config file only + ////////////////////////////////////////// + #[serde(default, skip_serializing_if = "is_default")] + #[structopt(skip)] + pub custom_identifiers: Option, + ////////////////////////////////////////// //////////////////////////// CMD line only ////////////////////////////////////////// diff --git a/src/matching.rs b/src/matching.rs index 3b67ba4..2839103 100644 --- a/src/matching.rs +++ b/src/matching.rs @@ -7,8 +7,8 @@ use anyhow::*; use regex::{Regex, RegexSet}; +use std::fmt; use std::iter::Iterator; - use std::sync::Arc; // match only based on file path @@ -24,6 +24,20 @@ pub enum FastFileMatcher { // todo: maybe allow matching a directory (e.g. /var/lib/postgres) } +impl std::fmt::Display for FastFileMatcher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + FastFileMatcher::FileExtension(val) => { + // Write strictly the first element into the supplied output + // stream: `f`. Returns `fmt::Result` which indicates whether the + // operation succeeded or failed. Note that `write!` uses syntax which + // is very similar to `println!`. 
+ write!(f, "{}", val) + } + } + } +} + #[derive(Clone, Debug)] pub enum FileMatcher { /// any type of fast matcher @@ -40,12 +54,12 @@ impl From for FileMatcher { } } -pub struct FileMeta { +pub struct FileMeta<'a> { // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either, // and since we probably only want to do only matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed pub lossy_filename: String, // only given when slow matching is enabled - pub mimetype: Option<&'static str>, + pub mimetype: Option<&'a str>, } pub fn extension_to_regex(extension: &str) -> Regex { diff --git a/src/preproc.rs b/src/preproc.rs index 32f3fa8..089ad03 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -32,7 +32,11 @@ async fn choose_adapter( archive_recursion_depth: i32, inp: &mut (impl AsyncBufRead + Unpin), ) -> Result, FileMatcher, ActiveAdapters)>> { - let active_adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; + let active_adapters = get_adapters_filtered( + config.custom_identifiers.clone(), + config.custom_adapters.clone(), + &config.adapters, + )?; let adapters = adapter_matcher(&active_adapters, config.accurate)?; let filename = filepath_hint .file_name() @@ -255,7 +259,7 @@ pub async fn loop_adapt_inner( ai.filepath_hint.to_string_lossy(), &adapter.metadata().name ); - for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? { + for await ifile in loop_adapt(adapter.clone().as_ref(), detection_reason, ai).await? { yield ifile; } }