From 1da40806dae1cee9e740953fafaecf18688a28e3 Mon Sep 17 00:00:00 2001 From: natygyoon <59327875+natygyoon@users.noreply.github.com> Date: Thu, 30 Mar 2023 03:24:12 +0900 Subject: [PATCH] feat: add --max-docs parameter to unstructured-ingest (#402) * added --max-docs parameter to unstructured-ingest --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- unstructured/ingest/main.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b8ab2960..60e0bb5a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.8-dev1 +## 0.5.8-dev2 ### Enhancements @@ -6,6 +6,7 @@ ### Features +* Add `--max-docs` parameter to `unstructured-ingest` * Added `partition_msg` for processing MSFT Outlook .msg files. ### Fixes diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 77a2b5e47..84f113780 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.8-dev1" # pragma: no cover +__version__ = "0.5.8-dev2" # pragma: no cover diff --git a/unstructured/ingest/main.py b/unstructured/ingest/main.py index ed758e480..6f3aa29c4 100755 --- a/unstructured/ingest/main.py +++ b/unstructured/ingest/main.py @@ -43,6 +43,7 @@ class MainProcess: num_processes, reprocess, verbose, + max_docs, ): # initialize the reader and writer self.doc_connector = doc_connector @@ -50,6 +51,7 @@ class MainProcess: self.num_processes = num_processes self.reprocess = reprocess self.verbose = verbose + self.max_docs = max_docs def initialize(self): """Slower initialization things: check connections, load things into memory, etc.""" @@ -63,6 +65,10 @@ class MainProcess: def _filter_docs_with_outputs(self, docs): num_docs_all = len(docs) docs = [doc for doc in docs if not doc.has_output()] + if self.max_docs is not None: + if num_docs_all > self.max_docs: + num_docs_all = self.max_docs + docs = docs[: self.max_docs] num_docs_to_process = len(docs) if num_docs_to_process == 0: logger.info( @@ -103,6 +109,12 @@ class MainProcess: @click.command() +@click.option( + "--max-docs", + default=None, + type=int, + help="If specified, process at most specified number of documents.", +) @click.option( "--flatten-metadata", is_flag=True, @@ -354,6 +366,7 @@ def main( metadata_exclude, fields_include, flatten_metadata, + max_docs, ): if flatten_metadata and "metadata" not in fields_include: logger.warning( @@ -610,6 +623,7 @@ def main( num_processes=num_processes, reprocess=reprocess, verbose=verbose, + max_docs=max_docs, ).run()