feat: add --max-docs parameter to unstructured-ingest (#402)

* Added `--max-docs` parameter to unstructured-ingest
This commit is contained in:
natygyoon 2023-03-30 03:24:12 +09:00 committed by GitHub
parent 65fec954ba
commit 1da40806da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 17 additions and 2 deletions

View File

@ -1,4 +1,4 @@
## 0.5.8-dev1
## 0.5.8-dev2
### Enhancements
@ -6,6 +6,7 @@
### Features
* Add `--max-docs` parameter to `unstructured-ingest` to limit the number of documents processed.
* Added `partition_msg` for processing MSFT Outlook .msg files.
### Fixes

View File

@ -1 +1 @@
__version__ = "0.5.8-dev1" # pragma: no cover
__version__ = "0.5.8-dev2" # pragma: no cover

View File

@ -43,6 +43,7 @@ class MainProcess:
num_processes,
reprocess,
verbose,
max_docs,
):
# initialize the reader and writer
self.doc_connector = doc_connector
@ -50,6 +51,7 @@ class MainProcess:
self.num_processes = num_processes
self.reprocess = reprocess
self.verbose = verbose
self.max_docs = max_docs
def initialize(self):
"""Slower initialization things: check connections, load things into memory, etc."""
@ -63,6 +65,10 @@ class MainProcess:
def _filter_docs_with_outputs(self, docs):
num_docs_all = len(docs)
docs = [doc for doc in docs if not doc.has_output()]
if self.max_docs is not None:
if num_docs_all > self.max_docs:
num_docs_all = self.max_docs
docs = docs[: self.max_docs]
num_docs_to_process = len(docs)
if num_docs_to_process == 0:
logger.info(
@ -103,6 +109,12 @@ class MainProcess:
@click.command()
@click.option(
"--max-docs",
default=None,
type=int,
help="If specified, process at most specified number of documents.",
)
@click.option(
"--flatten-metadata",
is_flag=True,
@ -354,6 +366,7 @@ def main(
metadata_exclude,
fields_include,
flatten_metadata,
max_docs,
):
if flatten_metadata and "metadata" not in fields_include:
logger.warning(
@ -610,6 +623,7 @@ def main(
num_processes=num_processes,
reprocess=reprocess,
verbose=verbose,
max_docs=max_docs,
).run()