feat: add --max-docs parameter to unstructured-ingest (#402)

* Added `--max-docs` parameter to `unstructured-ingest`
This commit is contained in:
natygyoon 2023-03-30 03:24:12 +09:00 committed by GitHub
parent 65fec954ba
commit 1da40806da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 17 additions and 2 deletions

View File

@ -1,4 +1,4 @@
## 0.5.8-dev1 ## 0.5.8-dev2
### Enhancements ### Enhancements
@ -6,6 +6,7 @@
### Features ### Features
* Add `--max-docs` parameter to `unstructured-ingest`
* Added `partition_msg` for processing MSFT Outlook .msg files. * Added `partition_msg` for processing MSFT Outlook .msg files.
### Fixes ### Fixes

View File

@ -1 +1 @@
__version__ = "0.5.8-dev1" # pragma: no cover __version__ = "0.5.8-dev2" # pragma: no cover

View File

@ -43,6 +43,7 @@ class MainProcess:
num_processes, num_processes,
reprocess, reprocess,
verbose, verbose,
max_docs,
): ):
# initialize the reader and writer # initialize the reader and writer
self.doc_connector = doc_connector self.doc_connector = doc_connector
@ -50,6 +51,7 @@ class MainProcess:
self.num_processes = num_processes self.num_processes = num_processes
self.reprocess = reprocess self.reprocess = reprocess
self.verbose = verbose self.verbose = verbose
self.max_docs = max_docs
def initialize(self): def initialize(self):
"""Slower initialization things: check connections, load things into memory, etc.""" """Slower initialization things: check connections, load things into memory, etc."""
@ -63,6 +65,10 @@ class MainProcess:
def _filter_docs_with_outputs(self, docs): def _filter_docs_with_outputs(self, docs):
num_docs_all = len(docs) num_docs_all = len(docs)
docs = [doc for doc in docs if not doc.has_output()] docs = [doc for doc in docs if not doc.has_output()]
if self.max_docs is not None:
if num_docs_all > self.max_docs:
num_docs_all = self.max_docs
docs = docs[: self.max_docs]
num_docs_to_process = len(docs) num_docs_to_process = len(docs)
if num_docs_to_process == 0: if num_docs_to_process == 0:
logger.info( logger.info(
@ -103,6 +109,12 @@ class MainProcess:
@click.command() @click.command()
@click.option(
"--max-docs",
default=None,
type=int,
help="If specified, process at most specified number of documents.",
)
@click.option( @click.option(
"--flatten-metadata", "--flatten-metadata",
is_flag=True, is_flag=True,
@ -354,6 +366,7 @@ def main(
metadata_exclude, metadata_exclude,
fields_include, fields_include,
flatten_metadata, flatten_metadata,
max_docs,
): ):
if flatten_metadata and "metadata" not in fields_include: if flatten_metadata and "metadata" not in fields_include:
logger.warning( logger.warning(
@ -610,6 +623,7 @@ def main(
num_processes=num_processes, num_processes=num_processes,
reprocess=reprocess, reprocess=reprocess,
verbose=verbose, verbose=verbose,
max_docs=max_docs,
).run() ).run()