mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-22 06:51:01 +00:00
feat: add --max-docs parameter to unstructured-ingest (#402)
* Added `--max-docs` parameter to `unstructured-ingest`
This commit is contained in:
parent
65fec954ba
commit
1da40806da
@ -1,4 +1,4 @@
|
||||
## 0.5.8-dev1
|
||||
## 0.5.8-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
* Added `--max-docs` parameter to `unstructured-ingest` to process at most the specified number of documents.
|
||||
* Added `partition_msg` for processing MSFT Outlook .msg files.
|
||||
|
||||
### Fixes
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.5.8-dev1" # pragma: no cover
|
||||
__version__ = "0.5.8-dev2" # pragma: no cover
|
||||
|
@ -43,6 +43,7 @@ class MainProcess:
|
||||
num_processes,
|
||||
reprocess,
|
||||
verbose,
|
||||
max_docs,
|
||||
):
|
||||
# initialize the reader and writer
|
||||
self.doc_connector = doc_connector
|
||||
@ -50,6 +51,7 @@ class MainProcess:
|
||||
self.num_processes = num_processes
|
||||
self.reprocess = reprocess
|
||||
self.verbose = verbose
|
||||
self.max_docs = max_docs
|
||||
|
||||
def initialize(self):
|
||||
"""Slower initialization things: check connections, load things into memory, etc."""
|
||||
@ -63,6 +65,10 @@ class MainProcess:
|
||||
def _filter_docs_with_outputs(self, docs):
|
||||
num_docs_all = len(docs)
|
||||
docs = [doc for doc in docs if not doc.has_output()]
|
||||
if self.max_docs is not None:
|
||||
if num_docs_all > self.max_docs:
|
||||
num_docs_all = self.max_docs
|
||||
docs = docs[: self.max_docs]
|
||||
num_docs_to_process = len(docs)
|
||||
if num_docs_to_process == 0:
|
||||
logger.info(
|
||||
@ -103,6 +109,12 @@ class MainProcess:
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--max-docs",
|
||||
default=None,
|
||||
type=int,
|
||||
help="If specified, process at most specified number of documents.",
|
||||
)
|
||||
@click.option(
|
||||
"--flatten-metadata",
|
||||
is_flag=True,
|
||||
@ -354,6 +366,7 @@ def main(
|
||||
metadata_exclude,
|
||||
fields_include,
|
||||
flatten_metadata,
|
||||
max_docs,
|
||||
):
|
||||
if flatten_metadata and "metadata" not in fields_include:
|
||||
logger.warning(
|
||||
@ -610,6 +623,7 @@ def main(
|
||||
num_processes=num_processes,
|
||||
reprocess=reprocess,
|
||||
verbose=verbose,
|
||||
max_docs=max_docs,
|
||||
).run()
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user