mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-23 07:23:11 +00:00
feat: add --max-docs parameter to unstructured-ingest (#402)
* added --max-docs parameter to unstructured-ingest
This commit is contained in:
parent
65fec954ba
commit
1da40806da
@@ -1,4 +1,4 @@
|
|||||||
## 0.5.8-dev1
|
## 0.5.8-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
* Add `--max-docs` parameter to `unstructured-ingest`
|
||||||
* Added `partition_msg` for processing MSFT Outlook .msg files.
|
* Added `partition_msg` for processing MSFT Outlook .msg files.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
@@ -1 +1 @@
|
|||||||
__version__ = "0.5.8-dev1" # pragma: no cover
|
__version__ = "0.5.8-dev2" # pragma: no cover
|
||||||
|
@@ -43,6 +43,7 @@ class MainProcess:
|
|||||||
num_processes,
|
num_processes,
|
||||||
reprocess,
|
reprocess,
|
||||||
verbose,
|
verbose,
|
||||||
|
max_docs,
|
||||||
):
|
):
|
||||||
# initialize the reader and writer
|
# initialize the reader and writer
|
||||||
self.doc_connector = doc_connector
|
self.doc_connector = doc_connector
|
||||||
@@ -50,6 +51,7 @@ class MainProcess:
|
|||||||
self.num_processes = num_processes
|
self.num_processes = num_processes
|
||||||
self.reprocess = reprocess
|
self.reprocess = reprocess
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
|
self.max_docs = max_docs
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
"""Slower initialization things: check connections, load things into memory, etc."""
|
"""Slower initialization things: check connections, load things into memory, etc."""
|
||||||
@@ -63,6 +65,10 @@ class MainProcess:
|
|||||||
def _filter_docs_with_outputs(self, docs):
|
def _filter_docs_with_outputs(self, docs):
|
||||||
num_docs_all = len(docs)
|
num_docs_all = len(docs)
|
||||||
docs = [doc for doc in docs if not doc.has_output()]
|
docs = [doc for doc in docs if not doc.has_output()]
|
||||||
|
if self.max_docs is not None:
|
||||||
|
if num_docs_all > self.max_docs:
|
||||||
|
num_docs_all = self.max_docs
|
||||||
|
docs = docs[: self.max_docs]
|
||||||
num_docs_to_process = len(docs)
|
num_docs_to_process = len(docs)
|
||||||
if num_docs_to_process == 0:
|
if num_docs_to_process == 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
@@ -103,6 +109,12 @@ class MainProcess:
|
|||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
|
@click.option(
|
||||||
|
"--max-docs",
|
||||||
|
default=None,
|
||||||
|
type=int,
|
||||||
|
help="If specified, process at most specified number of documents.",
|
||||||
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--flatten-metadata",
|
"--flatten-metadata",
|
||||||
is_flag=True,
|
is_flag=True,
|
||||||
@@ -354,6 +366,7 @@ def main(
|
|||||||
metadata_exclude,
|
metadata_exclude,
|
||||||
fields_include,
|
fields_include,
|
||||||
flatten_metadata,
|
flatten_metadata,
|
||||||
|
max_docs,
|
||||||
):
|
):
|
||||||
if flatten_metadata and "metadata" not in fields_include:
|
if flatten_metadata and "metadata" not in fields_include:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
@@ -610,6 +623,7 @@ def main(
|
|||||||
num_processes=num_processes,
|
num_processes=num_processes,
|
||||||
reprocess=reprocess,
|
reprocess=reprocess,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
|
max_docs=max_docs,
|
||||||
).run()
|
).run()
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user