* Many command line options added. The sample ingest project is now an easy-to-use CLI (no code editing necessary), capable of processing large numbers of files from S3 in a re-entrant manner. See Ingest.md.
* Fixes issue where test fixtures had been truncated.
* Adds a check to make sure this doesn't happen again.
* Moves fixture outputs for the existing connector one subdir lower, to make room for future connector outputs.
110 lines · 4.6 KiB · Python
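
Example invocation (the script path is illustrative; substitute wherever main.py sits in your checkout):

    python main.py --anonymous --num-processes 4

With no other options, this downloads the default public fixture set (s3://utic-dev-tech-fixtures/small-pdf-set/) into a tmp-ingest-<6 random chars> directory and writes one structured .json file per document under ./structured-output.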
import multiprocessing as mp
import os
import random
import string

import click

from unstructured.ingest.connector.s3_connector import S3Connector, SimpleS3Config
from unstructured.ingest.doc_processor.generalized import initialize, process_document


class MainProcess:

    def __init__(self, doc_connector, doc_processor_fn, num_processes, reprocess):
        # initialize the reader and writer
        self.doc_connector = doc_connector
        self.doc_processor_fn = doc_processor_fn
        self.num_processes = num_processes
        self.reprocess = reprocess

    def initialize(self):
        """Slower initialization things: check connections, load things into memory, etc."""
        initialize()

    def cleanup(self):
        self.doc_connector.cleanup()

    def _filter_docs_with_outputs(self, docs):
        num_docs_all = len(docs)
        docs = [doc for doc in docs if not doc.has_output()]
        num_docs_to_process = len(docs)
        if num_docs_to_process == 0:
            print("All docs have structured outputs, nothing to do. Use --reprocess to process all.")
            return None
        elif num_docs_to_process != num_docs_all:
            print(f"Skipping processing for {num_docs_all - num_docs_to_process} docs out of "
                  f"{num_docs_all} since their structured outputs already exist, use --reprocess to "
                  "reprocess those in addition to the unprocessed ones.")
        return docs

    def run(self):
        self.initialize()

        # fetch the list of lazy downloading IngestDoc obj's
        docs = self.doc_connector.fetch_docs()

        # remove docs that have already been processed
        if not self.reprocess:
            docs = self._filter_docs_with_outputs(docs)
            if not docs:
                return

        # Debugging tip: use the below line and comment out the mp.Pool loop
        # block to remain in single process
        # self.doc_processor_fn(docs[0])

        # process the docs in parallel across the worker pool
        with mp.Pool(processes=self.num_processes) as pool:
            results = pool.map(self.doc_processor_fn, docs)

        self.cleanup()


@click.command()
@click.option('--s3-url', default="s3://utic-dev-tech-fixtures/small-pdf-set/",
              help="Prefix of s3 objects (files) to download. E.g. s3://bucket1/path/. "
                   "This value may also be a single file.")
@click.option('--re-download/--no-re-download', default=False,
              help="Re-download files from s3 even if they are already present in --download-dir.")
@click.option('--download-dir',
              help="Where s3 files are downloaded to, defaults to tmp-ingest-<6 random chars>.")
@click.option('--preserve-downloads', is_flag=True, default=False,
              help="Preserve downloaded s3 files. Otherwise each file is removed after being "
                   "processed successfully.")
@click.option('--structured-output-dir', default="structured-output",
              help="Where to place structured output .json files.")
@click.option('--reprocess', is_flag=True, default=False,
              help="Reprocess a downloaded file from s3 even if the relevant structured output "
                   ".json file in --structured-output-dir already exists.")
@click.option('--num-processes', default=2, show_default=True,
              help="Number of parallel processes to process docs in.")
@click.option('--anonymous', is_flag=True, default=False,
              help="Connect to s3 without local AWS credentials.")
@click.option('-v', '--verbose', is_flag=True, default=False)
def main(s3_url, re_download, download_dir, preserve_downloads, structured_output_dir,
         reprocess, num_processes, anonymous, verbose):
    if not preserve_downloads and download_dir:
        print("Warning: not preserving downloaded s3 files but --download-dir is specified")
    if not download_dir:
        download_dir = "tmp-ingest-" + "".join(
            random.choice(string.ascii_letters) for _ in range(6)
        )

    doc_connector = S3Connector(
        config=SimpleS3Config(
            download_dir=download_dir,
            s3_url=s3_url,
            output_dir=structured_output_dir,
            # set to False to use your AWS creds (not needed for this public s3 url)
            anonymous=anonymous,
            re_download=re_download,
            preserve_downloads=preserve_downloads,
            verbose=verbose,
        ),
    )

    MainProcess(doc_connector=doc_connector,
                doc_processor_fn=process_document,
                num_processes=num_processes,
                reprocess=reprocess,
                ).run()


if __name__ == '__main__':
    main()
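
The run is re-entrant: unless --reprocess is passed, _filter_docs_with_outputs drops every doc whose structured .json output already exists, so re-running the same command after an interruption only processes the remainder. A plausible resume sequence (flags illustrative, assuming the script is invoked as main.py):

    python main.py --anonymous --preserve-downloads --download-dir tmp-ingest-cache
    # interrupted? run the identical command again; docs with existing outputs are skipped
    python main.py --anonymous --preserve-downloads --download-dir tmp-ingest-cache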