feat: add --partition-by-api and --partition-endpoint to unstructured-ingest (#443)

* Add --partition-by-api and --partition-endpoint args to ingest

* Fix error in make check

* Bump changelog

* Add a test ingest script

Also add a workaround for the test causing 400s from our API. It seems we need to make sure
unstructured-api can handle receiving a file.content_type of None.

* Remove the content type workaround
This commit is contained in:
Austin Walker 2023-04-12 01:05:07 -04:00 committed by GitHub
parent ba4dadaa98
commit 4af4d33423
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 106 additions and 4 deletions

View File

@ -1,4 +1,4 @@
## 0.5.12-dev4
## 0.5.12-dev5
### Enhancements
@ -8,6 +8,7 @@
### Features
* Add --partition-by-api parameter to unstructured-ingest
* Added `partition_rtf` for processing rich text files.
### Fixes

View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Smoke test: ingest the example PDFs with --partition-by-api (partitioning through a
# remote API instead of locally) and verify that the expected number of output files
# is written.

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1

OUTPUT_DIR=api-ingest-output
EXPECTED_FILE_COUNT=2

# Fail right away if the ingest run itself fails; otherwise output left behind by an
# earlier run could satisfy the file-count check below.
if ! PYTHONPATH=. ./unstructured/ingest/main.py \
    --local-input-path example-docs \
    --local-file-glob "*.pdf" \
    --structured-output-dir "$OUTPUT_DIR" \
    --partition-by-api \
    --verbose \
    --reprocess; then
    echo
    echo "Ingest run failed."
    exit 1
fi

# Emit one '.' per regular file and count the characters. `-exec printf` replaces GNU
# find's `-printf` so the check also works with BSD/macOS find; whitespace is stripped
# because some `wc` implementations pad their output.
file_count=$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | tr -d '[:space:]')
if [ "$file_count" -ne "$EXPECTED_FILE_COUNT" ]; then
    echo
    echo "$EXPECTED_FILE_COUNT files should have been created."
    exit 1
fi

View File

@ -16,3 +16,4 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-biomed-api.sh
./test_unstructured_ingest/test-ingest-biomed-path.sh
./test_unstructured_ingest/test-ingest-local.sh
./test_unstructured_ingest/test-ingest-against-api.sh

View File

@ -1 +1 @@
__version__ = "0.5.12-dev4" # pragma: no cover
__version__ = "0.5.12-dev5" # pragma: no cover

View File

@ -51,6 +51,8 @@ class SimpleBiomedConfig(BaseConnectorConfig):
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

View File

@ -32,6 +32,8 @@ class SimpleFsspecConfig(BaseConnectorConfig):
download_only: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

View File

@ -29,6 +29,8 @@ class SimpleGitConfig(BaseConnectorConfig):
download_only: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

View File

@ -80,6 +80,8 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

View File

@ -27,6 +27,8 @@ class SimpleLocalConfig(BaseConnectorConfig):
download_only: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

View File

@ -34,6 +34,8 @@ class SimpleRedditConfig(BaseConnectorConfig):
download_only: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

View File

@ -29,6 +29,8 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
download_only: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

View File

@ -4,6 +4,8 @@ through Unstructured."""
from abc import ABC, abstractmethod
from typing import Optional
import requests
from unstructured.ingest.logger import logger
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dict
@ -51,6 +53,8 @@ class BaseConnectorConfig(ABC):
download_only: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
partition_by_api: bool = False
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False
@ -93,12 +97,34 @@ class BaseIngestDoc(ABC):
"""Write the structured json result for this doc. result must be json serializable."""
pass
def partition_file(self):
    """Partition ``self.filename`` and return the resulting elements.

    When ``config.partition_by_api`` is false the file is partitioned in-process with
    ``partition`` and the elements are converted with ``convert_to_dict``. Otherwise the
    file is uploaded to ``config.partition_endpoint`` and the decoded JSON body of the
    response is returned.

    Raises:
        RuntimeError: if the endpoint answers with a status code other than 200.
    """
    if self.config.partition_by_api:
        endpoint = self.config.partition_endpoint
        logger.debug(f"Using remote partition ({endpoint})")

        # The file is sent as a multipart upload under the "files" form field.
        with open(self.filename, "rb") as upload:
            response = requests.post(
                endpoint,
                files={"files": (str(self.filename), upload)},
            )

        if response.status_code == 200:
            return response.json()
        raise RuntimeError(f"Caught {response.status_code} from API: {response.text}")

    logger.debug("Using local partition")
    return convert_to_dict(partition(filename=str(self.filename)))
def process_file(self):
if self.config.download_only:
return
logger.info(f"Processing {self.filename}")
elements = partition(filename=str(self.filename))
isd_elems = convert_to_dict(elements)
isd_elems = self.partition_file()
self.isd_elems_no_filename = []
for elem in isd_elems:

View File

@ -129,6 +129,19 @@ class MainProcess:
"Usage: provide a single string with comma separated values. "
"Example: --metadata-exclude filename,page_number ",
)
@click.option(
"--partition-by-api",
is_flag=True,
default=False,
help="Use a remote API to partition the files."
" Otherwise, use the function from partition.auto",
)
@click.option(
"--partition-endpoint",
default="https://api.unstructured.io/general/v0/general",
help="If partitioning via api, use the following host. "
"Default: https://api.unstructured.io/general/v0/general",
)
@click.option(
"--local-input-path",
default=None,
@ -378,6 +391,8 @@ def main(
fields_include,
flatten_metadata,
max_docs,
partition_by_api,
partition_endpoint,
local_input_path,
local_recursive,
local_file_glob,
@ -399,6 +414,13 @@ def main(
"mutually exclusive with each other.",
)
sys.exit(1)
if (
not partition_by_api
and partition_endpoint != "https://api.unstructured.io/general/v0/general"
):
logger.warning(
"Ignoring --partition-endpoint because --partition-by-api was not set",
)
if (not preserve_downloads and not download_only) and download_dir:
logger.warning(
"Not preserving downloaded files but --download_dir is specified",
@ -477,6 +499,8 @@ def main(
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -507,6 +531,8 @@ def main(
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -533,6 +559,8 @@ def main(
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -557,6 +585,8 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -581,6 +611,8 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -607,6 +639,8 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -624,6 +658,8 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -648,6 +684,8 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -672,6 +710,8 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
download_only=download_only,
@ -692,6 +732,8 @@ def main(
output_dir=structured_output_dir,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
partition_by_api=partition_by_api,
partition_endpoint=partition_endpoint,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),