mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 07:03:52 +00:00
feat: add --partition-by-api and --partition-endpoint to unstructured-ingest (#443)
* Add --partition-by-api and --partition-endpoint args to ingest * Fix error in make check * Bump changelog * Add a test ingest script Also add a workaround for the test causing 400s from our API. It seems we need to make sure unstructured-api can handle getting a file.content_type of None. * Remove the content type workaround
This commit is contained in:
parent
ba4dadaa98
commit
4af4d33423
@ -1,4 +1,4 @@
|
||||
## 0.5.12-dev4
|
||||
## 0.5.12-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
* Add --partition-by-api and --partition-endpoint parameters to unstructured-ingest
|
||||
* Added `partition_rtf` for processing rich text files.
|
||||
|
||||
### Fixes
|
||||
|
||||
18
test_unstructured_ingest/test-ingest-against-api.sh
Executable file
18
test_unstructured_ingest/test-ingest-against-api.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash

# Smoke test for `--partition-by-api`: ingest the PDFs under example-docs with
# remote partitioning enabled and check that exactly two structured output
# files end up in api-ingest-output.

# Abort as soon as the ingest run itself fails, instead of going on to count
# whatever output files happen to be on disk.
set -e

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
    --local-input-path example-docs \
    --local-file-glob "*.pdf" \
    --structured-output-dir api-ingest-output \
    --partition-by-api \
    --verbose \
    --reprocess

# `find -printf` is a GNU extension that BSD/macOS find lacks, so emit one byte
# per regular file through -exec instead. Some `wc` implementations pad their
# output with blanks; strip them and compare numerically rather than as text.
file_count=$(find 'api-ingest-output' -type f -exec printf '.' \; | wc -c | tr -d '[:space:]')

if [ "$file_count" -ne 2 ]; then
    echo
    echo "2 files should have been created."
    exit 1
fi
|
||||
@ -16,3 +16,4 @@ export OMP_THREAD_LIMIT=1
|
||||
./test_unstructured_ingest/test-ingest-biomed-api.sh
|
||||
./test_unstructured_ingest/test-ingest-biomed-path.sh
|
||||
./test_unstructured_ingest/test-ingest-local.sh
|
||||
./test_unstructured_ingest/test-ingest-against-api.sh
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.12-dev4" # pragma: no cover
|
||||
__version__ = "0.5.12-dev5" # pragma: no cover
|
||||
|
||||
@ -51,6 +51,8 @@ class SimpleBiomedConfig(BaseConnectorConfig):
|
||||
preserve_downloads: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@ -32,6 +32,8 @@ class SimpleFsspecConfig(BaseConnectorConfig):
|
||||
download_only: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@ -29,6 +29,8 @@ class SimpleGitConfig(BaseConnectorConfig):
|
||||
download_only: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@ -80,6 +80,8 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
|
||||
preserve_downloads: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@ -27,6 +27,8 @@ class SimpleLocalConfig(BaseConnectorConfig):
|
||||
download_only: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@ -34,6 +34,8 @@ class SimpleRedditConfig(BaseConnectorConfig):
|
||||
download_only: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@ -29,6 +29,8 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
|
||||
download_only: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
|
||||
@ -4,6 +4,8 @@ through Unstructured."""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.partition.auto import partition
|
||||
from unstructured.staging.base import convert_to_dict
|
||||
@ -51,6 +53,8 @@ class BaseConnectorConfig(ABC):
|
||||
download_only: bool = False
|
||||
metadata_include: Optional[str] = None
|
||||
metadata_exclude: Optional[str] = None
|
||||
partition_by_api: bool = False
|
||||
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
|
||||
fields_include: str = "element_id,text,type,metadata"
|
||||
flatten_metadata: bool = False
|
||||
|
||||
@ -93,12 +97,34 @@ class BaseIngestDoc(ABC):
|
||||
"""Write the structured json result for this doc. result must be json serializable."""
|
||||
pass
|
||||
|
||||
def partition_file(self):
    """Partition ``self.filename`` and return the result as element dicts.

    With ``self.config.partition_by_api`` unset, the file is partitioned locally
    via ``partition`` and the elements are passed through ``convert_to_dict``.
    Otherwise the file is uploaded to ``self.config.partition_endpoint`` and the
    decoded JSON body of the response is returned.

    Raises:
        RuntimeError: the remote endpoint answered with a status other than 200.
    """
    # Local path first, as a guard clause, so the remote branch reads flat.
    if not self.config.partition_by_api:
        logger.debug("Using local partition")
        return convert_to_dict(partition(filename=str(self.filename)))

    endpoint = self.config.partition_endpoint
    logger.debug(f"Using remote partition ({endpoint})")

    # Upload the file as the multipart field "files"; the handle only needs to
    # stay open for the duration of the request.
    with open(self.filename, "rb") as payload:
        upload = {"files": (str(self.filename), payload)}
        response = requests.post(endpoint, files=upload)

    if response.status_code != 200:
        raise RuntimeError(f"Caught {response.status_code} from API: {response.text}")

    return response.json()
|
||||
|
||||
def process_file(self):
|
||||
if self.config.download_only:
|
||||
return
|
||||
logger.info(f"Processing {self.filename}")
|
||||
elements = partition(filename=str(self.filename))
|
||||
isd_elems = convert_to_dict(elements)
|
||||
|
||||
isd_elems = self.partition_file()
|
||||
|
||||
self.isd_elems_no_filename = []
|
||||
for elem in isd_elems:
|
||||
|
||||
@ -129,6 +129,19 @@ class MainProcess:
|
||||
"Usage: provide a single string with comma separated values. "
|
||||
"Example: --metadata-exclude filename,page_number ",
|
||||
)
|
||||
@click.option(
|
||||
"--partition-by-api",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Use a remote API to partition the files."
|
||||
" Otherwise, use the function from partition.auto",
|
||||
)
|
||||
@click.option(
|
||||
"--partition-endpoint",
|
||||
default="https://api.unstructured.io/general/v0/general",
|
||||
help="If partitioning via api, use the following host. "
|
||||
"Default: https://api.unstructured.io/general/v0/general",
|
||||
)
|
||||
@click.option(
|
||||
"--local-input-path",
|
||||
default=None,
|
||||
@ -378,6 +391,8 @@ def main(
|
||||
fields_include,
|
||||
flatten_metadata,
|
||||
max_docs,
|
||||
partition_by_api,
|
||||
partition_endpoint,
|
||||
local_input_path,
|
||||
local_recursive,
|
||||
local_file_glob,
|
||||
@ -399,6 +414,13 @@ def main(
|
||||
"mutually exclusive with each other.",
|
||||
)
|
||||
sys.exit(1)
|
||||
if (
|
||||
not partition_by_api
|
||||
and partition_endpoint != "https://api.unstructured.io/general/v0/general"
|
||||
):
|
||||
logger.warning(
|
||||
"Ignoring --partition-endpoint because --partition-by-api was not set",
|
||||
)
|
||||
if (not preserve_downloads and not download_only) and download_dir:
|
||||
logger.warning(
|
||||
"Not preserving downloaded files but --download_dir is specified",
|
||||
@ -477,6 +499,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -507,6 +531,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -533,6 +559,8 @@ def main(
|
||||
preserve_downloads=preserve_downloads,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -557,6 +585,8 @@ def main(
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -581,6 +611,8 @@ def main(
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -607,6 +639,8 @@ def main(
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -624,6 +658,8 @@ def main(
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -648,6 +684,8 @@ def main(
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -672,6 +710,8 @@ def main(
|
||||
re_download=re_download,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
download_only=download_only,
|
||||
@ -692,6 +732,8 @@ def main(
|
||||
output_dir=structured_output_dir,
|
||||
metadata_include=metadata_include,
|
||||
metadata_exclude=metadata_exclude,
|
||||
partition_by_api=partition_by_api,
|
||||
partition_endpoint=partition_endpoint,
|
||||
fields_include=fields_include,
|
||||
flatten_metadata=flatten_metadata,
|
||||
),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user