mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-29 09:00:30 +00:00
feat: add partition_multiple_via_api function (#539)
* added function for multiple files via api * make multiple work with files * updated docs strings * changelog and version * docs and contextlib for open files * tests for partition multiple * add tests for error conditions * add output example
This commit is contained in:
parent
3c3c59a726
commit
7e43a25f07
13
CHANGELOG.md
13
CHANGELOG.md
@ -1,3 +1,16 @@
|
|||||||
|
## 0.6.3-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
|
||||||
|
API call.
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
|
||||||
## 0.6.2
|
## 0.6.2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -162,6 +162,45 @@ Examples:
|
|||||||
elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY")
|
elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY")
|
||||||
|
|
||||||
|
|
||||||
|
``partition_multiple_via_api``
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
``partition_multiple_via_api`` is similar to ``partition_via_api``, but allows you to partition
|
||||||
|
multiple documents in a single REST API call. The result has the type ``List[List[Element]]``,
|
||||||
|
for example:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
[
|
||||||
|
[NarrativeText("Narrative!"), Title("Title!")],
|
||||||
|
[NarrativeText("Narrative!"), Title("Title!")]
|
||||||
|
]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.api import partition_multiple_via_api
|
||||||
|
|
||||||
|
filenames = ["example-docs/fake-email.eml", "example-docs/fake.docx"]
|
||||||
|
|
||||||
|
documents = partition_multiple_via_api(filenames=filenames)
|
||||||
|
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from contextlib import ExitStack
|
||||||
|
|
||||||
|
from unstructured.partition.api import partition_multiple_via_api
|
||||||
|
|
||||||
|
filenames = ["example-docs/fake-email.eml", "example-docs/fake.docx"]
|
||||||
|
# ExitStack closes every file it opened once the ``with`` block exits.
|
||||||
|
|
||||||
|
with ExitStack() as stack:
|
||||||
|
files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
|
||||||
|
documents = partition_multiple_via_api(files=files, file_filenames=filenames)
|
||||||
|
|
||||||
|
|
||||||
``partition_docx``
|
``partition_docx``
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
import contextlib
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
@ -5,7 +7,7 @@ import pytest
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from unstructured.documents.elements import NarrativeText
|
from unstructured.documents.elements import NarrativeText
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_multiple_via_api, partition_via_api
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
@ -70,3 +72,172 @@ def test_partition_via_api_raises_with_bad_response(monkeypatch):
|
|||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
partition_via_api(filename=filename, api_key="FAKEROO")
|
partition_via_api(filename=filename, api_key="FAKEROO")
|
||||||
|
|
||||||
|
|
||||||
|
class MockMultipleResponse:
    """Canned stand-in for the object returned by ``requests.post``.

    Exposes only ``status_code``, ``text`` and ``json()``. The body always describes
    two documents, each holding a single NarrativeText element.
    """

    # Raw JSON body served by ``text``; both documents carry the same element.
    _BODY = """[
    [
        {
            "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
            "text": "This is a test email to use for unit tests.",
            "type": "NarrativeText",
            "metadata": {
                "date": "2022-12-16T17:04:16-05:00",
                "sent_from": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "sent_to": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "subject": "Test Email",
                "filename": "fake-email.eml"
            }
        }
    ],
    [
        {
            "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
            "text": "This is a test email to use for unit tests.",
            "type": "NarrativeText",
            "metadata": {
                "date": "2022-12-16T17:04:16-05:00",
                "sent_from": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "sent_to": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "subject": "Test Email",
                "filename": "fake-email.eml"
            }
        }
    ]
]"""

    def __init__(self, status_code):
        self.status_code = status_code

    @property
    def text(self):
        """The raw JSON body as a string."""
        return self._BODY

    def json(self):
        """The body parsed into Python lists and dicts."""
        return json.loads(self.text)
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_from_filenames(monkeypatch):
    """Partitioning by path returns one element list per document in the mocked reply."""

    def fake_post(*args, **kwargs):
        # Replaces requests.post so no network call is made.
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)

    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    filenames = [
        os.path.join(example_docs, name) for name in ("fake-email.eml", "fake.docx")
    ]

    documents = partition_multiple_via_api(filenames=filenames, api_key="FAKEROO")

    assert len(documents) == 2
    assert documents[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_from_files(monkeypatch):
    """Partitioning already-open files returns one element list per mocked document."""

    def fake_post(*args, **kwargs):
        # Replaces requests.post so no network call is made.
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)

    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    filenames = [
        os.path.join(example_docs, name) for name in ("fake-email.eml", "fake.docx")
    ]

    with contextlib.ExitStack() as stack:
        handles = [stack.enter_context(open(path, "rb")) for path in filenames]
        documents = partition_multiple_via_api(
            files=handles,
            file_filenames=filenames,
            api_key="FAKEROO",
        )

    assert len(documents) == 2
    assert documents[0][0] == NarrativeText("This is a test email to use for unit tests.")
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
    """A non-200 status code from the API is surfaced as a ValueError."""

    def failing_post(*args, **kwargs):
        # Replaces requests.post with a reply carrying a server-error status.
        return MockMultipleResponse(status_code=500)

    monkeypatch.setattr(requests, "post", failing_post)

    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    filenames = [
        os.path.join(example_docs, name) for name in ("fake-email.eml", "fake.docx")
    ]

    with pytest.raises(ValueError):
        partition_multiple_via_api(filenames=filenames, api_key="FAKEROO")
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_raises_with_content_types_size_mismatch(monkeypatch):
    """A content_types list shorter than filenames is rejected with a ValueError."""
    # The mocked API answers 200 on purpose: with a failing status code the
    # function would raise ValueError anyway and this test could never detect a
    # missing length check. With 200, only the validation can raise.
    monkeypatch.setattr(
        requests,
        "post",
        lambda *args, **kwargs: MockMultipleResponse(status_code=200),
    )

    filenames = [
        os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml"),
        os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
    ]

    with pytest.raises(ValueError):
        partition_multiple_via_api(
            filenames=filenames,
            content_types=["text/plain"],  # one entry for two files
            api_key="FAKEROO",
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeypatch):
    """With open files, a content_types list of the wrong length raises ValueError."""

    def fake_post(*args, **kwargs):
        # Replaces requests.post with a successful reply, so a ValueError here
        # cannot come from the status-code check.
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)

    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    filenames = [
        os.path.join(example_docs, name) for name in ("fake-email.eml", "fake.docx")
    ]

    with contextlib.ExitStack() as stack:
        handles = [stack.enter_context(open(path, "rb")) for path in filenames]
        with pytest.raises(ValueError):
            partition_multiple_via_api(
                files=handles,
                file_filenames=filenames,
                content_types=["text/plain"],
                api_key="FAKEROO",
            )
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_multiple_via_api_from_files_raises_without_filenames(monkeypatch):
    """Passing open files without file_filenames raises ValueError."""

    def fake_post(*args, **kwargs):
        # Replaces requests.post with a successful reply, so a ValueError here
        # cannot come from the status-code check.
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)

    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [
        os.path.join(example_docs, name) for name in ("fake-email.eml", "fake.docx")
    ]

    with contextlib.ExitStack() as stack:
        handles = [stack.enter_context(open(path, "rb")) for path in paths]
        with pytest.raises(ValueError):
            partition_multiple_via_api(
                files=handles,
                api_key="FAKEROO",
            )
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.6.2" # pragma: no cover
|
__version__ = "0.6.3-dev0" # pragma: no cover
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
import contextlib
|
||||||
|
import json
|
||||||
from typing import (
|
from typing import (
|
||||||
IO,
|
IO,
|
||||||
List,
|
List,
|
||||||
@ -81,3 +83,95 @@ def partition_via_api(
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Receive unexpected status code {response.status_code} from the API.",
|
f"Receive unexpected status code {response.status_code} from the API.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def partition_multiple_via_api(
    filenames: Optional[List[str]] = None,
    content_types: Optional[List[str]] = None,
    files: Optional[List[IO]] = None,
    file_filenames: Optional[List[str]] = None,
    strategy: str = "hi_res",
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
) -> List[List[Element]]:
    """Partitions multiple documents using the Unstructured REST API by batching
    the documents into a single HTTP request.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filenames
        A list of strings defining the target filename paths. When given, the files are
        opened here and ``files``/``file_filenames`` are ignored.
    content_types
        A list of strings defining the file contents in MIME types, one per document.
        None or an empty list sends every document without an explicit content type.
    files
        A list of file-like objects using "rb" mode --> open(filename, "rb").
    file_filenames
        When files is not None, the filename (string) sent to the API for each file,
        e.g. ["foo.txt"]. Must have the same length as files.
    strategy
        The strategy to use for partitioning the PDF. Uses a layout detection model if set
        to 'hi_res', otherwise partition_pdf simply extracts the text from the document
        and processes it.
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key to pass to the Unstructured API.

    Returns
    -------
    A list with one entry per document in the API response; each entry is the list of
    elements produced by ``partition_json`` for that document.

    Raises
    ------
    ValueError
        If neither filenames nor files is given, if content_types or file_filenames
        does not match the number of documents, if files is given without
        file_filenames, or if the API responds with a status code other than 200.
    """
    headers = {
        "ACCEPT": "application/json",
        "UNSTRUCTURED-API-KEY": api_key,
    }
    data = {"strategy": strategy}

    if filenames is not None:
        if content_types and len(content_types) != len(filenames):
            raise ValueError("content_types and filenames must have the same length.")

        # The request has to be sent while the files are still open, so it stays
        # inside the ExitStack block; the stack closes every handle afterwards,
        # including when opening a later file or the request itself fails.
        with contextlib.ExitStack() as stack:
            handles = [stack.enter_context(open(name, "rb")) for name in filenames]
            response = _post_multiple_files(
                api_url, headers, data, filenames, handles, content_types,
            )

    elif files is not None:
        if content_types and len(content_types) != len(files):
            raise ValueError("content_types and files must have the same length.")

        if not file_filenames:
            raise ValueError("file_filenames must be specified if files are passed")
        if len(file_filenames) != len(files):
            raise ValueError("file_filenames and files must have the same length.")

        response = _post_multiple_files(
            api_url, headers, data, file_filenames, files, content_types,
        )

    else:
        # Without this branch the function would fail further down with an
        # UnboundLocalError on ``response``.
        raise ValueError("Either filenames or files must be specified.")

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    # Each entry of the response is re-serialized so partition_json can rebuild
    # the elements of that one document.
    return [partition_json(text=json.dumps(document)) for document in response.json()]


def _post_multiple_files(
    api_url: str,
    headers: dict,
    data: dict,
    names: List[str],
    handles: List[IO],
    content_types: Optional[List[str]],
):
    """Sends all documents in one multipart POST, one "files" part per document."""
    # An empty list is treated like None so that it is never indexed; callers have
    # already verified that a non-empty list matches the number of documents.
    mime_types = content_types if content_types else [None] * len(names)
    multipart = [
        ("files", (name, handle, mime_type))
        for name, handle, mime_type in zip(names, handles, mime_types)
    ]
    return requests.post(api_url, headers=headers, data=data, files=multipart)  # type: ignore
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user