diff --git a/CHANGELOG.md b/CHANGELOG.md index bb2c30685..e199c99c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## 0.6.3-dev0 + +### Enhancements + + +### Features + +* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST + API call. + +### Fixes + + ## 0.6.2 ### Enhancements diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 5bcfc82b6..24d7aeaf1 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -162,6 +162,45 @@ Examples: elements = partition_via_api(file=f, file_filename=filename, api_key="MY_API_KEY") +``partition_multiple_via_api`` +------------------------------ + +``partition_multiple_via_api`` is similar to ``partition_via_api``, but allows you to partition +multiple documents in a single REST API call. The result has the type ``List[List[Element]]``, +for example: + +.. code:: python + + [ + [NarrativeText("Narrative!"), Title("Title!")], + [NarrativeText("Narrative!"), Title("Title!")] + ] + +Examples: + +.. code:: python + + from unstructured.partition.api import partition_multiple_via_api + + filenames = ["example-docs/fake-email.eml", "example-docs/fake.docx"] + + documents = partition_multiple_via_api(filenames=filenames) + + +.. 
class MockMultipleResponse:
    """Minimal stand-in for ``requests.Response`` used by the multi-document tests.

    The payload is a JSON array holding two documents, each with a single
    ``NarrativeText`` element, which is the shape the tests below expect back
    from a batched API call.
    """

    def __init__(self, status_code):
        self.status_code = status_code

    def json(self):
        return json.loads(self.text)

    @property
    def text(self):
        return """[
    [
        {
            "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
            "text": "This is a test email to use for unit tests.",
            "type": "NarrativeText",
            "metadata": {
                "date": "2022-12-16T17:04:16-05:00",
                "sent_from": [
                    "Matthew Robinson "
                ],
                "sent_to": [
                    "Matthew Robinson "
                ],
                "subject": "Test Email",
                "filename": "fake-email.eml"
            }
        }
    ],
    [
        {
            "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
            "text": "This is a test email to use for unit tests.",
            "type": "NarrativeText",
            "metadata": {
                "date": "2022-12-16T17:04:16-05:00",
                "sent_from": [
                    "Matthew Robinson "
                ],
                "sent_to": [
                    "Matthew Robinson "
                ],
                "subject": "Test Email",
                "filename": "fake-email.eml"
            }
        }
    ]
]"""


def _example_filenames():
    """Return the paths of the two sample documents shared by the tests below."""
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    return [
        os.path.join(example_docs, "fake-email.eml"),
        os.path.join(example_docs, "fake.docx"),
    ]


def _mock_api_response(monkeypatch, status_code):
    """Patch ``requests.post`` so every call returns a canned response."""
    monkeypatch.setattr(
        requests,
        "post",
        lambda *args, **kwargs: MockMultipleResponse(status_code=status_code),
    )


def test_partition_multiple_via_api_from_filenames(monkeypatch):
    _mock_api_response(monkeypatch, status_code=200)

    elements = partition_multiple_via_api(filenames=_example_filenames(), api_key="FAKEROO")
    assert len(elements) == 2
    assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")


def test_partition_multiple_via_api_from_files(monkeypatch):
    _mock_api_response(monkeypatch, status_code=200)
    filenames = _example_filenames()

    with contextlib.ExitStack() as stack:
        files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
        elements = partition_multiple_via_api(
            files=files,
            file_filenames=filenames,
            api_key="FAKEROO",
        )
    assert len(elements) == 2
    assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.")


def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
    _mock_api_response(monkeypatch, status_code=500)

    with pytest.raises(ValueError, match="unexpected status code"):
        partition_multiple_via_api(filenames=_example_filenames(), api_key="FAKEROO")


def test_partition_multiple_via_api_raises_with_content_types_size_mismatch(monkeypatch):
    # Mock a *successful* response: the only ValueError left to raise is the
    # length validation, so the test cannot pass via the bad-status path.
    _mock_api_response(monkeypatch, status_code=200)

    with pytest.raises(ValueError, match="content_types and filenames"):
        partition_multiple_via_api(
            filenames=_example_filenames(),
            content_types=["text/plain"],
            api_key="FAKEROO",
        )


def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeypatch):
    _mock_api_response(monkeypatch, status_code=200)
    filenames = _example_filenames()

    with contextlib.ExitStack() as stack:
        files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
        with pytest.raises(ValueError, match="content_types and files"):
            partition_multiple_via_api(
                files=files,
                file_filenames=filenames,
                content_types=["text/plain"],
                api_key="FAKEROO",
            )


def test_partition_multiple_via_api_from_files_raises_without_filenames(monkeypatch):
    _mock_api_response(monkeypatch, status_code=200)
    filenames = _example_filenames()

    with contextlib.ExitStack() as stack:
        files = [stack.enter_context(open(filename, "rb")) for filename in filenames]
        with pytest.raises(ValueError, match="file_filenames must be specified"):
            partition_multiple_via_api(
                files=files,
                api_key="FAKEROO",
            )
def partition_multiple_via_api(
    filenames: Optional[List[str]] = None,
    content_types: Optional[List[str]] = None,
    files: Optional[List[IO]] = None,
    file_filenames: Optional[List[str]] = None,
    strategy: str = "hi_res",
    api_url: str = "https://api.unstructured.io/general/v0/general",
    api_key: str = "",
) -> List[List[Element]]:
    """Partitions multiple documents using the Unstructured REST API by batching
    the documents into a single HTTP request.

    See https://api.unstructured.io/general/docs for the hosted API documentation or
    https://github.com/Unstructured-IO/unstructured-api for instructions on how to run
    the API locally as a container.

    Parameters
    ----------
    filenames
        A list of strings defining the target filename paths. Takes precedence over
        ``files`` when both are given.
    content_types
        A list of strings defining the file contents in MIME types, one per document.
        ``None`` or an empty list sends no explicit content type.
    files
        A list of file-like objects using "rb" mode --> open(filename, "rb"). The
        caller keeps ownership of these objects; they are not closed here.
    file_filenames
        When files is not None, the filenames (strings) to send along with each
        file, e.g. "foo.txt". Must have the same length as ``files``.
    strategy
        The strategy to use for partitioning the PDF. Uses a layout detection model if set
        to 'hi_res', otherwise partition_pdf simply extracts the text from the document
        and processes it.
    api_url
        The URL for the Unstructured API. Defaults to the hosted Unstructured API.
    api_key
        The API key to pass to the Unstructured API.

    Returns
    -------
    One list of elements per document, in the order the API returned them.

    Raises
    ------
    ValueError
        If neither ``filenames`` nor ``files`` is given, if the per-document lists
        differ in length, if ``files`` is passed without ``file_filenames``, or if
        the API answers with a status code other than 200.
    """
    headers = {
        "ACCEPT": "application/json",
        "UNSTRUCTURED-API-KEY": api_key,
    }
    data = {
        "strategy": strategy,
    }

    # The stack only tracks files opened here from ``filenames``; caller-supplied
    # file objects are never registered, so they stay open for the caller.
    with contextlib.ExitStack() as stack:
        names: List[str]
        handles: List[IO]

        if filenames is not None:
            if content_types and len(content_types) != len(filenames):
                raise ValueError("content_types and filenames must have the same length.")
            names = filenames
            handles = [stack.enter_context(open(name, "rb")) for name in filenames]
        elif files is not None:
            if content_types and len(content_types) != len(files):
                raise ValueError("content_types and files must have the same length.")
            if not file_filenames:
                raise ValueError("file_filenames must be specified if files are passed")
            if len(file_filenames) != len(files):
                raise ValueError("file_filenames and files must have the same length.")
            names = file_filenames
            handles = files
        else:
            # Without this guard the code below would fail on an unbound name.
            raise ValueError("Either filenames or files must be specified.")

        # An empty content_types list passes the length checks above, so treat
        # it like None instead of indexing into it.
        types: List[Optional[str]] = (
            list(content_types) if content_types else [None] * len(names)
        )

        multipart = [
            ("files", (name, handle, content_type))
            for name, handle, content_type in zip(names, handles, types)
        ]

        # The request must be sent while the files are still open.
        response = requests.post(
            api_url,
            headers=headers,
            data=data,
            files=multipart,  # type: ignore
        )

    if response.status_code != 200:
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    return [partition_json(text=json.dumps(document)) for document in response.json()]