2024-02-26 11:17:06 -08:00
|
|
|
import base64
|
2023-05-03 15:06:06 -04:00
|
|
|
import contextlib
|
|
|
|
import json
|
2023-04-26 09:05:35 -04:00
|
|
|
import os
|
|
|
|
import pathlib
|
2024-07-02 11:42:03 -05:00
|
|
|
from typing import Any
|
2023-04-26 09:05:35 -04:00
|
|
|
|
|
|
|
import pytest
|
|
|
|
import requests
|
2023-12-01 12:49:59 -06:00
|
|
|
from unstructured_client.general import General
|
2024-07-02 11:42:03 -05:00
|
|
|
from unstructured_client.models import shared
|
|
|
|
from unstructured_client.models.shared import PartitionParameters
|
2023-04-26 09:05:35 -04:00
|
|
|
|
2024-02-26 11:17:06 -08:00
|
|
|
from unstructured.documents.elements import ElementType, NarrativeText
|
2023-05-03 15:06:06 -04:00
|
|
|
from unstructured.partition.api import partition_multiple_via_api, partition_via_api
|
2023-04-26 09:05:35 -04:00
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
from ..unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
|
2023-04-26 09:05:35 -04:00
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
# Resolved path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
2023-06-16 17:52:13 -07:00
|
|
|
|
2023-06-29 10:31:01 -07:00
|
|
|
# True when the `CI` environment variable is unset, empty, or one of the falsy markers below;
# used as a `skipif` condition for tests that should only run in CI.
skip_outside_ci = os.environ.get("CI", "").lower() in ("", "false", "f", "0")
|
2023-06-30 09:44:46 -05:00
|
|
|
# True unless `GITHUB_REF_NAME` is "main" (case-insensitive); used as a `skipif` condition for
# tests that should only run on the main branch.
skip_not_on_main = os.environ.get("GITHUB_REF_NAME", "").lower() != "main"
|
2023-06-29 10:31:01 -07:00
|
|
|
|
2023-12-01 12:49:59 -06:00
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_via_api_with_filename_correctly_calls_sdk(
    request: FixtureRequest, expected_call_: list[Any]
):
    """Partitioning by `filename` makes exactly one SDK call with the expected arguments."""
    sdk_partition_ = method_mock(
        request,
        General,
        "partition",
        return_value=FakeResponse(status_code=200),
    )
    eml_path = example_doc_path("eml/fake-email.eml")

    elements = partition_via_api(filename=eml_path)

    sdk_partition_.assert_called_once_with(*expected_call_)
    # -- first positional argument of the recorded call must be a `General` instance --
    recorded_call = sdk_partition_.call_args_list[0]
    assert isinstance(recorded_call.args[0], General)
    assert len(elements) == 1
    only_element = elements[0]
    assert only_element == NarrativeText("This is a test email to use for unit tests.")
    assert only_element.metadata.filetype == "message/rfc822"
|
2023-04-26 09:05:35 -04:00
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_via_api_with_file_correctly_calls_sdk(
    request: FixtureRequest, expected_call_: list[Any]
):
    """Partitioning an open file makes exactly one SDK call carrying that file object."""
    sdk_partition_ = method_mock(
        request,
        General,
        "partition",
        return_value=FakeResponse(status_code=200),
    )
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as f:
        elements = partition_via_api(file=f, metadata_filename=eml_path)

    # -- the fixture holds the file's bytes as `content`; swap in the file object so the
    # -- expectation matches the format passed to partition_via_api
    adjusted_call = list(expected_call_)
    adjusted_call[1].files.content = f

    sdk_partition_.assert_called_once_with(*adjusted_call)
    # -- first positional argument of the recorded call must be a `General` instance --
    recorded_call = sdk_partition_.call_args_list[0]
    assert isinstance(recorded_call.args[0], General)
    assert len(elements) == 1
    only_element = elements[0]
    assert only_element == NarrativeText("This is a test email to use for unit tests.")
    assert only_element.metadata.filetype == "message/rfc822"
|
2023-04-26 09:05:35 -04:00
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_via_api_warns_with_file_and_filename_and_calls_sdk(
    request: FixtureRequest, expected_call_: list[Any], caplog: pytest.LogCaptureFixture
):
    """Passing `file_filename` still calls the SDK once and logs a deprecation warning."""
    sdk_partition_ = method_mock(
        request,
        General,
        "partition",
        return_value=FakeResponse(status_code=200),
    )
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as f:
        partition_via_api(file=f, file_filename=eml_path)

    # -- the fixture holds the file's bytes as `content`; swap in the file object so the
    # -- expectation matches the format passed to partition_via_api
    adjusted_call = list(expected_call_)
    adjusted_call[1].files.content = f

    sdk_partition_.assert_called_once_with(*adjusted_call)
    captured_log = caplog.text
    assert "WARNING" in captured_log
    assert "The file_filename kwarg will be deprecated" in captured_log
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_via_api_from_file_raises_with_metadata_and_file_and_filename():
    """Supplying both `file_filename` and `metadata_filename` raises `ValueError`."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as f:
        with pytest.raises(ValueError):
            partition_via_api(file=f, file_filename=eml_path, metadata_filename=eml_path)
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_via_api_from_file_raises_without_filename():
    """Passing only `file`, with no filename of any kind, raises `ValueError`."""
    eml_path = example_doc_path("eml/fake-email.eml")

    with open(eml_path, "rb") as f:
        with pytest.raises(ValueError):
            partition_via_api(file=f)
|
2023-05-31 14:09:58 -04:00
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_via_api_raises_with_bad_response(request: FixtureRequest):
    """A response with status 500 raises `ValueError` after exactly one SDK call."""
    sdk_partition_ = method_mock(
        request,
        General,
        "partition",
        return_value=FakeResponse(status_code=500),
    )
    eml_path = example_doc_path("eml/fake-email.eml")

    with pytest.raises(ValueError):
        partition_via_api(filename=eml_path)

    sdk_partition_.assert_called_once()
|
2023-05-03 15:06:06 -04:00
|
|
|
|
|
|
|
|
2023-10-24 17:17:54 -05:00
|
|
|
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_with_no_strategy():
    """`auto` and `hi_res` strategies yield different output for the same PDF (hosted API)."""

    def partition_with(strategy: str):
        """Partition the sample PDF with `strategy`; all other arguments are held constant."""
        return partition_via_api(
            filename=example_doc_path("layout-parser-paper-fast.pdf"),
            strategy=strategy,
            api_key=get_api_key(),
            # The url has changed since the 06/24 API release while the sdk defaults to the old url
            api_url="https://api.unstructuredapp.io/general/v0/general",
            # -- set explicitly: this main-only test was failing, likely due to the change in
            # -- the default behavior for `skip_infer_table_types` (#3057)
            skip_infer_table_types=["pdf"],
        )

    elements_no_strategy = partition_with("auto")
    elements_hi_res = partition_with("hi_res")

    # confirm that hi_res strategy was not passed as default to partition by comparing outputs
    # elements_hi_res[3].text =
    #     'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis'
    # while elements_no_strategy[3].text = ']' (as of this writing)
    auto_text = elements_no_strategy[3].text
    hi_res_text = elements_hi_res[3].text
    assert auto_text != hi_res_text
|
2023-07-26 09:56:39 -07:00
|
|
|
|
|
|
|
|
2023-07-31 19:55:43 -07:00
|
|
|
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates():
    """Requesting `coordinates` with `hi_res` yields coordinates on the first element."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")

    # coordinates not included by default to limit payload size
    elements = partition_via_api(
        filename=pdf_path,
        strategy="hi_res",
        coordinates="true",
        api_key=get_api_key(),
        # The url has changed since the 06/24 API release while the sdk defaults to the old url
        api_url="https://api.unstructuredapp.io/general/v0/general",
    )

    first_element = elements[0]
    assert first_element.metadata.coordinates is not None
|
|
|
|
|
|
|
|
|
2023-06-29 17:47:51 -07:00
|
|
|
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_valid_request_data_kwargs():
    """A request with a valid `strategy` keyword returns a list."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")

    result = partition_via_api(
        filename=pdf_path,
        strategy="fast",
        api_key=get_api_key(),
        # The url has changed since the 06/24 API release while the sdk defaults to the old url
        api_url="https://api.unstructuredapp.io/general/v0/general",
    )

    assert isinstance(result, list)
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_image_block_extraction():
    """Every returned Image element carries a base64 payload and a MIME type."""
    pdf_path = example_doc_path("embedded-images-tables.pdf")

    elements = partition_via_api(
        filename=pdf_path,
        strategy="hi_res",
        extract_image_block_types=["image", "table"],
        api_key=get_api_key(),
        # The url has changed since the 06/24 API release while the sdk defaults to the old url
        api_url="https://api.unstructuredapp.io/general/v0/general",
    )

    images = list(filter(lambda e: e.category == ElementType.IMAGE, elements))
    for image in images:
        metadata = image.metadata
        assert metadata.image_base64 is not None
        assert metadata.image_mime_type is not None
        decoded = base64.b64decode(metadata.image_base64)
        assert isinstance(decoded, bytes)
|
|
|
|
|
|
|
|
|
2024-01-03 15:08:48 -05:00
|
|
|
# Note(austin) - This test is way too noisy against the hosted API
|
|
|
|
# def test_partition_via_api_invalid_request_data_kwargs():
|
|
|
|
# filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")
|
|
|
|
# with pytest.raises(SDKError):
|
|
|
|
# partition_via_api(filename=filename, strategy="not_a_strategy")
|
2023-06-12 12:39:58 -04:00
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_with_single_filename(request: FixtureRequest):
    """A single filename is posted once and the response is returned as one document."""
    post_mock_ = method_mock(
        request,
        requests,
        "post",
        return_value=FakeResponse(status_code=200),
    )
    eml_path = example_doc_path("eml/fake-email.eml")

    documents = partition_multiple_via_api(filenames=[eml_path])

    expected_files = [("files", (eml_path, ANY, None))]
    post_mock_.assert_called_once_with(
        "https://api.unstructured.io/general/v0/general",
        headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY},
        data={},
        files=expected_files,
    )
    first_element = documents[0][0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_from_filenames(request: FixtureRequest):
    """Two filenames go out in a single POST and come back as two documents."""
    post_mock_ = method_mock(
        request,
        requests,
        "post",
        return_value=FakeMultipleResponse(status_code=200),
    )
    eml_path = example_doc_path("eml/fake-email.eml")
    docx_path = example_doc_path("fake.docx")

    documents = partition_multiple_via_api(filenames=[eml_path, docx_path])

    expected_files = [
        ("files", (eml_path, ANY, None)),
        ("files", (docx_path, ANY, None)),
    ]
    post_mock_.assert_called_once_with(
        "https://api.unstructured.io/general/v0/general",
        headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY},
        data={},
        files=expected_files,
    )
    assert len(documents) == 2
    first_element = documents[0][0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
|
2023-05-03 15:06:06 -04:00
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_from_files(request: FixtureRequest):
    """Two open files with `metadata_filenames` go out in one POST and yield two documents."""
    post_mock_ = method_mock(
        request,
        requests,
        "post",
        return_value=FakeMultipleResponse(status_code=200),
    )
    eml_path = example_doc_path("eml/fake-email.eml")
    docx_path = example_doc_path("fake.docx")
    paths = [eml_path, docx_path]

    with contextlib.ExitStack() as stack:
        open_files = [stack.enter_context(open(path, "rb")) for path in paths]
        documents = partition_multiple_via_api(files=open_files, metadata_filenames=paths)

    expected_files = [
        ("files", (eml_path, ANY, None)),
        ("files", (docx_path, ANY, None)),
    ]
    post_mock_.assert_called_once_with(
        "https://api.unstructured.io/general/v0/general",
        headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY},
        data={},
        files=expected_files,
    )
    assert len(documents) == 2
    first_element = documents[0][0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
|
2023-05-03 15:06:06 -04:00
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_warns_with_file_filename(
    caplog: pytest.LogCaptureFixture, request: FixtureRequest
):
    """Passing `file_filenames` still posts once and logs a deprecation warning."""
    post_mock_ = method_mock(
        request,
        requests,
        "post",
        return_value=FakeMultipleResponse(status_code=200),
    )
    eml_path = example_doc_path("eml/fake-email.eml")
    docx_path = example_doc_path("fake.docx")
    paths = [eml_path, docx_path]

    with contextlib.ExitStack() as stack:
        open_files = [stack.enter_context(open(path, "rb")) for path in paths]
        partition_multiple_via_api(files=open_files, file_filenames=paths)

    expected_files = [
        ("files", (eml_path, ANY, None)),
        ("files", (docx_path, ANY, None)),
    ]
    post_mock_.assert_called_once_with(
        "https://api.unstructured.io/general/v0/general",
        headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY},
        data={},
        files=expected_files,
    )
    captured_log = caplog.text
    assert "WARNING" in captured_log
    assert "The file_filenames kwarg will be deprecated" in captured_log
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_raises_with_file_and_metadata_filename():
    """Supplying both `metadata_filenames` and `file_filenames` raises `ValueError`."""
    paths = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")]

    with contextlib.ExitStack() as stack:
        open_files = [stack.enter_context(open(path, "rb")) for path in paths]
        with pytest.raises(ValueError):
            partition_multiple_via_api(
                files=open_files, metadata_filenames=paths, file_filenames=paths
            )
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_raises_with_bad_response(request: FixtureRequest):
    """A response with status 500 raises `ValueError` after exactly one POST."""
    post_mock_ = method_mock(
        request,
        requests,
        "post",
        return_value=FakeMultipleResponse(status_code=500),
    )
    eml_path = example_doc_path("eml/fake-email.eml")
    docx_path = example_doc_path("fake.docx")

    with pytest.raises(ValueError):
        partition_multiple_via_api(filenames=[eml_path, docx_path])

    expected_files = [
        ("files", (eml_path, ANY, None)),
        ("files", (docx_path, ANY, None)),
    ]
    post_mock_.assert_called_once_with(
        "https://api.unstructured.io/general/v0/general",
        headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY},
        data={},
        files=expected_files,
    )
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
|
|
|
|
def test_partition_multiple_via_api_raises_with_content_types_size_mismatch():
    """Two filenames with only one content type raise `ValueError`."""
    paths = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")]
    too_few_content_types = ["text/plain"]

    with pytest.raises(ValueError):
        partition_multiple_via_api(filenames=paths, content_types=too_few_content_types)
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_from_files_raises_with_size_mismatch():
    """Two open files with only one content type raise `ValueError`."""
    paths = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")]
    too_few_content_types = ["text/plain"]

    with contextlib.ExitStack() as stack:
        open_files = [stack.enter_context(open(path, "rb")) for path in paths]
        with pytest.raises(ValueError):
            partition_multiple_via_api(
                files=open_files,
                metadata_filenames=paths,
                content_types=too_few_content_types,
            )
|
|
|
|
|
|
|
|
|
2024-07-02 11:42:03 -05:00
|
|
|
def test_partition_multiple_via_api_from_files_raises_without_filenames():
    """Passing only `files`, with no filenames of any kind, raises `ValueError`."""
    paths = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")]

    with contextlib.ExitStack() as stack:
        open_files = [stack.enter_context(open(path, "rb")) for path in paths]
        with pytest.raises(ValueError):
            partition_multiple_via_api(files=open_files)
|
2023-06-12 12:39:58 -04:00
|
|
|
|
|
|
|
|
2023-06-29 10:31:01 -07:00
|
|
|
def get_api_key() -> str:
    """Return the API key read from the `UNS_API_KEY` environment variable.

    Raises:
        ValueError: if `UNS_API_KEY` is unset or set to an empty string. An empty value is
            rejected here so it is reported as a missing key rather than being passed on to
            the API as if it were a real one.
    """
    api_key = os.getenv("UNS_API_KEY")
    # -- `not api_key` covers both `None` (unset) and "" (set but blank) --
    if not api_key:
        raise ValueError("UNS_API_KEY environment variable not set")
    return api_key
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_multiple_via_api_valid_request_data_kwargs():
    """A multi-file request with a valid `strategy` keyword returns a list."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    jpg_path = example_doc_path("layout-parser-paper-fast.jpg")

    result = partition_multiple_via_api(
        filenames=[pdf_path, jpg_path],
        strategy="auto",
        api_key=get_api_key(),
        # The url has changed since the 06/24 API release while the sdk defaults to the old url
        api_url="https://api.unstructuredapp.io/general/v0/general",
    )

    assert isinstance(result, list)
|
|
|
|
|
|
|
|
|
2023-06-29 10:31:01 -07:00
|
|
|
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
def test_partition_multiple_via_api_invalid_request_data_kwargs():
    """A multi-file request with an unknown `strategy` value raises `ValueError`."""
    pdf_path = example_doc_path("layout-parser-paper-fast.pdf")
    jpg_path = example_doc_path("layout-parser-paper-fast.jpg")
    paths = [pdf_path, jpg_path]

    with pytest.raises(ValueError):
        partition_multiple_via_api(
            filenames=paths,
            strategy="not_a_strategy",
            api_key=get_api_key(),
            # The url has changed since the 06/24 API release while the sdk defaults to the old url
            api_url="https://api.unstructuredapp.io/general/v0/general",
        )
|
2024-07-02 11:42:03 -05:00
|
|
|
|
|
|
|
|
|
|
|
# JSON text of a fake partition response: a list holding one NarrativeText element with
# email metadata. It must stay valid JSON because `FakeResponse.json()` parses it.
MOCK_TEXT = """[
    {
        "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
        "text": "This is a test email to use for unit tests.",
        "type": "NarrativeText",
        "metadata": {
            "sent_from": [
                "Matthew Robinson <mrobinson@unstructured.io>"
            ],
            "sent_to": [
                "Matthew Robinson <mrobinson@unstructured.io>"
            ],
            "subject": "Test Email",
            "filename": "fake-email.eml",
            "filetype": "message/rfc822"
        }
    }
]"""
|
|
|
|
|
|
|
|
|
|
|
|
class FakeResponse:
    """Fake API response whose body is `MOCK_TEXT`, with a caller-chosen status code."""

    def __init__(self, status_code: int):
        self.status_code = status_code
        self.headers = {"Content-Type": "application/json"}
        # The string representation of partitioned elements is nested in an additional
        # layer in the new unstructured-client:
        #     `elements_from_json(text=response.raw_response.text)`
        self.raw_response = FakeRawResponse()

    @property
    def text(self) -> str:
        """Response body as JSON text."""
        return MOCK_TEXT

    def json(self):
        """Response body parsed from `text`."""
        body = self.text
        return json.loads(body)
|
|
|
|
|
|
|
|
|
|
|
|
class FakeRawResponse:
    """Fake for the `raw_response` attribute of a response; exposes `MOCK_TEXT` as `text`."""

    def __init__(self) -> None:
        self.text = MOCK_TEXT
|
|
|
|
|
|
|
|
|
|
|
|
class FakeMultipleResponse:
    """Fake response for a multi-file request: a JSON list of two single-element documents.

    Both documents carry the same NarrativeText element. `text` must stay valid JSON because
    `json()` parses it.
    """

    def __init__(self, status_code: int):
        self.status_code = status_code

    def json(self):
        """Response body parsed from `text`."""
        return json.loads(self.text)

    @property
    def text(self) -> str:
        """Response body as JSON text."""
        return """[
    [
        {
            "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
            "text": "This is a test email to use for unit tests.",
            "type": "NarrativeText",
            "metadata": {
                "sent_from": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "sent_to": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "subject": "Test Email",
                "filename": "fake-email.eml",
                "filetype": "message/rfc822"
            }
        }
    ],
    [
        {
            "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
            "text": "This is a test email to use for unit tests.",
            "type": "NarrativeText",
            "metadata": {
                "sent_from": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "sent_to": [
                    "Matthew Robinson <mrobinson@unstructured.io>"
                ],
                "subject": "Test Email",
                "filename": "fake-email.eml",
                "filetype": "message/rfc822"
            }
        }
    ]
]"""
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture()
def expected_call_():
    """Expected positional arguments of the mocked `General.partition` call.

    The list is `[ANY, PartitionParameters(...)]`: a placeholder for the first argument,
    followed by the parameters, whose file content is the bytes of the example email.
    """
    eml_path = example_doc_path("eml/fake-email.eml")
    file_bytes = pathlib.Path(eml_path).read_bytes()

    expected_parameters = PartitionParameters(
        files=shared.Files(content=file_bytes, file_name=eml_path),
        chunking_strategy=None,
        combine_under_n_chars=None,
        coordinates=False,
        encoding=None,
        extract_image_block_types=None,
        gz_uncompressed_content_type=None,
        hi_res_model_name=None,
        include_orig_elements=None,
        include_page_breaks=False,
        languages=None,
        max_characters=None,
        multipage_sections=True,
        new_after_n_chars=None,
        ocr_languages=None,
        output_format=shared.OutputFormat.APPLICATION_JSON,
        overlap=0,
        overlap_all=False,
        pdf_infer_table_structure=True,
        similarity_threshold=None,
        skip_infer_table_types=None,
        split_pdf_concurrency_level=5,
        split_pdf_page=True,
        starting_page_number=None,
        strategy=shared.Strategy.AUTO,
        unique_element_ids=False,
        xml_keep_tags=False,
    )
    return [ANY, expected_parameters]
|