Matt Robinson acda4d0707
fix: set skip_infer_tables explicitly in test_partition_via_api_with_no_strategy (#3057)
### Summary

A `partition_via_api` test that only runs on `main` was
[failing](https://github.com/Unstructured-IO/unstructured/actions/runs/9159429513/job/25181600959)
with the following output, likely due to the change in the default
behavior for `skip_infer_table_types`. This PR explicitly sets the
`skip_infer_table_types` param to avoid the failure.

```python
=========================== short test summary info ============================
FAILED test_unstructured/partition/test_api.py::test_partition_via_api_with_no_strategy - AssertionError: assert 'Zejiang Shen® (<), Ruochen Zhang?, Melissa Dell®, Benjamin Charles Germain Lee?, Jacob Carlson®, and Weining Li®' != 'Zejiang Shen® (<), Ruochen Zhang?, Melissa Dell®, Benjamin Charles Germain Lee?, Jacob Carlson®, and Weining Li®'
 +  where 'Zejiang Shen® (<), Ruochen Zhang?, Melissa Dell®, Benjamin Charles Germain Lee?, Jacob Carlson®, and Weining Li®' = <unstructured.documents.elements.Text object at 0x7fb9069fc610>.text
 +  and   'Zejiang Shen® (<), Ruochen Zhang?, Melissa Dell®, Benjamin Charles Germain Lee?, Jacob Carlson®, and Weining Li®' = <unstructured.documents.elements.Text object at 0x7fb90648ad90>.text
= 1 failed, 2299 passed, 9 skipped, 2 deselected, 2 xfailed, 9 xpassed, 14 warnings in 1241.64s (0:20:41) =
make: *** [Makefile:302: test] Error 1
```

### Testing

After temporarily removing the "skip if not on `main`" `pytest` mark,
the [unit tests
pass](https://github.com/Unstructured-IO/unstructured/actions/runs/9163268381/job/25192040902?pr=3057O)
on the feature branch.
2024-05-20 19:05:13 -04:00

529 lines
18 KiB
Python

import base64
import contextlib
import json
import os
import pathlib
from unittest.mock import ANY, Mock
import pytest
import requests
from unstructured_client.general import General
from unstructured.documents.elements import ElementType, NarrativeText
from unstructured.partition.api import partition_multiple_via_api, partition_via_api
# Absolute directory containing this test module; fixture paths are built relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# Sample email fixture, expressed relative to the `example-docs` directory.
EML_TEST_FILE = "eml/fake-email.eml"
# True when the CI environment variable is unset, empty, or one of the false-like values below.
skip_outside_ci = os.environ.get("CI", "").lower() in ("", "false", "f", "0")
# True unless GITHUB_REF_NAME equals "main" (compared case-insensitively).
skip_not_on_main = not (os.environ.get("GITHUB_REF_NAME", "").lower() == "main")
# Canned JSON body used by the mocked API responses in this module: a list holding a single
# NarrativeText element with email metadata.
MOCK_TEXT = """[
{
"element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
"text": "This is a test email to use for unit tests.",
"type": "NarrativeText",
"metadata": {
"sent_from": [
"Matthew Robinson <mrobinson@unstructured.io>"
],
"sent_to": [
"Matthew Robinson <mrobinson@unstructured.io>"
],
"subject": "Test Email",
"filename": "fake-email.eml",
"filetype": "message/rfc822"
}
}
]"""
class MockResponse:
    """Stand-in for a single-document API response whose body is always ``MOCK_TEXT``."""

    def __init__(self, status_code):
        self.headers = {"Content-Type": "application/json"}
        # string representation of partitioned elements is nested in an additional
        # layer in the new unstructured-client:
        #     `elements_from_json(text=response.raw_response.text)`
        self.raw_response = MockRawResponse()
        self.status_code = status_code

    @property
    def text(self):
        """Return the canned JSON body."""
        return MOCK_TEXT

    def json(self):
        """Return the canned body decoded from JSON."""
        body = self.text
        return json.loads(body)
class MockRawResponse:
    """Raw-response stand-in exposing the canned JSON body through its ``text`` attribute."""

    def __init__(self):
        canned_body = MOCK_TEXT
        self.text = canned_body
def test_partition_via_api_from_filename(monkeypatch):
    """Partitioning by filename yields the element described by the mocked response."""

    def fake_partition(*args, **kwargs):
        return MockResponse(status_code=200)

    monkeypatch.setattr(General, "partition", fake_partition)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    first_element = partition_via_api(filename=eml_path)[0]

    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
def test_partition_via_api_custom_url(monkeypatch):
    """
    Assert that we can specify api_url and requests are sent to the right place
    """
    request_spy = Mock(return_value=MockResponse(status_code=200))
    monkeypatch.setattr(requests.Session, "request", request_spy)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    custom_url = "http://localhost:8000/general/v0/general"
    # The sdk uses the server url, so we should be able to pass that as well
    base_url = "http://localhost:8000"

    # Whichever form is supplied, the request must be POSTed to the full endpoint URL.
    for api_url in (custom_url, base_url):
        with open(eml_path, "rb") as eml_file:
            partition_via_api(file=eml_file, api_url=api_url, metadata_filename=eml_path)

        request_spy.assert_called_with(
            "POST", custom_url, data=ANY, files=ANY, headers=ANY, params=ANY
        )
def test_partition_via_api_from_file(monkeypatch):
    """Partitioning an open file plus ``metadata_filename`` yields the mocked element."""

    def fake_partition(*args, **kwargs):
        return MockResponse(status_code=200)

    monkeypatch.setattr(General, "partition", fake_partition)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    with open(eml_path, "rb") as eml_file:
        elements = partition_via_api(file=eml_file, metadata_filename=eml_path)

    first_element = elements[0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog):
    """Passing the ``file_filename`` kwarg logs a deprecation warning."""

    def fake_partition(*args, **kwargs):
        return MockResponse(status_code=200)

    monkeypatch.setattr(General, "partition", fake_partition)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    with open(eml_path, "rb") as eml_file:
        partition_via_api(file=eml_file, file_filename=eml_path)

    for expected in ("WARNING", "The file_filename kwarg will be deprecated"):
        assert expected in caplog.text
def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch):
    """Supplying both ``file_filename`` and ``metadata_filename`` raises ``ValueError``."""

    def fake_partition(*args, **kwargs):
        return MockResponse(status_code=200)

    monkeypatch.setattr(General, "partition", fake_partition)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    with open(eml_path, "rb") as eml_file:
        with pytest.raises(ValueError):
            partition_via_api(
                file=eml_file,
                file_filename=eml_path,
                metadata_filename=eml_path,
            )
def test_partition_via_api_from_file_raises_without_filename(monkeypatch):
    """Passing an open file with no accompanying filename raises ``ValueError``."""

    def fake_partition(*args, **kwargs):
        return MockResponse(status_code=200)

    monkeypatch.setattr(General, "partition", fake_partition)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    with open(eml_path, "rb") as eml_file:
        with pytest.raises(ValueError):
            partition_via_api(file=eml_file)
def test_partition_via_api_raises_with_bad_response(monkeypatch):
    """A mocked response with status 500 makes ``partition_via_api`` raise ``ValueError``."""

    def failing_partition(*args, **kwargs):
        return MockResponse(status_code=500)

    monkeypatch.setattr(General, "partition", failing_partition)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    with pytest.raises(ValueError):
        partition_via_api(filename=eml_path)
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_with_no_strategy():
    """The "auto" strategy must not yield the same output as "hi_res" for the sample PDF."""
    pdf_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")

    # Issue the same request once per strategy; "auto" first, then "hi_res".
    by_strategy = {
        strategy: partition_via_api(
            filename=pdf_path,
            strategy=strategy,
            api_key=get_api_key(),
            skip_infer_table_types=["pdf"],
        )
        for strategy in ("auto", "hi_res")
    }
    elements_no_strategy = by_strategy["auto"]
    elements_hi_res = by_strategy["hi_res"]

    # confirm that hi_res strategy was not passed as default to partition by comparing outputs
    # elements_hi_res[3].text =
    #     'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis'
    # while elements_no_strategy[3].text = ']' (as of this writing)
    assert elements_no_strategy[3].text != elements_hi_res[3].text
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates():
    """Requesting coordinates with the hi_res strategy populates them on the first element."""
    jpg_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")

    # coordinates not included by default to limit payload size
    first_element = partition_via_api(
        filename=jpg_path,
        strategy="hi_res",
        coordinates="true",
        api_key=get_api_key(),
    )[0]

    assert first_element.metadata.coordinates is not None
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_valid_request_data_kwargs():
    """A request using the valid "fast" strategy returns a list."""
    pdf_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")

    result = partition_via_api(
        filename=pdf_path,
        strategy="fast",
        api_key=get_api_key(),
    )

    assert isinstance(result, list)
# This test needs UNS_API_KEY (via `get_api_key()`) and no response is mocked, so it is gated
# with the same CI / main-branch skip marks as the other API-key tests in this module.
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_image_block_extraction():
    """Image elements returned with ``extract_image_block_types`` carry decodable base64 data.

    Partitions a PDF with embedded images and tables using the hi_res strategy and checks
    that every returned Image element has a base64 payload and a MIME type.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf")
    elements = partition_via_api(
        filename=filename,
        strategy="hi_res",
        extract_image_block_types=["image", "table"],
        api_key=get_api_key(),
    )
    image_elements = [el for el in elements if el.category == ElementType.IMAGE]
    # NOTE(review): if no Image elements are returned, the loop below checks nothing.
    for el in image_elements:
        assert el.metadata.image_base64 is not None
        assert el.metadata.image_mime_type is not None
        # b64decode raises if the payload is not valid base64.
        image_data = base64.b64decode(el.metadata.image_base64)
        assert isinstance(image_data, bytes)
def test_partition_via_api_pass_list_type_parameters(monkeypatch):
    """List-valued kwargs are sent as JSON-encoded form fields in the outgoing request."""
    request_spy = Mock(return_value=MockResponse(status_code=200))
    monkeypatch.setattr(requests.Session, "request", request_spy)
    pdf_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf")

    partition_via_api(
        filename=pdf_path,
        strategy="hi_res",
        extract_image_block_types=["image", "table"],
        skip_infer_table_types=["pdf", "docx"],
        languages=["eng"],
    )

    # Multipart fields expected on the request; the uploaded file entry itself is not inspected.
    expected_files = [
        ["extract_image_block_types", [None, '["image", "table"]']],
        ["files", ANY],
        ["languages", [None, '["eng"]']],
        ["skip_infer_table_types", [None, '["pdf", "docx"]']],
        ["strategy", [None, "hi_res"]],
    ]
    request_spy.assert_called_with(
        "POST",
        ANY,
        data=ANY,
        files=expected_files,
        headers=ANY,
        params=ANY,
    )
# Note(austin) - This test is way too noisy against the hosted api
# def test_partition_via_api_invalid_request_data_kwargs():
# filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")
# with pytest.raises(SDKError):
# partition_via_api(filename=filename, strategy="not_a_strategy")
class MockMultipleResponse:
    """Stand-in for a multi-document API response: a JSON list containing two element lists."""

    def __init__(self, status_code):
        self.status_code = status_code

    @property
    def text(self):
        """Return the canned JSON body: two lists, each holding the same single element."""
        return """[
[
{
"element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
"text": "This is a test email to use for unit tests.",
"type": "NarrativeText",
"metadata": {
"sent_from": [
"Matthew Robinson <mrobinson@unstructured.io>"
],
"sent_to": [
"Matthew Robinson <mrobinson@unstructured.io>"
],
"subject": "Test Email",
"filename": "fake-email.eml",
"filetype": "message/rfc822"
}
}
],
[
{
"element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
"text": "This is a test email to use for unit tests.",
"type": "NarrativeText",
"metadata": {
"sent_from": [
"Matthew Robinson <mrobinson@unstructured.io>"
],
"sent_to": [
"Matthew Robinson <mrobinson@unstructured.io>"
],
"subject": "Test Email",
"filename": "fake-email.eml",
"filetype": "message/rfc822"
}
}
]
]"""

    def json(self):
        """Return the canned body decoded from JSON."""
        body = self.text
        return json.loads(body)
def test_partition_multiple_via_api_with_single_filename(monkeypatch):
    """A single filename with a single-document response still yields a list of element lists."""

    def fake_post(*args, **kwargs):
        return MockResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)
    eml_path = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE)

    documents = partition_multiple_via_api(filenames=[eml_path])

    first_element = documents[0][0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_from_filenames(monkeypatch):
    """Two filenames produce two element lists taken from the mocked multi-document response."""

    def fake_post(*args, **kwargs):
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [os.path.join(example_docs, name) for name in ("eml/fake-email.eml", "fake.docx")]

    documents = partition_multiple_via_api(filenames=paths)

    assert len(documents) == 2
    first_element = documents[0][0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_from_files(monkeypatch):
    """Open files plus ``metadata_filenames`` produce one element list per document."""

    def fake_post(*args, **kwargs):
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [os.path.join(example_docs, name) for name in (EML_TEST_FILE, "fake.docx")]

    # The ExitStack closes every opened file when the block exits.
    with contextlib.ExitStack() as stack:
        open_files = []
        for path in paths:
            open_files.append(stack.enter_context(open(path, "rb")))
        documents = partition_multiple_via_api(files=open_files, metadata_filenames=paths)

    assert len(documents) == 2
    first_element = documents[0][0]
    assert first_element == NarrativeText("This is a test email to use for unit tests.")
    assert first_element.metadata.filetype == "message/rfc822"
def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog):
    """Passing the ``file_filenames`` kwarg logs a deprecation warning."""

    def fake_post(*args, **kwargs):
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [os.path.join(example_docs, name) for name in (EML_TEST_FILE, "fake.docx")]

    with contextlib.ExitStack() as stack:
        open_files = []
        for path in paths:
            open_files.append(stack.enter_context(open(path, "rb")))
        partition_multiple_via_api(files=open_files, file_filenames=paths)

    for expected in ("WARNING", "The file_filenames kwarg will be deprecated"):
        assert expected in caplog.text
def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch):
    """Supplying both ``metadata_filenames`` and ``file_filenames`` raises ``ValueError``.

    Despite the "warns" in its name, this test asserts that an exception is raised.
    """

    def fake_post(*args, **kwargs):
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [os.path.join(example_docs, name) for name in (EML_TEST_FILE, "fake.docx")]

    with contextlib.ExitStack() as stack:
        open_files = []
        for path in paths:
            open_files.append(stack.enter_context(open(path, "rb")))
        with pytest.raises(ValueError):
            partition_multiple_via_api(
                files=open_files,
                metadata_filenames=paths,
                file_filenames=paths,
            )
def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch):
    """A mocked response with status 500 makes the multi-file call raise ``ValueError``."""

    def failing_post(*args, **kwargs):
        return MockMultipleResponse(status_code=500)

    monkeypatch.setattr(requests, "post", failing_post)
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [os.path.join(example_docs, name) for name in (EML_TEST_FILE, "fake.docx")]

    with pytest.raises(ValueError):
        partition_multiple_via_api(filenames=paths)
def test_partition_multiple_via_api_raises_with_content_types_size_mismatch(monkeypatch):
    """A ``content_types`` list shorter than ``filenames`` raises ``ValueError``.

    The mocked response reports status 200 and both filenames use the same fixture paths as
    the sibling tests, so neither an error status nor a differing fixture path can be what
    satisfies the ``pytest.raises`` check.
    """
    monkeypatch.setattr(
        requests,
        "post",
        lambda *args, **kwargs: MockMultipleResponse(status_code=200),
    )
    filenames = [
        os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE),
        os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"),
    ]
    with pytest.raises(ValueError):
        partition_multiple_via_api(
            filenames=filenames,
            # One content type for two files: the deliberate mismatch under test.
            content_types=["text/plain"],
        )
def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeypatch):
    """A ``content_types`` list shorter than ``files`` raises ``ValueError``."""

    def fake_post(*args, **kwargs):
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [os.path.join(example_docs, name) for name in (EML_TEST_FILE, "fake.docx")]

    with contextlib.ExitStack() as stack:
        open_files = []
        for path in paths:
            open_files.append(stack.enter_context(open(path, "rb")))
        with pytest.raises(ValueError):
            # Two files but only one content type.
            partition_multiple_via_api(
                files=open_files,
                metadata_filenames=paths,
                content_types=["text/plain"],
            )
def test_partition_multiple_via_api_from_files_raises_without_filenames(monkeypatch):
    """Passing open files with no accompanying filenames raises ``ValueError``."""

    def fake_post(*args, **kwargs):
        return MockMultipleResponse(status_code=200)

    monkeypatch.setattr(requests, "post", fake_post)
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [os.path.join(example_docs, name) for name in (EML_TEST_FILE, "fake.docx")]

    with contextlib.ExitStack() as stack:
        open_files = []
        for path in paths:
            open_files.append(stack.enter_context(open(path, "rb")))
        with pytest.raises(ValueError):
            partition_multiple_via_api(files=open_files)
def get_api_key():
    """Return the value of the ``UNS_API_KEY`` environment variable.

    Raises:
        ValueError: if the variable is not set.
    """
    key = os.environ.get("UNS_API_KEY")
    if key is not None:
        return key
    raise ValueError("UNS_API_KEY environment variable not set")
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_multiple_via_api_valid_request_data_kwargs():
    """A multi-file request using the valid "auto" strategy returns a list."""
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [
        os.path.join(example_docs, name)
        for name in ("layout-parser-paper-fast.pdf", "layout-parser-paper-fast.jpg")
    ]

    result = partition_multiple_via_api(filenames=paths, strategy="auto", api_key=get_api_key())

    assert isinstance(result, list)
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
def test_partition_multiple_via_api_invalid_request_data_kwargs():
    """A multi-file request with an unrecognized strategy raises ``ValueError``."""
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    paths = [
        os.path.join(example_docs, name)
        for name in ("layout-parser-paper-fast.pdf", "layout-parser-paper-fast.jpg")
    ]

    with pytest.raises(ValueError):
        partition_multiple_via_api(
            filenames=paths,
            strategy="not_a_strategy",
            api_key=get_api_key(),
        )