2023-02-27 17:30:54 +01:00
|
|
|
from unittest import mock
|
|
|
|
|
2022-11-30 16:34:24 -05:00
|
|
|
import pytest
|
|
|
|
import requests
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured_inference.inference import layout
|
2022-11-30 16:34:24 -05:00
|
|
|
|
2023-03-10 22:16:05 -05:00
|
|
|
from unstructured.documents.elements import PageBreak, Text
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured.partition import pdf
|
2022-11-21 17:27:23 -05:00
|
|
|
|
|
|
|
|
2022-11-30 16:34:24 -05:00
|
|
|
class MockResponse:
    """Minimal stand-in for an HTTP response object.

    Exposes only a ``status_code`` attribute and a ``json()`` accessor that
    hands back the payload supplied at construction time.
    """

    def __init__(self, status_code, response):
        """Store the payload and the status code verbatim."""
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload given to the constructor, unchanged."""
        payload = self.response
        return payload
|
|
|
|
|
|
|
|
|
|
|
|
def mock_healthy_get(url, **kwargs):
    """Replacement for ``requests.get``: a 200 response with an empty JSON body.

    ``url`` and any keyword arguments are accepted and ignored.
    """
    healthy = MockResponse(status_code=200, response={})
    return healthy
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unhealthy_get(url, **kwargs):
    """Replacement for ``requests.get``: a 500 response with an empty JSON body.

    ``url`` and any keyword arguments are accepted and ignored.
    """
    unhealthy = MockResponse(status_code=500, response={})
    return unhealthy
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unsuccessful_post(url, **kwargs):
    """Replacement for ``requests.post``: a 500 response with an empty JSON body.

    ``url`` and any keyword arguments are accepted and ignored.
    """
    failed = MockResponse(status_code=500, response={})
    return failed
|
|
|
|
|
|
|
|
|
|
|
|
def mock_successful_post(url, **kwargs):
    """Replacement for ``requests.post``: a 200 response carrying two pages.

    The JSON body has a single ``"pages"`` key. Pages are numbered 0 and 1 and
    each holds exactly one ``Title`` element. ``url`` and any keyword
    arguments are accepted and ignored.
    """
    titles = (
        "Charlie Brown and the Great Pumpkin",
        "A Charlie Brown Christmas",
    )
    pages = [
        {"number": page_number, "elements": [{"type": "Title", "text": title}]}
        for page_number, title in enumerate(titles)
    ]
    return MockResponse(status_code=200, response={"pages": pages})
|
|
|
|
|
|
|
|
|
2023-01-04 16:19:05 -06:00
|
|
|
class MockPageLayout(layout.PageLayout):
    """``PageLayout`` subclass whose ``elements`` are a fixed single title."""

    def __init__(self, number: int):
        """Accept ``number`` without storing it.

        The parent constructor is not invoked.
        """

    @property
    def elements(self):
        """Return a one-item list holding a ``Title`` layout element."""
        title = layout.LayoutElement(
            type="Title",
            coordinates=[(0, 0), (2, 2)],
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title]
|
|
|
|
|
|
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """``DocumentLayout`` subclass whose ``pages`` are one ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a one-item list with a freshly built page numbered 0."""
        only_page = MockPageLayout(number=0)
        return [only_page]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
    """``_partition_via_api`` returns both titles from the stubbed response, in order."""
    # Swap the HTTP calls for the canned responses defined in this module.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    elements = pdf._partition_via_api(filename)

    first = elements[0]
    assert first["type"] == "Title"
    assert first["text"] == "Charlie Brown and the Great Pumpkin"

    second = elements[1]
    assert second["type"] == "Title"
    assert second["text"] == "A Charlie Brown Christmas"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_api_page_breaks(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_page_breaks=True`` a ``PageBreak`` entry sits between the two titles."""
    # Swap the HTTP calls for the canned responses defined in this module.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    elements = pdf._partition_via_api(filename, include_page_breaks=True)

    first = elements[0]
    assert first["type"] == "Title"
    assert first["text"] == "Charlie Brown and the Great Pumpkin"

    assert elements[1]["type"] == "PageBreak"

    third = elements[2]
    assert third["type"] == "Title"
    assert third["text"] == "A Charlie Brown Christmas"
|
2022-11-30 16:34:24 -05:00
|
|
|
|
|
|
|
|
2023-01-04 16:19:05 -06:00
|
|
|
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning yields the mocked title for both a filename and a file input."""

    def fake_process(*args, **kwargs):
        # Whatever the arguments, answer with the canned document layout.
        return MockDocumentLayout()

    # Both model entry points on the layout module are replaced by the stub.
    monkeypatch.setattr(layout, "process_data_with_model", fake_process)
    monkeypatch.setattr(layout, "process_file_with_model", fake_process)

    elements = pdf._partition_pdf_or_image_local(filename, file)

    head = elements[0]
    assert head.type == "Title"
    assert head.text == "Charlie Brown and the Great Pumpkin"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    """``_partition_via_api`` raises ``FileNotFoundError`` when given neither filename nor file."""
    # The HTTP layer is stubbed with the successful/healthy responses.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    with pytest.raises(FileNotFoundError):
        pdf._partition_via_api(filename=None, file=None)
|
2023-01-04 16:19:05 -06:00
|
|
|
|
2022-11-30 16:34:24 -05:00
|
|
|
|
2023-01-13 22:24:13 -06:00
|
|
|
def test_partition_pdf_local_raises_with_no_filename():
    """Local partitioning raises ``FileNotFoundError`` for an empty filename and no file."""
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
|
2022-11-30 16:34:24 -05:00
|
|
|
|
|
|
|
|
2023-01-04 16:19:05 -06:00
|
|
|
def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``_partition_via_api`` raises ``ValueError`` when the stubbed GET returns a 500."""
    # POST is stubbed to succeed; only the GET stub returns a 500.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_unhealthy_get)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename)
|
2022-11-30 16:34:24 -05:00
|
|
|
|
|
|
|
|
2023-01-04 16:19:05 -06:00
|
|
|
def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``_partition_via_api`` raises ``ValueError`` when the stubbed POST returns a 500."""
    # GET is stubbed to return 200; only the POST stub returns a 500.
    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename)
|
2023-01-04 16:19:05 -06:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf(url, api_called, local_called):
    """``partition_pdf`` calls the API helper when ``url`` is set and the local helper otherwise."""
    api_patch = mock.patch.object(pdf, "_partition_via_api", mock.MagicMock())
    local_patch = mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock())

    with api_patch:
        with local_patch:
            pdf.partition_pdf(filename="fake.pdf", url=url)

            # Checked while the patches are active, so the attributes are the mocks.
            assert pdf._partition_via_api.called == api_called
            assert pdf._partition_pdf_or_image_local.called == local_called
|
2023-01-31 12:17:09 +05:30
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf_with_template(url, api_called, local_called):
    """Passing ``template="checkbox"`` leaves the API-versus-local routing unchanged."""
    api_patch = mock.patch.object(pdf, "_partition_via_api", mock.MagicMock())
    local_patch = mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock())

    with api_patch:
        with local_patch:
            pdf.partition_pdf(filename="fake.pdf", url=url, template="checkbox")

            # Checked while the patches are active, so the attributes are the mocks.
            assert pdf._partition_via_api.called == api_called
            assert pdf._partition_pdf_or_image_local.called == local_called
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """The output contains a ``PageBreak`` when ``include_page_breaks=True``."""
    partitioned = pdf.partition_pdf(
        filename=filename,
        url=None,
        include_page_breaks=True,
    )
    assert PageBreak() in partitioned
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """The output contains no ``PageBreak`` when ``include_page_breaks`` is not passed."""
    partitioned = pdf.partition_pdf(filename=filename, url=None)
    assert PageBreak() not in partitioned
|
2023-03-10 22:16:05 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
    """``strategy="fast"`` on the sample PDF yields more than ten elements."""
    partitioned = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    assert len(partitioned) > 10
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_from_file(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``strategy="fast"`` also works when given an open binary file object."""
    with open(filename, "rb") as pdf_file:
        partitioned = pdf.partition_pdf(file=pdf_file, url=None, strategy="fast")

    assert len(partitioned) > 10
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_and_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``strategy="fast"`` with page breaks yields more than ten elements and a ``PageBreak``."""
    options = {
        "filename": filename,
        "url": None,
        "strategy": "fast",
        "include_page_breaks": True,
    }
    partitioned = pdf.partition_pdf(**options)

    assert len(partitioned) > 10
    assert PageBreak() in partitioned
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_raises_with_bad_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``partition_pdf`` raises ``ValueError`` for the strategy name ``"made_up"``."""
    bogus_strategy = "made_up"
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename, url=None, strategy=bogus_strategy)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """A non-fast request is routed to the pdfminer partitioner when detectron2 is reported missing.

    ``pdf.dependency_exists`` is patched to return ``False`` only for
    ``"detectron2"``, and ``pdf._partition_pdf_with_pdfminer`` is replaced by
    a mock so the routing can be observed without parsing the document.
    """
    monkeypatch.setattr(pdf, "dependency_exists", lambda dep: dep != "detectron2")

    mock_return = [Text("Hello there!")]
    with mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=mock_return,
    ) as mock_partition:
        # NOTE(review): the original asked for strategy="fast" here, which
        # presumably reaches the pdfminer path regardless of the detectron2
        # patch and so never exercised the fallback. "hi_res" is taken from
        # the library's documented strategy names; confirm against
        # partition_pdf.
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
|