# NOTE: source-viewer residue, not part of the module:
# "486 lines", "15 KiB", "Python", "Raw Normal View History".

import os
from tempfile import SpooledTemporaryFile
from unittest import mock
import pytest
import requests
from unstructured_inference.inference import layout
from unstructured.documents.elements import NarrativeText, PageBreak, Text, Title
from unstructured.partition import pdf, strategies
class MockResponse:
    """Minimal stand-in for an HTTP response, used by the request mocks below.

    Holds a status code and a canned payload; ``json()`` hands the payload back.
    """

    def __init__(self, status_code, response):
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload that was supplied at construction time."""
        payload = self.response
        return payload
def mock_healthy_get(url, **kwargs):
    """Replacement for ``requests.get`` that answers 200 with an empty payload.

    ``url`` and any keyword arguments are accepted and ignored.
    """
    healthy = MockResponse(200, {})
    return healthy
def mock_unhealthy_get(url, **kwargs):
    """Replacement for ``requests.get`` that answers 500 with an empty payload.

    ``url`` and any keyword arguments are accepted and ignored.
    """
    unhealthy = MockResponse(500, {})
    return unhealthy
def mock_unsuccessful_post(url, **kwargs):
    """Replacement for ``requests.post`` that answers 500 with an empty payload.

    ``url`` and any keyword arguments are accepted and ignored.
    """
    failed = MockResponse(500, {})
    return failed
def mock_successful_post(url, **kwargs):
    """Replacement for ``requests.post`` that answers 200 with a two-page payload.

    Pages are numbered 0 and 1 and each carries a single ``Title`` element.
    ``url`` and any keyword arguments are accepted and ignored.
    """
    page_titles = (
        "Charlie Brown and the Great Pumpkin",
        "A Charlie Brown Christmas",
    )
    pages = [
        {"number": page_number, "elements": [{"type": "Title", "text": title}]}
        for page_number, title in enumerate(page_titles)
    ]
    return MockResponse(status_code=200, response={"pages": pages})
class MockPageLayout(layout.PageLayout):
    """Page layout stub that exposes one fixed ``Title`` element.

    ``__init__`` does nothing and does not call the parent initializer, so an
    instance can be built without any real page data.
    """

    def __init__(self, number: int):
        # ``number`` is accepted and ignored.
        pass

    @property
    def elements(self):
        """Return a one-item list holding a hard-coded ``Title`` layout element."""
        return [
            layout.LayoutElement(
                type="Title",
                x1=0,
                y1=0,
                x2=2,
                y2=2,
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stub whose ``pages`` is always a single ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a one-item list holding a ``MockPageLayout`` built with ``number=0``."""
        only_page = MockPageLayout(number=0)
        return [only_page]
def test_partition_pdf_api(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With mocked healthy endpoints, the API path yields both mocked titles in order."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    elements = pdf._partition_via_api(filename)

    expected = [
        (0, "Title", "Charlie Brown and the Great Pumpkin"),
        (1, "Title", "A Charlie Brown Christmas"),
    ]
    for index, kind, text in expected:
        assert elements[index]["type"] == kind
        assert elements[index]["text"] == text
def test_partition_pdf_api_page_breaks(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_page_breaks=True`` a ``PageBreak`` sits between the two mocked titles."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    result = pdf._partition_via_api(filename, include_page_breaks=True)

    opening = result[0]
    assert opening["type"] == "Title"
    assert opening["text"] == "Charlie Brown and the Great Pumpkin"

    separator = result[1]
    assert separator["type"] == "PageBreak"

    closing = result[2]
    assert closing["type"] == "Title"
    assert closing["text"] == "A Charlie Brown Christmas"
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """The local path returns the mocked layout's title for filename and file inputs."""

    def fake_process(*args, **kwargs):
        # Both model entry points are replaced by the same canned layout.
        return MockDocumentLayout()

    for target in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, target, fake_process)

    elements = pdf._partition_pdf_or_image_local(filename, file)
    assert elements[0].text == "Charlie Brown and the Great Pumpkin"
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    """The API path raises ``FileNotFoundError`` when both filename and file are ``None``."""
    for verb, fake in (("post", mock_successful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    with pytest.raises(FileNotFoundError):
        pdf._partition_via_api(filename=None, file=None)
def test_partition_pdf_local_raises_with_no_filename():
    """The local path raises ``FileNotFoundError`` for an empty filename and no file."""
    no_input = {"filename": "", "file": None, "is_image": False}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**no_input)
def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The API path raises ``ValueError`` when the mocked GET answers 500."""
    for verb, fake in (("post", mock_successful_post), ("get", mock_unhealthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename)
def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The API path raises ``ValueError`` when the mocked POST answers 500."""
    for verb, fake in (("post", mock_unsuccessful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename)
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf(
    url,
    api_called,
    local_called,
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``hi_res`` partitioning dispatches to the API helper or the local helper.

    Per the parametrization: with ``url="fakeurl"`` only the API helper is
    expected to be called; with ``url=None`` only the local helper is.
    """

    def always_extractable(*args, **kwargs):
        return True

    monkeypatch.setattr(strategies, "is_pdf_text_extractable", always_extractable)

    api_mock = mock.MagicMock()
    local_mock = mock.MagicMock()
    with mock.patch.object(pdf, attribute="_partition_via_api", new=api_mock):
        with mock.patch.object(pdf, "_partition_pdf_or_image_local", local_mock):
            pdf.partition_pdf(filename=filename, strategy="hi_res", url=url)
            # Checked while the patches are active, so the attributes are the mocks.
            assert pdf._partition_via_api.called == api_called
            assert pdf._partition_pdf_or_image_local.called == local_called
# (source-viewer blame timestamp, not code: 2023-01-31 12:17:09 +05:30)
@pytest.mark.parametrize(
    ("strategy"),
    [("fast"), ("hi_res"), ("ocr_only")],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``partition_pdf`` can read its input from a ``SpooledTemporaryFile``.

    Runs once per parametrized strategy. The spooled file is opened in a
    ``with`` block so it is closed even when an assertion fails.
    """
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(test_file.read())
        # Rewind so the partitioner reads from the start of the copied bytes.
        spooled_temp_file.seek(0)
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
        # validate that the result holds more than ten elements
        assert len(result) > 10
        # check that the pdf has multiple different page numbers
        assert len({element.metadata.page_number for element in result}) > 1
# (source-viewer blame timestamp, not code: 2023-01-31 12:17:09 +05:30)
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf_with_template(
    url,
    api_called,
    local_called,
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Same dispatch check as the plain ``hi_res`` case, with ``template="checkbox"``.

    Per the parametrization: with ``url="fakeurl"`` only the API helper is
    expected to be called; with ``url=None`` only the local helper is.
    """
    monkeypatch.setattr(
        strategies,
        "is_pdf_text_extractable",
        lambda *args, **kwargs: True,
    )
    with mock.patch.object(
        pdf,
        attribute="_partition_via_api",
        new=mock.MagicMock(),
    ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
        pdf.partition_pdf(
            filename=filename,
            strategy="hi_res",
            url=url,
            template="checkbox",
        )
        # Checked while the patches are active, so the attributes are the mocks.
        assert pdf._partition_via_api.called == api_called
        assert pdf._partition_pdf_or_image_local.called == local_called
def test_partition_pdf_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The ``auto`` strategy finds the paper title and records file metadata on it."""
    elements = pdf.partition_pdf(filename=filename, strategy="auto")

    def is_long_title(el):
        # Only titles with more than ten space-separated words are considered.
        return el.category == "Title" and len(el.text.split(" ")) > 10

    titles = list(filter(is_long_title, elements))
    expected = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"

    assert titles[0].text == expected
    assert titles[0].metadata.filename == "layout-parser-paper-fast.pdf"
    assert titles[0].metadata.file_directory == "example-docs"
def test_partition_pdf_with_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_page_breaks=True`` the output contains a ``PageBreak``."""
    options = {"filename": filename, "url": None, "include_page_breaks": True}
    elements = pdf.partition_pdf(**options)
    assert PageBreak() in elements
def test_partition_pdf_with_no_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Without ``include_page_breaks`` the output contains no ``PageBreak``."""
    options = {"filename": filename, "url": None}
    elements = pdf.partition_pdf(**options)
    assert PageBreak() not in elements
def test_partition_pdf_with_fast_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The ``fast`` strategy yields more than ten elements across several pages."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    assert len(elements) > 10

    # check that the pdf has multiple different page numbers
    page_numbers = set()
    for element in elements:
        page_numbers.add(element.metadata.page_number)
    assert len(page_numbers) > 1
def test_partition_pdf_with_fast_groups_text(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With the ``fast`` strategy, the first ``NarrativeText`` is one long block.

    The block must exceed 1000 characters, start with "Abstract. Recent advances"
    and end with the project URL.
    """
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")

    first_narrative_element = next(
        (element for element in elements if isinstance(element, NarrativeText)),
        None,
    )
    # Fail with a clear message instead of an AttributeError on ``None`` below.
    assert first_narrative_element is not None, "no NarrativeText element was produced"

    assert len(first_narrative_element.text) > 1000
    assert first_narrative_element.text.startswith("Abstract. Recent advances")
    assert first_narrative_element.text.endswith("https://layout-parser.github.io.")
def test_partition_pdf_with_fast_strategy_from_file(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The ``fast`` strategy also works when given an open binary file object."""
    with open(filename, "rb") as pdf_stream:
        elements = pdf.partition_pdf(file=pdf_stream, url=None, strategy="fast")

    element_count = len(elements)
    assert element_count > 10
def test_partition_pdf_with_fast_strategy_and_page_breaks(
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``fast`` with page breaks yields breaks and logs no missing-inference warning."""
    options = {
        "filename": filename,
        "url": None,
        "strategy": "fast",
        "include_page_breaks": True,
    }
    elements = pdf.partition_pdf(**options)

    assert len(elements) > 10
    assert PageBreak() in elements
    assert "unstructured_inference is not installed" not in caplog.text
def test_partition_pdf_raises_with_bad_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An unrecognised strategy name makes ``partition_pdf`` raise ``ValueError``."""
    bad_call = {"filename": filename, "url": None, "strategy": "made_up"}
    with pytest.raises(ValueError):
        pdf.partition_pdf(**bad_call)
def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``hi_res`` uses the pdfminer partitioner when both optional deps are reported missing."""
    missing = ("unstructured_inference", "pytesseract")

    def mock_exists(dep):
        return dep not in missing

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    canned = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "_partition_pdf_with_pdfminer", return_value=canned)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")
        mock_partition.assert_called_once()

    assert "unstructured_inference is not installed" in caplog.text
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``ocr_only`` uses the pdfminer partitioner when ``pytesseract`` is reported missing."""
    missing = ("pytesseract",)

    def mock_exists(dep):
        return dep not in missing

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    canned = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "_partition_pdf_with_pdfminer", return_value=canned)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
        mock_partition.assert_called_once()

    assert "pytesseract is not installed" in caplog.text
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``ocr_only`` uses the local partitioner when OCR is missing and text is not extractable."""
    missing = ("pytesseract",)

    def mock_exists(dep):
        return dep not in missing

    def never_extractable(*args, **kwargs):
        return False

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", never_extractable)

    canned = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "_partition_pdf_or_image_local", return_value=canned)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
        mock_partition.assert_called_once()

    assert "pytesseract is not installed" in caplog.text
def test_partition_pdf_falls_back_to_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``hi_res`` uses the OCR partitioner when ``unstructured_inference`` is reported missing."""
    missing = ("unstructured_inference",)

    def mock_exists(dep):
        return dep not in missing

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    canned = [Text("Hello there!")]
    patcher = mock.patch.object(pdf, "_partition_pdf_or_image_with_ocr", return_value=canned)
    with patcher as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")
        mock_partition.assert_called_once()

    assert "unstructured_inference is not installed" in caplog.text
def test_partition_pdf_uses_table_extraction():
    """``infer_table_structure=True`` reaches the layout model as a truthy ``extract_tables``."""
    filename = "example-docs/layout-parser-paper-fast.pdf"
    target = "unstructured_inference.inference.layout.process_file_with_model"
    with mock.patch(target) as mock_process_file_with_model:
        pdf.partition_pdf(filename, infer_table_structure=True)
        # ``call_args`` unpacks to (positional args, keyword args) of the last call.
        _positional, keyword_args = mock_process_file_with_model.call_args
        assert keyword_args["extract_tables"]
def test_partition_pdf_with_copy_protection():
    """``hi_res`` partitions the copy-protected sample into elements on several pages."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")

    expected_title = Title(
        "LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis",
    )
    # NOTE(review): this comparison is evaluated and discarded, so it asserts
    # nothing; an ``assert`` was presumably intended. The expected text (and whether
    # equality also depends on coordinates) should be confirmed against real output
    # before making it an assertion.
    elements[0] == expected_title

    # check that the pdf has multiple different page numbers
    page_numbers = {element.metadata.page_number for element in elements}
    assert len(page_numbers) > 1
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    """``fast`` on the copy-protected sample logs that the PDF text is not extractable."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")

    expected_title = Title(
        "LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis",
    )
    # NOTE(review): this comparison is evaluated and discarded, so it asserts
    # nothing; an ``assert`` was presumably intended. The expected text (and whether
    # equality also depends on coordinates) should be confirmed against real output
    # before making it an assertion.
    elements[0] == expected_title

    assert "PDF text is not extractable" in caplog.text
def test_partition_pdf_fails_if_pdf_not_processable(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``partition_pdf`` raises ``ValueError`` with no optional deps and non-extractable text."""
    missing = ("unstructured_inference", "pytesseract")

    def mock_exists(dep):
        return dep not in missing

    def never_extractable(*args, **kwargs):
        return False

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", never_extractable)

    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
def test_partition_pdf_fast_groups_text_in_text_box():
    """The ``fast`` strategy on the chevron sample yields the expected titles and paragraph.

    Elements 0 and 3 are compared against ``Title`` objects with explicit corner
    coordinates; element 1 must be a ``NarrativeText`` with known start and end.
    """
    filename = os.path.join("example-docs", "chevron-page.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")

    heading_box = (
        (193.1741, 71.94000000000005),
        (193.1741, 91.94000000000005),
        (418.6881, 91.94000000000005),
        (418.6881, 71.94000000000005),
    )
    assert elements[0] == Title("eastern mediterranean", coordinates=heading_box)

    paragraph = elements[1]
    assert isinstance(paragraph, NarrativeText)
    assert str(paragraph).startswith("We")
    assert str(paragraph).endswith("Jordan and Egypt.")

    caption_box = (
        (69.4871, 222.4357),
        (69.4871, 272.1607),
        (197.8209, 272.1607),
        (197.8209, 222.4357),
    )
    caption_text = (
        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022"
    )
    assert elements[3] == Title(caption_text, coordinates=caption_box)