2023-04-21 17:35:43 -04:00
|
|
|
import os
|
2023-05-09 21:39:07 -07:00
|
|
|
from tempfile import SpooledTemporaryFile
|
2023-02-27 17:30:54 +01:00
|
|
|
from unittest import mock
|
|
|
|
|
2022-11-30 16:34:24 -05:00
|
|
|
import pytest
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured_inference.inference import layout
|
2022-11-30 16:34:24 -05:00
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
from unstructured.documents.coordinates import PixelSpace
|
2023-06-28 23:14:05 -04:00
|
|
|
from unstructured.documents.elements import NarrativeText, Text, Title
|
2023-05-08 13:21:24 -04:00
|
|
|
from unstructured.partition import pdf, strategies
|
2022-11-21 17:27:23 -05:00
|
|
|
|
|
|
|
|
2022-11-30 16:34:24 -05:00
|
|
|
class MockResponse:
    """Minimal stand-in for an HTTP response: a status code plus a JSON payload."""

    def __init__(self, status_code, response):
        # Store the payload first, then the status; both are plain attributes.
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload object that was passed to the constructor."""
        payload = self.response
        return payload
|
|
|
|
|
|
|
|
|
|
|
|
def mock_healthy_get(url, **kwargs):
    """Return a MockResponse with status 200 and an empty payload; arguments are ignored."""
    healthy = MockResponse(200, {})
    return healthy
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unhealthy_get(url, **kwargs):
    """Return a MockResponse with status 500 and an empty payload; arguments are ignored."""
    unhealthy = MockResponse(500, {})
    return unhealthy
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unsuccessful_post(url, **kwargs):
    """Return a MockResponse with status 500 and an empty payload; arguments are ignored."""
    failed = MockResponse(500, {})
    return failed
|
|
|
|
|
|
|
|
|
|
|
|
def mock_successful_post(url, **kwargs):
    """Return a 200 MockResponse whose payload has two pages, each with one Title element.

    The ``url`` and keyword arguments are accepted but not used.
    """
    first_page = {
        "number": 0,
        "elements": [
            {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"},
        ],
    }
    second_page = {
        "number": 1,
        "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
    }
    payload = {"pages": [first_page, second_page]}
    return MockResponse(status_code=200, response=payload)
|
|
|
|
|
|
|
|
|
2023-01-04 16:19:05 -06:00
|
|
|
class MockPageLayout(layout.PageLayout):
    """PageLayout subclass whose ``elements`` property always yields one fixed Title element."""

    def __init__(self, number: int):
        # No-op initializer: the parent __init__ is not called and `number` is not stored.
        pass

    @property
    def elements(self):
        """Return a one-item list containing the canned Title layout element."""
        title_element = layout.LayoutElement(
            type="Title",
            x1=0,
            y1=0,
            x2=2,
            y2=2,
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title_element]
|
|
|
|
|
|
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """DocumentLayout subclass whose ``pages`` property yields a single MockPageLayout."""

    @property
    def pages(self):
        """Return a one-item list containing a freshly built MockPageLayout(number=0)."""
        only_page = MockPageLayout(number=0)
        return [only_page]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning returns the mocked layout's title for both filename and file input."""

    def fake_process(*args, **kwargs):
        # Stand-in for both model entry points; always hands back the canned layout.
        return MockDocumentLayout()

    # Patch the data-based and the file-based entry points on the inference layout module.
    for target in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, target, fake_process)

    elements = pdf._partition_pdf_or_image_local(filename, file)
    first_element = elements[0]
    assert first_element.text == "Charlie Brown and the Great Pumpkin"
|
|
|
|
|
|
|
|
|
2023-01-13 22:24:13 -06:00
|
|
|
def test_partition_pdf_local_raises_with_no_filename():
    """An empty filename with no file object must raise FileNotFoundError."""
    call_kwargs = {"filename": "", "file": None, "is_image": False}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**call_kwargs)
|
2022-11-30 16:34:24 -05:00
|
|
|
|
|
|
|
|
2023-05-09 21:39:07 -07:00
|
|
|
@pytest.mark.parametrize(
    "strategy",
    ["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """partition_pdf accepts a SpooledTemporaryFile for each strategy under test.

    The example PDF is copied into a spooled temporary file, rewound, and passed as ``file``.
    The result must contain more than 10 elements spread over pages 1 and 2.
    """
    # Both handles are context-managed so the spooled file is released even when
    # partitioning raises (the previous version never closed it).
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(test_file.read())
        # Rewind so the partitioner reads from the start of the copied content.
        spooled_temp_file.seek(0)
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)

    # validate that the result is a reasonably long list of elements
    assert len(result) > 10
    # check that the pdf has multiple different page numbers
    assert {element.metadata.page_number for element in result} == {1, 2}
|
2023-05-09 21:39:07 -07:00
|
|
|
|
|
|
|
|
2023-06-27 23:06:08 -05:00
|
|
|
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
def test_partition_pdf_with_model_name(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The model name set in the environment is forwarded to process_file_with_model."""

    def text_is_extractable(*args, **kwargs):
        # Force the strategy helper to report the PDF text as extractable.
        return True

    monkeypatch.setattr(strategies, "is_pdf_text_extractable", text_is_extractable)

    # Keyword arguments the hi_res path is expected to hand to the inference entry point.
    expected_kwargs = {
        "is_image": False,
        "ocr_languages": "eng",
        "extract_tables": False,
        "model_name": "checkbox",
    }
    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
        pdf.partition_pdf(filename=filename, strategy="hi_res")
        mock_process.assert_called_once_with(filename, **expected_kwargs)
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With strategy="auto" the first element is the paper title with file metadata attached."""
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    leading = elements[0]
    assert leading.text == expected_title
    # The filename is expected to be split into base name and containing directory.
    assert leading.metadata.filename == "layout-parser-paper-fast.pdf"
    assert leading.metadata.file_directory == "example-docs"
|
2023-05-12 13:45:08 -04:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """include_page_breaks=True yields at least one element whose category is "PageBreak"."""
    elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
    categories = [elem.category for elem in elements]
    assert categories.count("PageBreak") > 0
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_no_page_breaks(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """Without include_page_breaks, no element has the "PageBreak" category."""
    elements = pdf.partition_pdf(filename=filename, url=None)
    categories = [elem.category for elem in elements]
    assert categories.count("PageBreak") == 0
|
2023-03-10 22:16:05 -05:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_fast_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy returns more than 10 elements covering pages 1 and 2."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    assert len(elements) > 10
    # check that the pdf has multiple different page numbers
    seen_pages = set()
    for element in elements:
        seen_pages.add(element.metadata.page_number)
    assert seen_pages == {1, 2}
|
2023-03-10 22:16:05 -05:00
|
|
|
|
|
|
|
|
2023-05-31 13:50:15 -05:00
|
|
|
def test_partition_pdf_with_fast_groups_text(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy groups the abstract into one long NarrativeText element.

    The first NarrativeText element must be longer than 1000 characters and must start and
    end with the expected abstract text.
    """
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")

    first_narrative_element = next(
        (element for element in elements if isinstance(element, NarrativeText)),
        None,
    )
    # Fail with a clear message instead of an AttributeError on `None.text` below.
    assert first_narrative_element is not None, "no NarrativeText element was produced"

    assert len(first_narrative_element.text) > 1000
    assert first_narrative_element.text.startswith("Abstract. Recent advances")
    assert first_narrative_element.text.endswith("https://layout-parser.github.io.")
|
|
|
|
|
|
|
|
|
2023-03-10 22:16:05 -05:00
|
|
|
def test_partition_pdf_with_fast_strategy_from_file(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy returns more than 10 elements when given an open binary file."""
    with open(filename, "rb") as pdf_handle:
        elements = pdf.partition_pdf(file=pdf_handle, url=None, strategy="fast")
    element_count = len(elements)
    assert element_count > 10
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_and_page_breaks(
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The fast strategy emits page breaks and logs no missing-inference warning."""
    partition_kwargs = {
        "filename": filename,
        "url": None,
        "strategy": "fast",
        "include_page_breaks": True,
    }
    elements = pdf.partition_pdf(**partition_kwargs)
    assert len(elements) > 10
    categories = [elem.category for elem in elements]
    assert categories.count("PageBreak") > 0

    # The captured log must not contain the fallback warning.
    assert "unstructured_inference is not installed" not in caplog.text
|
2023-04-13 11:46:35 -04:00
|
|
|
|
2023-03-10 22:16:05 -05:00
|
|
|
|
|
|
|
def test_partition_pdf_raises_with_bad_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An unknown strategy name makes partition_pdf raise ValueError."""
    bad_strategy = "made_up"
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename, url=None, strategy=bad_strategy)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """A hi_res request uses the pdfminer partitioner when inference and OCR are both missing."""
    missing_deps = ("unstructured_inference", "pytesseract")

    def mock_exists(dep):
        # Report every dependency as present except the ones listed above.
        return dep not in missing_deps

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    canned_elements = [Text("Hello there!")]
    pdfminer_patch = mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=canned_elements,
    )
    with pdfminer_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text
|
2023-05-08 13:21:24 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An ocr_only request uses the pdfminer partitioner when pytesseract is missing."""
    missing_deps = ("pytesseract",)

    def mock_exists(dep):
        # Report every dependency as present except pytesseract.
        return dep not in missing_deps

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    canned_elements = [Text("Hello there!")]
    pdfminer_patch = mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=canned_elements,
    )
    with pdfminer_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An ocr_only request uses the local partitioner when OCR is missing and text is not extractable."""
    missing_deps = ("pytesseract",)

    def mock_exists(dep):
        # Report every dependency as present except pytesseract.
        return dep not in missing_deps

    def text_not_extractable(*args, **kwargs):
        # Force the strategy helper to report the PDF text as not extractable.
        return False

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", text_not_extractable)

    canned_elements = [Text("Hello there!")]
    local_patch = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        return_value=canned_elements,
    )
    with local_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """A hi_res request uses the OCR partitioner when unstructured_inference is missing."""
    missing_deps = ("unstructured_inference",)

    def mock_exists(dep):
        # Report every dependency as present except unstructured_inference.
        return dep not in missing_deps

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    canned_elements = [Text("Hello there!")]
    ocr_patch = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_with_ocr",
        return_value=canned_elements,
    )
    with ocr_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text
|
2023-04-21 12:01:29 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_uses_table_extraction():
    """infer_table_structure=True reaches process_file_with_model as a truthy extract_tables."""
    filename = "example-docs/layout-parser-paper-fast.pdf"
    patch_target = "unstructured_inference.inference.layout.process_file_with_model"
    with mock.patch(patch_target) as mock_process_file_with_model:
        pdf.partition_pdf(filename, infer_table_structure=True)
        # call_args unpacks to (positional args, keyword args) of the most recent call.
        _, passed_kwargs = mock_process_file_with_model.call_args
        assert passed_kwargs["extract_tables"]
|
2023-04-21 17:35:43 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_with_copy_protection():
    """hi_res partitioning of the copy-protected example PDF yields the title and two pages."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")

    # The previous version evaluated `elements[0] == Title(...)` and discarded the result,
    # so the title was never actually checked. It also spelled the title without "Learning".
    # NOTE(review): the title's position in the hi_res output is not pinned here because it
    # could not be confirmed; tighten to an index check once verified against the model.
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    assert title in [element.text for element in elements]

    # check that the pdf has multiple different page numbers
    assert {element.metadata.page_number for element in elements} == {1, 2}
|
2023-04-21 17:35:43 -04:00
|
|
|
|
|
|
|
|
2023-06-22 11:19:54 -04:00
|
|
|
def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
    """The fast strategy returns more than 50 elements running from page 1 to page 3."""
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
    assert len(elements) > 50
    first_element, last_element = elements[0], elements[-1]
    assert first_element.metadata.page_number == 1
    assert last_element.metadata.page_number == 3
|
|
|
|
|
|
|
|
|
2023-04-21 17:35:43 -04:00
|
|
|
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    """A fast request on the copy-protected PDF logs that its text is not extractable."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")

    # The previous version evaluated `elements[0] == Title(...)` and discarded the result,
    # so the title was never actually checked. It also spelled the title without "Learning".
    # NOTE(review): the title's position in the fallback output is not pinned here because it
    # could not be confirmed; tighten to an index check once verified against the model.
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    assert title in [element.text for element in elements]

    assert "PDF text is not extractable" in caplog.text
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_fails_if_pdf_not_processable(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """partition_pdf raises ValueError when no strategy can handle the document.

    Both unstructured_inference and pytesseract are reported missing, and the PDF text is
    reported as not extractable.
    """
    missing_deps = ("unstructured_inference", "pytesseract")

    def mock_exists(dep):
        # Report every dependency as present except the ones listed above.
        return dep not in missing_deps

    def text_not_extractable(*args, **kwargs):
        # Force the strategy helper to report the PDF text as not extractable.
        return False

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", text_not_extractable)

    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
|
2023-05-03 18:33:24 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_pdf_fast_groups_text_in_text_box():
    """The fast strategy groups text-box content into the expected Title/NarrativeText elements.

    Checks elements 0, 1 and 3 of the chevron example page, including the exact coordinates
    of the two titles.
    """
    filename = os.path.join("example-docs", "chevron-page.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")

    expected_heading = Title(
        "eastern mediterranean",
        coordinates=(
            (193.1741, 71.94000000000005),
            (193.1741, 91.94000000000005),
            (418.6881, 91.94000000000005),
            (418.6881, 71.94000000000005),
        ),
        coordinate_system=PixelSpace(width=612, height=792),
    )
    assert elements[0] == expected_heading

    paragraph = elements[1]
    assert isinstance(paragraph, NarrativeText)
    assert str(paragraph).startswith("We")
    assert str(paragraph).endswith("Jordan and Egypt.")

    expected_rank = Title(
        "1st",
        coordinates=(
            (273.9929, 181.16470000000004),
            (273.9929, 226.16470000000004),
            (333.59990000000005, 226.16470000000004),
            (333.59990000000005, 181.16470000000004),
        ),
        coordinate_system=PixelSpace(width=612, height=792),
    )
    assert elements[3] == expected_rank
|