mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-23 00:51:30 +00:00

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, and also adds a nice layered approach to determining the filetype: since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata. Tests are added to make sure that (1) when partition is used, any content type or auto file type detection will override file-specific partition function metadata, and (2) both auto and file-specific partitioning give the desired filetype metadata. This won't work with image files currently: the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference.
414 lines
14 KiB
Python
414 lines
14 KiB
Python
import os
|
|
from tempfile import SpooledTemporaryFile
|
|
from unittest import mock
|
|
|
|
import pytest
|
|
import requests
|
|
from unstructured_inference.inference import layout
|
|
|
|
from unstructured.documents.elements import NarrativeText, PageBreak, Text, Title
|
|
from unstructured.partition import pdf, strategies
|
|
|
|
|
|
class MockResponse:
    """Minimal response double exposing ``status_code`` and ``json()``.

    Instances are returned by the fake ``requests.get`` / ``requests.post``
    helpers in this module.
    """

    def __init__(self, status_code, response):
        # The payload is stored untouched; ``json()`` hands back the same object.
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload supplied at construction time."""
        payload = self.response
        return payload
|
|
|
|
|
|
def mock_healthy_get(url, **kwargs):
    """Fake GET handler that always answers with status 200 and an empty body.

    ``url`` and any keyword arguments are accepted but ignored.
    """
    empty_body = {}
    return MockResponse(response=empty_body, status_code=200)
|
|
|
|
|
|
def mock_unhealthy_get(url, **kwargs):
    """Fake GET handler that always answers with status 500 and an empty body.

    ``url`` and any keyword arguments are accepted but ignored.
    """
    empty_body = {}
    return MockResponse(response=empty_body, status_code=500)
|
|
|
|
|
|
def mock_unsuccessful_post(url, **kwargs):
    """Fake POST handler that always answers with status 500 and an empty body.

    ``url`` and any keyword arguments are accepted but ignored.
    """
    empty_body = {}
    return MockResponse(response=empty_body, status_code=500)
|
|
|
|
|
|
def mock_successful_post(url, **kwargs):
    """Fake POST handler returning a 200 response with two single-title pages.

    ``url`` and any keyword arguments are accepted but ignored. The payload has a
    ``"pages"`` list; page ``0`` and page ``1`` each carry one ``Title`` element.
    """
    page_titles = (
        "Charlie Brown and the Great Pumpkin",
        "A Charlie Brown Christmas",
    )
    # Page numbers are the zero-based positions of the titles above.
    pages = [
        {"number": page_number, "elements": [{"type": "Title", "text": title_text}]}
        for page_number, title_text in enumerate(page_titles)
    ]
    return MockResponse(status_code=200, response={"pages": pages})
|
|
|
|
|
|
class MockPageLayout(layout.PageLayout):
    """Page layout double that exposes one hard-coded ``Title`` element."""

    def __init__(self, number: int):
        """Accept ``number`` and do nothing; the parent initializer is not called."""

    @property
    def elements(self):
        """Return a one-item list holding the canned title layout element."""
        title_element = layout.LayoutElement(
            type="Title",
            x1=0,
            y1=0,
            x2=2,
            y2=2,
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title_element]
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout double whose ``pages`` is a single ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a new one-item list containing a mock page built with ``number=0``."""
        only_page = MockPageLayout(number=0)
        return [only_page]
|
|
|
|
|
|
def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
    """``_partition_via_api`` returns both titles from the mocked successful API reply."""
    # Replace the HTTP calls with canned responses: POST succeeds, GET is healthy.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    elements = pdf._partition_via_api(filename)

    expected_titles = (
        "Charlie Brown and the Great Pumpkin",
        "A Charlie Brown Christmas",
    )
    for position, expected_text in enumerate(expected_titles):
        assert elements[position]["type"] == "Title"
        assert elements[position]["text"] == expected_text
|
|
|
|
|
|
def test_partition_pdf_api_page_breaks(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_page_breaks=True`` a ``PageBreak`` entry sits between the two titles."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    elements = pdf._partition_via_api(filename, include_page_breaks=True)

    first_title = elements[0]
    assert first_title["type"] == "Title"
    assert first_title["text"] == "Charlie Brown and the Great Pumpkin"

    assert elements[1]["type"] == "PageBreak"

    second_title = elements[2]
    assert second_title["type"] == "Title"
    assert second_title["text"] == "A Charlie Brown Christmas"
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning returns the mock layout's title for both filename and file input."""

    def fake_process(*args, **kwargs):
        # Stand-in for the layout model entry points; ignores every argument.
        return MockDocumentLayout()

    for entry_point in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, entry_point, fake_process)

    elements = pdf._partition_pdf_or_image_local(filename, file)
    assert elements[0].text == "Charlie Brown and the Great Pumpkin"
|
|
|
|
|
|
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    """``_partition_via_api`` raises ``FileNotFoundError`` when given neither filename nor file."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    no_input = {"filename": None, "file": None}
    with pytest.raises(FileNotFoundError):
        pdf._partition_via_api(**no_input)
|
|
|
|
|
|
def test_partition_pdf_local_raises_with_no_filename():
    """Local partitioning raises ``FileNotFoundError`` for an empty filename and no file."""
    no_input = {"filename": "", "file": None, "is_image": False}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**no_input)
|
|
|
|
|
|
def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``_partition_via_api`` raises ``ValueError`` when the mocked GET returns status 500."""
    # POST would succeed; only the GET is made to fail.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_unhealthy_get)

    call_kwargs = {"filename": filename}
    with pytest.raises(ValueError):
        pdf._partition_via_api(**call_kwargs)
|
|
|
|
|
|
def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``_partition_via_api`` raises ``ValueError`` when the mocked POST returns status 500."""
    # GET is healthy; only the POST is made to fail.
    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    call_kwargs = {"filename": filename}
    with pytest.raises(ValueError):
        pdf._partition_via_api(**call_kwargs)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf(url, api_called, local_called, monkeypatch):
    """``partition_pdf`` calls the API helper when ``url`` is set, the local helper otherwise."""
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: True)

    api_patch = mock.patch.object(pdf, attribute="_partition_via_api", new=mock.MagicMock())
    local_patch = mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock())
    with api_patch, local_patch:
        pdf.partition_pdf(filename="fake.pdf", strategy="hi_res", url=url)
        # The checks stay inside the ``with`` so they read the patched attributes.
        assert pdf._partition_via_api.called == api_called
        assert pdf._partition_pdf_or_image_local.called == local_called
|
|
|
|
|
|
@pytest.mark.parametrize(
    "strategy",
    ["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``partition_pdf`` accepts a ``SpooledTemporaryFile`` as ``file`` for each strategy.

    The sample PDF is copied into a spooled temporary file, rewound, and handed to
    ``partition_pdf``; the result must contain more than ten elements.
    """
    # Both handles are context-managed so the spooled buffer is closed even when
    # partitioning raises or the assertion fails.
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(test_file.read())
        spooled_temp_file.seek(0)
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
        # Validate that partitioning produced more than ten elements.
        assert len(result) > 10
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf_with_template(url, api_called, local_called, monkeypatch):
    """Passing ``template="checkbox"`` keeps the same API-versus-local dispatch on ``url``."""
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: True)

    patch_api = mock.patch.object(pdf, attribute="_partition_via_api", new=mock.MagicMock())
    patch_local = mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock())
    with patch_api, patch_local:
        pdf.partition_pdf(filename="fake.pdf", strategy="hi_res", url=url, template="checkbox")
        # The checks stay inside the ``with`` so they read the patched attributes.
        assert pdf._partition_via_api.called == api_called
        assert pdf._partition_pdf_or_image_local.called == local_called
|
|
|
|
|
|
def test_partition_pdf_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
    """With ``strategy="auto"`` the first long ``Title`` element is the paper's title."""
    expected = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"

    # Collect titles whose text splits into more than ten space-separated pieces.
    long_titles = []
    for element in pdf.partition_pdf(filename=filename, strategy="auto"):
        if element.category != "Title":
            continue
        if len(element.text.split(" ")) > 10:
            long_titles.append(element)

    assert long_titles[0].text == expected
|
|
|
|
|
|
def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """``include_page_breaks=True`` yields at least one ``PageBreak`` element."""
    options = {"filename": filename, "url": None, "include_page_breaks": True}
    partitioned = pdf.partition_pdf(**options)
    assert PageBreak() in partitioned
|
|
|
|
|
|
def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Without ``include_page_breaks`` the result contains no ``PageBreak`` element."""
    options = {"filename": filename, "url": None}
    partitioned = pdf.partition_pdf(**options)
    assert PageBreak() not in partitioned
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
    """The ``fast`` strategy produces more than ten elements for the sample PDF."""
    partitioned = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    element_count = len(partitioned)
    assert element_count > 10
|
|
|
|
|
|
def test_partition_pdf_with_fast_groups_text(filename="example-docs/layout-parser-paper-fast.pdf"):
    """The ``fast`` strategy groups the abstract into a single ``NarrativeText`` element.

    The first ``NarrativeText`` in the output must be longer than 1000 characters,
    start with ``"Abstract. Recent advances"`` and end with the project URL.
    """
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")

    first_narrative_element = next(
        (element for element in elements if isinstance(element, NarrativeText)),
        None,
    )
    # Fail with a readable message instead of an AttributeError on ``None``.
    assert first_narrative_element is not None, "no NarrativeText element was produced"

    assert len(first_narrative_element.text) > 1000
    assert first_narrative_element.text.startswith("Abstract. Recent advances")
    assert first_narrative_element.text.endswith("https://layout-parser.github.io.")
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_from_file(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The ``fast`` strategy works on an open binary file handle instead of a filename."""
    with open(filename, "rb") as pdf_stream:
        partitioned = pdf.partition_pdf(file=pdf_stream, url=None, strategy="fast")
    element_count = len(partitioned)
    assert element_count > 10
|
|
|
|
|
|
def test_partition_pdf_with_fast_strategy_and_page_breaks(
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``fast`` with page breaks yields many elements, a ``PageBreak``, and no detectron2 log."""
    options = {
        "filename": filename,
        "url": None,
        "strategy": "fast",
        "include_page_breaks": True,
    }
    partitioned = pdf.partition_pdf(**options)

    assert len(partitioned) > 10
    assert PageBreak() in partitioned

    # The captured log must not mention a missing detectron2 install.
    assert "detectron2 is not installed" not in caplog.text
|
|
|
|
|
|
def test_partition_pdf_raises_with_bad_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An unrecognized strategy name makes ``partition_pdf`` raise ``ValueError``."""
    options = {"filename": filename, "url": None, "strategy": "made_up"}
    with pytest.raises(ValueError):
        pdf.partition_pdf(**options)
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``hi_res`` uses the pdfminer partitioner when detectron2 and pytesseract are missing."""
    unavailable = ("detectron2", "pytesseract")
    # Report every dependency as present except the ones listed above.
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)

    canned_elements = [Text("Hello there!")]
    pdfminer_patch = mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=canned_elements,
    )
    with pdfminer_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "detectron2 is not installed" in caplog.text
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``ocr_only`` uses the pdfminer partitioner when pytesseract is reported missing."""
    unavailable = ("pytesseract",)
    # Report every dependency as present except pytesseract.
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)

    canned_elements = [Text("Hello there!")]
    pdfminer_patch = mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=canned_elements,
    )
    with pdfminer_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``ocr_only`` uses the local partitioner when pytesseract is missing and text is not extractable."""
    unavailable = ("pytesseract",)
    # Report every dependency as present except pytesseract, and make the PDF
    # look like it has no extractable text.
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: False)

    canned_elements = [Text("Hello there!")]
    local_patch = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        return_value=canned_elements,
    )
    with local_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text
|
|
|
|
|
|
def test_partition_pdf_falls_back_to_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``hi_res`` uses the OCR partitioner when detectron2 is reported missing."""
    unavailable = ("detectron2",)
    # Report every dependency as present except detectron2.
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)

    canned_elements = [Text("Hello there!")]
    ocr_patch = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_with_ocr",
        return_value=canned_elements,
    )
    with ocr_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")

    mock_partition.assert_called_once()
    assert "detectron2 is not installed" in caplog.text
|
|
|
|
|
|
def test_partition_pdf_uses_table_extraction():
    """``infer_table_structure=True`` reaches ``process_file_with_model`` as a truthy ``extract_tables``."""
    sample_pdf = "example-docs/layout-parser-paper-fast.pdf"
    target = "unstructured_inference.inference.layout.process_file_with_model"
    with mock.patch(target) as mock_process_file_with_model:
        pdf.partition_pdf(sample_pdf, infer_table_structure=True)
        # ``call_args`` unpacks to (positional args, keyword args) of the last call.
        _, call_kwargs = mock_process_file_with_model.call_args
        assert call_kwargs["extract_tables"]
|
|
|
|
|
|
def test_partition_pdf_with_copy_protection():
    """Partition the copy-protected sample with ``hi_res`` and check its first element."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
    # The comparison used to be a bare expression whose result was discarded, so
    # nothing about the output was verified. It is now asserted.
    # NOTE(review): this expectation was never enforced before; confirm the index
    # and the title text against the real output if this assertion fails.
    expected = Title("LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis")
    assert elements[0] == expected
|
|
|
|
|
|
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    """Request ``fast`` on the copy-protected sample and check the output and the log.

    The captured log must contain ``"PDF text is not extractable"``.
    """
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
    # The comparison used to be a bare expression whose result was discarded, so
    # nothing about the output was verified. It is now asserted.
    # NOTE(review): this expectation was never enforced before; confirm the index
    # and the title text against the real output if this assertion fails.
    expected = Title("LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis")
    assert elements[0] == expected
    assert "PDF text is not extractable" in caplog.text
|
|
|
|
|
|
def test_partition_pdf_fails_if_pdf_not_processable(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``partition_pdf`` raises ``ValueError`` with no OCR/layout deps and non-extractable text."""
    unavailable = ("detectron2", "pytesseract")
    # Report both dependencies as missing and make the PDF look like it has no
    # extractable text.
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: False)

    call_kwargs = {"filename": filename}
    with pytest.raises(ValueError):
        pdf.partition_pdf(**call_kwargs)
|
|
|
|
|
|
def test_partition_pdf_fast_groups_text_in_text_box():
    """The ``fast`` strategy on the chevron sample yields the expected titles and narrative."""
    sample_path = os.path.join("example-docs", "chevron-page.pdf")
    partitioned = pdf.partition_pdf(filename=sample_path, strategy="fast")

    assert partitioned[0] == Title("eastern mediterranean")

    narrative = partitioned[1]
    assert isinstance(narrative, NarrativeText)
    narrative_text = str(narrative)
    assert narrative_text.startswith("We")
    assert narrative_text.endswith("Jordan and Egypt.")

    expected_caption = Title(
        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
    )
    assert partitioned[3] == expected_caption
|