qued 55272eeceb
enhancement: filetype in metadata (#583)
Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning give the desired filetype metadata

This won't work with image files currently: the plumbing is there to use the image format inferred by PIL, but we first need to pull in the fix from this PR to unstructured-inference.
2023-05-15 13:23:19 -05:00

414 lines
14 KiB
Python

import os
from tempfile import SpooledTemporaryFile
from unittest import mock
import pytest
import requests
from unstructured_inference.inference import layout
from unstructured.documents.elements import NarrativeText, PageBreak, Text, Title
from unstructured.partition import pdf, strategies
class MockResponse:
    """Minimal stand-in for an HTTP response object used by the API tests.

    Only what the tests touch is provided: a ``status_code`` attribute and a
    ``json()`` method.
    """

    def __init__(self, status_code, response):
        # Both values are stored verbatim; ``json()`` hands ``response`` back.
        self.status_code = status_code
        self.response = response

    def json(self):
        """Return the payload that was supplied at construction time."""
        return self.response
def mock_healthy_get(url, **kwargs):
    """Fake GET handler: always answer with status 200 and an empty payload."""
    healthy = MockResponse(status_code=200, response={})
    return healthy
def mock_unhealthy_get(url, **kwargs):
    """Fake GET handler: always answer with status 500 and an empty payload."""
    unhealthy = MockResponse(status_code=500, response={})
    return unhealthy
def mock_unsuccessful_post(url, **kwargs):
    """Fake POST handler: always answer with status 500 and an empty payload."""
    failure = MockResponse(status_code=500, response={})
    return failure
def mock_successful_post(url, **kwargs):
    """Fake POST handler returning a canned two-page payload with status 200.

    Pages are numbered 0 and 1 and each holds a single ``Title`` element.
    """
    page_titles = (
        "Charlie Brown and the Great Pumpkin",
        "A Charlie Brown Christmas",
    )
    pages = [
        {"number": number, "elements": [{"type": "Title", "text": text}]}
        for number, text in enumerate(page_titles)
    ]
    return MockResponse(status_code=200, response={"pages": pages})
class MockPageLayout(layout.PageLayout):
    """``PageLayout`` stub whose only content is one hard-coded title element."""

    def __init__(self, number: int):
        """Do nothing: the parent initializer is not called and ``number`` is unused."""

    @property
    def elements(self):
        """Return a one-item list holding the canned ``Title`` layout element."""
        title_element = layout.LayoutElement(
            type="Title",
            x1=0,
            y1=0,
            x2=2,
            y2=2,
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title_element]
class MockDocumentLayout(layout.DocumentLayout):
    """``DocumentLayout`` stub that exposes exactly one mocked page."""

    @property
    def pages(self):
        """Return a single-item list containing ``MockPageLayout(number=0)``."""
        only_page = MockPageLayout(number=0)
        return [only_page]
def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
    """The API helper returns both titles from the mocked response, in page order."""
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    monkeypatch.setattr(requests, "post", mock_successful_post)

    api_elements = pdf._partition_via_api(filename)

    expected_texts = ["Charlie Brown and the Great Pumpkin", "A Charlie Brown Christmas"]
    for index, expected_text in enumerate(expected_texts):
        assert api_elements[index]["type"] == "Title"
        assert api_elements[index]["text"] == expected_text
def test_partition_pdf_api_page_breaks(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """With ``include_page_breaks=True`` a ``PageBreak`` sits between the two titles."""
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    monkeypatch.setattr(requests, "post", mock_successful_post)

    api_elements = pdf._partition_via_api(filename, include_page_breaks=True)

    # (type, text) per position; ``None`` means the text is not checked.
    expected = [
        ("Title", "Charlie Brown and the Great Pumpkin"),
        ("PageBreak", None),
        ("Title", "A Charlie Brown Christmas"),
    ]
    for index, (expected_type, expected_text) in enumerate(expected):
        assert api_elements[index]["type"] == expected_type
        if expected_text is not None:
            assert api_elements[index]["text"] == expected_text
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning surfaces the mocked layout's title for both input forms."""

    def fake_process(*args, **kwargs):
        # Whatever the caller passes, hand back the canned document layout.
        return MockDocumentLayout()

    for target in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, target, fake_process)

    local_elements = pdf._partition_pdf_or_image_local(filename, file)
    assert local_elements[0].text == "Charlie Brown and the Great Pumpkin"
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    """The API helper raises ``FileNotFoundError`` when given neither filename nor file."""
    for verb, fake in (("post", mock_successful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    with pytest.raises(FileNotFoundError):
        pdf._partition_via_api(filename=None, file=None)
def test_partition_pdf_local_raises_with_no_filename():
    """The local helper raises ``FileNotFoundError`` for an empty filename and no file."""
    missing_input = {"filename": "", "file": None, "is_image": False}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**missing_input)
def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """A 500 from the mocked GET makes the API helper raise ``ValueError``."""
    for verb, fake in (("post", mock_successful_post), ("get", mock_unhealthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename)
def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """A 500 from the mocked POST makes the API helper raise ``ValueError``."""
    for verb, fake in (("post", mock_unsuccessful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename)
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf(url, api_called, local_called, monkeypatch):
    """``hi_res`` uses the API helper when a URL is given and the local helper otherwise."""
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: True)

    # Keep handles on the replacement mocks so they can be inspected directly.
    api_mock = mock.MagicMock()
    local_mock = mock.MagicMock()
    with mock.patch.object(pdf, attribute="_partition_via_api", new=api_mock):
        with mock.patch.object(pdf, "_partition_pdf_or_image_local", local_mock):
            pdf.partition_pdf(filename="fake.pdf", strategy="hi_res", url=url)
            assert api_mock.called == api_called
            assert local_mock.called == local_called
@pytest.mark.parametrize(
    "strategy",
    ["fast", "hi_res", "ocr_only"],
)
def test_partition_pdf_with_spooled_file(
    strategy,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``partition_pdf`` accepts a ``SpooledTemporaryFile`` for each listed strategy."""
    # Both files are context-managed so the spooled temp file is closed even
    # when partitioning or the assertion fails.
    with open(filename, "rb") as test_file, SpooledTemporaryFile() as spooled_temp_file:
        # Copy the sample PDF into the spooled file and rewind before handing it over.
        spooled_temp_file.write(test_file.read())
        spooled_temp_file.seek(0)
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
        # Validate that partitioning produced more than ten items.
        assert len(result) > 10
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf_with_template(url, api_called, local_called, monkeypatch):
    """Passing ``template="checkbox"`` keeps the same API-vs-local dispatch for ``hi_res``."""
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: True)

    # Keep handles on the replacement mocks so they can be inspected directly.
    api_mock = mock.MagicMock()
    local_mock = mock.MagicMock()
    with mock.patch.object(pdf, attribute="_partition_via_api", new=api_mock):
        with mock.patch.object(pdf, "_partition_pdf_or_image_local", local_mock):
            pdf.partition_pdf(
                filename="fake.pdf",
                strategy="hi_res",
                url=url,
                template="checkbox",
            )
            assert api_mock.called == api_called
            assert local_mock.called == local_called
def test_partition_pdf_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
    """With ``strategy="auto"`` the first long ``Title`` element is the paper's title."""
    wanted = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    partitioned = pdf.partition_pdf(filename=filename, strategy="auto")

    # Only titles with more than ten space-separated tokens are considered.
    long_titles = []
    for element in partitioned:
        if element.category == "Title" and len(element.text.split(" ")) > 10:
            long_titles.append(element)

    assert long_titles[0].text == wanted
def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Requesting page breaks puts a ``PageBreak`` into the partitioned output."""
    partitioned = pdf.partition_pdf(
        filename=filename,
        url=None,
        include_page_breaks=True,
    )
    assert PageBreak() in partitioned
def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Without ``include_page_breaks`` no ``PageBreak`` appears in the output."""
    partitioned = pdf.partition_pdf(
        filename=filename,
        url=None,
    )
    assert PageBreak() not in partitioned
def test_partition_pdf_with_fast_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
    """The ``fast`` strategy yields more than ten elements for the sample paper."""
    partitioned = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    element_count = len(partitioned)
    assert element_count > 10
def test_partition_pdf_with_fast_groups_text(filename="example-docs/layout-parser-paper-fast.pdf"):
    """The ``fast`` strategy groups the abstract into one long ``NarrativeText`` element."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")

    first_narrative_element = next(
        (element for element in elements if isinstance(element, NarrativeText)),
        None,
    )
    # Fail with a readable message rather than an AttributeError on ``None``
    # when partitioning produced no narrative text at all.
    assert first_narrative_element is not None, "no NarrativeText element was produced"

    assert len(first_narrative_element.text) > 1000
    assert first_narrative_element.text.startswith("Abstract. Recent advances")
    assert first_narrative_element.text.endswith("https://layout-parser.github.io.")
def test_partition_pdf_with_fast_strategy_from_file(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """The ``fast`` strategy also works when given an open binary file object."""
    with open(filename, "rb") as pdf_file:
        partitioned = pdf.partition_pdf(file=pdf_file, url=None, strategy="fast")
    element_count = len(partitioned)
    assert element_count > 10
def test_partition_pdf_with_fast_strategy_and_page_breaks(
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``fast`` with page breaks returns breaks and logs no detectron2 warning."""
    options = {"url": None, "strategy": "fast", "include_page_breaks": True}
    partitioned = pdf.partition_pdf(filename=filename, **options)

    assert len(partitioned) > 10
    assert PageBreak() in partitioned

    assert "detectron2 is not installed" not in caplog.text
def test_partition_pdf_raises_with_bad_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """An unrecognised strategy name makes ``partition_pdf`` raise ``ValueError``."""
    bad_strategy = "made_up"
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename, url=None, strategy=bad_strategy)
def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``hi_res`` calls the pdfminer partitioner once when detectron2 and pytesseract are absent."""
    unavailable = ("detectron2", "pytesseract")
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)

    canned_elements = [Text("Hello there!")]
    pdfminer_patch = mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=canned_elements,
    )
    with pdfminer_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")
        mock_partition.assert_called_once()

    assert "detectron2 is not installed" in caplog.text
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``ocr_only`` calls the pdfminer partitioner once when pytesseract is absent."""
    unavailable = ("pytesseract",)
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)

    canned_elements = [Text("Hello there!")]
    pdfminer_patch = mock.patch.object(
        pdf,
        "_partition_pdf_with_pdfminer",
        return_value=canned_elements,
    )
    with pdfminer_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
        mock_partition.assert_called_once()

    assert "pytesseract is not installed" in caplog.text
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``ocr_only`` calls the local helper once when pytesseract is absent and text is not extractable."""
    unavailable = ("pytesseract",)
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: False)

    canned_elements = [Text("Hello there!")]
    local_patch = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        return_value=canned_elements,
    )
    with local_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
        mock_partition.assert_called_once()

    assert "pytesseract is not installed" in caplog.text
def test_partition_pdf_falls_back_to_ocr_only(
    monkeypatch,
    caplog,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``hi_res`` calls the OCR partitioner once when only detectron2 is absent."""
    unavailable = ("detectron2",)
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)

    canned_elements = [Text("Hello there!")]
    ocr_patch = mock.patch.object(
        pdf,
        "_partition_pdf_or_image_with_ocr",
        return_value=canned_elements,
    )
    with ocr_patch as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy="hi_res")
        mock_partition.assert_called_once()

    assert "detectron2 is not installed" in caplog.text
def test_partition_pdf_uses_table_extraction():
    """``infer_table_structure=True`` reaches the layout model as a truthy ``extract_tables``."""
    sample_pdf = "example-docs/layout-parser-paper-fast.pdf"
    patch_target = "unstructured_inference.inference.layout.process_file_with_model"
    with mock.patch(patch_target) as mock_process_file_with_model:
        pdf.partition_pdf(sample_pdf, infer_table_structure=True)
        # ``call_args`` is an (args, kwargs) pair; only the keyword part matters here.
        _, call_kwargs = mock_process_file_with_model.call_args
        assert call_kwargs["extract_tables"]
def test_partition_pdf_with_copy_protection():
    """Partitioning the copy-protected sample with ``hi_res`` yields elements."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
    # This used to be a bare ``elements[0] == Title(...)`` expression whose
    # result was discarded, so the test could never fail on its output.
    # NOTE(review): the intended expectation was a first element equal to
    # Title("LayoutParser: A Unified Toolkit for Deep Based Document Image
    # Analysis"); that literal has never been exercised, so confirm the exact
    # first element before pinning it with an assert.
    assert elements, "partitioning the copy-protected PDF produced no elements"
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    """``fast`` on the copy-protected sample yields elements and logs the non-extractable notice."""
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
    # This used to be a bare ``elements[0] == Title(...)`` expression whose
    # result was discarded, so only the log check below could ever fail.
    # NOTE(review): the intended expectation was a first element equal to
    # Title("LayoutParser: A Unified Toolkit for Deep Based Document Image
    # Analysis"); that literal has never been exercised, so confirm the exact
    # first element before pinning it with an assert.
    assert elements, "partitioning the copy-protected PDF produced no elements"
    assert "PDF text is not extractable" in caplog.text
def test_partition_pdf_fails_if_pdf_not_processable(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
):
    """``partition_pdf`` raises ``ValueError`` when text is not extractable and both optional deps are absent."""
    unavailable = ("detectron2", "pytesseract")
    monkeypatch.setattr(strategies, "dependency_exists", lambda dep: dep not in unavailable)
    monkeypatch.setattr(strategies, "is_pdf_text_extractable", lambda *args, **kwargs: False)

    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
def test_partition_pdf_fast_groups_text_in_text_box():
    """``fast`` output for the chevron sample page has the expected titles and narrative text."""
    sample_path = os.path.join("example-docs", "chevron-page.pdf")
    partitioned = pdf.partition_pdf(filename=sample_path, strategy="fast")

    assert partitioned[0] == Title("eastern mediterranean")

    assert isinstance(partitioned[1], NarrativeText)
    narrative = str(partitioned[1])
    assert narrative.startswith("We")
    assert narrative.endswith("Jordan and Egypt.")

    expected_caption = Title(
        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
    )
    assert partitioned[3] == expected_caption