mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-19 07:02:38 +00:00

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:
* When partition is used, any content type or auto file type detection will override file-specific partition function metadata.
* Both auto and file-specific partitioning give the desired filetype metadata.

Won't work with image files currently. The plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference.
228 lines
7.8 KiB
Python
228 lines
7.8 KiB
Python
import os
|
|
import pathlib
|
|
from unittest import mock
|
|
|
|
import pytest
|
|
import requests
|
|
from pytesseract import TesseractError
|
|
from unstructured_inference.inference import layout
|
|
|
|
from unstructured.documents.elements import Title
|
|
from unstructured.partition import image, pdf
|
|
|
|
# Absolute path of the directory containing this test module; used below to
# build paths to files under ``example-docs``.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
|
|
|
# True when the ``/.dockerenv`` marker file exists; used by ``skipif`` markers
# below to skip selected tests inside a Docker container.
is_in_docker = os.path.exists("/.dockerenv")
|
|
|
|
|
|
class MockResponse:
    """Minimal stand-in for an HTTP response used by the mocked ``requests`` calls.

    Stores the status code and payload passed to the constructor; ``json()``
    hands the payload back unchanged.
    """

    def __init__(self, status_code, response):
        self.status_code = status_code
        self.response = response

    def json(self):
        """Return the payload supplied at construction time."""
        return self.response
|
|
|
|
|
|
def mock_healthy_get(url, **kwargs):
    """Return a ``MockResponse`` with status 200 and an empty payload.

    ``url`` and ``kwargs`` are accepted so the function can replace
    ``requests.get``; both are ignored.
    """
    return MockResponse(status_code=200, response={})
|
|
|
|
|
|
def mock_unhealthy_get(url, **kwargs):
    """Return a ``MockResponse`` with status 500 and an empty payload.

    ``url`` and ``kwargs`` are accepted so the function can replace
    ``requests.get``; both are ignored.
    """
    return MockResponse(status_code=500, response={})
|
|
|
|
|
|
def mock_unsuccessful_post(url, **kwargs):
    """Return a ``MockResponse`` with status 500 and an empty payload.

    ``url`` and ``kwargs`` are accepted so the function can replace
    ``requests.post``; both are ignored.
    """
    return MockResponse(status_code=500, response={})
|
|
|
|
|
|
def mock_successful_post(url, **kwargs):
    """Return a status-200 ``MockResponse`` carrying a two-page payload.

    Each page in the payload holds a single ``Title`` element. ``url`` and
    ``kwargs`` are accepted so the function can replace ``requests.post``;
    both are ignored.
    """
    response = {
        "pages": [
            {
                "number": 0,
                "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
            },
            {
                "number": 1,
                "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
            },
        ],
    }
    return MockResponse(status_code=200, response=response)
|
|
|
|
|
|
class MockPageLayout(layout.PageLayout):
    """``PageLayout`` subclass whose ``elements`` is a fixed single-title list."""

    def __init__(self, number: int):
        # Intentionally does nothing: the parent initializer is not called and
        # ``number`` is not stored.
        pass

    @property
    def elements(self):
        """Return one ``Title`` layout element with fixed coordinates and text."""
        return [
            layout.LayoutElement(
                type="Title",
                x1=0,
                y1=0,
                x2=2,
                y2=2,
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """``DocumentLayout`` subclass whose ``pages`` is a single ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a one-item list containing ``MockPageLayout(number=0)``."""
        return [
            MockPageLayout(
                number=0,
            ),
        ]
|
|
|
|
|
|
def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
    """Check ``pdf._partition_via_api`` returns both mocked titles in order.

    ``requests.post`` and ``requests.get`` are replaced with the successful
    and healthy mocks, so no network call is made.
    """
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    partition_image_response = pdf._partition_via_api(filename)
    assert partition_image_response[0]["type"] == "Title"
    assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
    assert partition_image_response[1]["type"] == "Title"
    assert partition_image_response[1]["text"] == "A Charlie Brown Christmas"
|
|
|
|
|
|
def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"):
    """Check a ``PageBreak`` sits between the two titles when page breaks are requested.

    ``requests.post`` and ``requests.get`` are replaced with the successful
    and healthy mocks, so no network call is made.
    """
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    partition_image_response = pdf._partition_via_api(filename, include_page_breaks=True)
    assert partition_image_response[0]["type"] == "Title"
    assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
    assert partition_image_response[1]["type"] == "PageBreak"
    assert partition_image_response[2]["type"] == "Title"
    assert partition_image_response[2]["text"] == "A Charlie Brown Christmas"
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/example.jpg", None), (None, b"0000")],
)
def test_partition_image_local(monkeypatch, filename, file):
    """Check local partitioning returns the mocked title for filename and file inputs.

    Both ``layout.process_data_with_model`` and ``layout.process_file_with_model``
    are replaced with a callable returning a ``MockDocumentLayout``.
    """
    # Same patch for both entry points, applied in the original order.
    for attr_name in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(
            layout,
            attr_name,
            lambda *args, **kwargs: MockDocumentLayout(),
        )

    partition_image_response = pdf._partition_pdf_or_image_local(filename, file, is_image=True)
    assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin"
|
|
|
|
|
|
@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
def test_partition_image_local_raises_with_no_filename():
    """Expect ``FileNotFoundError`` for an empty filename and no file (currently skipped)."""
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
|
|
|
|
|
|
def test_partition_image_api_raises_with_failed_healthcheck(
    monkeypatch,
    filename="example-docs/example.jpg",
):
    """Expect ``ValueError`` when the mocked ``requests.get`` returns status 500."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_unhealthy_get)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
|
|
|
|
|
|
def test_partition_image_api_raises_with_failed_api_call(
    monkeypatch,
    filename="example-docs/example.jpg",
):
    """Expect ``ValueError`` when the mocked ``requests.post`` returns status 500."""
    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("url", "api_called", "local_called"),
    [("fakeurl", True, False), (None, False, True)],
)
def test_partition_image(url, api_called, local_called):
    """Check ``partition_image`` uses the API path with a URL and the local path without one.

    Both ``pdf._partition_via_api`` and ``pdf._partition_pdf_or_image_local``
    are replaced with ``MagicMock`` objects so only the dispatch is exercised.
    """
    with mock.patch.object(
        pdf,
        "_partition_via_api",
        mock.MagicMock(),
    ), mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        mock.MagicMock(),
    ):
        image.partition_image(filename="fake.pdf", strategy="hi_res", url=url)
        # Read ``.called`` while the patches are active; on exit the real
        # functions are restored.
        assert pdf._partition_via_api.called == api_called
        assert pdf._partition_pdf_or_image_local.called == local_called
|
|
|
|
|
|
def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"):
    """Check the ``auto`` strategy yields the expected long title as the first match.

    Only ``Title`` elements whose text splits into more than 10 space-separated
    parts are considered.
    """
    elements = image.partition_image(filename=filename, strategy="auto")
    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    # Fail with a readable message instead of an IndexError when nothing matched.
    assert titles, "no Title element with more than 10 words was produced"
    assert titles[0].text == title
|
|
|
|
|
|
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
    """Check ``ocr_languages`` is forwarded to ``layout.process_file_with_model``.

    The layout function is replaced with a ``MagicMock`` and its recorded
    keyword arguments are inspected after the call.
    """
    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_partition:
        image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng+swe")

    assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
|
|
|
|
|
|
def test_partition_image_from_file_with_language_passed(filename="example-docs/example.jpg"):
    """Check ``ocr_languages`` is forwarded to ``layout.process_data_with_model``.

    The image is passed as an open binary file; the layout function is
    replaced with a ``MagicMock`` and its recorded keyword arguments are
    inspected after the call.
    """
    with mock.patch.object(
        layout,
        "process_data_with_model",
        mock.MagicMock(),
    ) as mock_partition, open(filename, "rb") as f:
        image.partition_image(file=f, strategy="hi_res", ocr_languages="eng+swe")

    assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
|
|
|
|
|
|
def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
    """Expect ``TesseractError`` when ``ocr_languages`` is the bogus value ``"fakeroo"``."""
    with pytest.raises(TesseractError):
        image.partition_image(filename=filename, strategy="hi_res", ocr_languages="fakeroo")
|
|
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_partition_image_with_ocr_detects_korean():
    """Check ``ocr_only`` with ``eng+kor`` finds the English title and the Korean text.

    Spaces are stripped from the fourth element's text before the prefix
    comparison.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
    elements = image.partition_image(
        filename=filename,
        ocr_languages="eng+kor",
        strategy="ocr_only",
    )

    assert elements[0] == Title("RULES AND INSTRUCTIONS")
    assert elements[3].text.replace(" ", "").startswith("안녕하세요")
|
|
|
|
|
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_partition_image_with_ocr_detects_korean_from_file():
    """Same check as the filename-based Korean OCR test, with an open binary file as input.

    Spaces are stripped from the fourth element's text before the prefix
    comparison.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")

    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f,
            ocr_languages="eng+kor",
            strategy="ocr_only",
        )

    assert elements[0] == Title("RULES AND INSTRUCTIONS")
    assert elements[3].text.replace(" ", "").startswith("안녕하세요")
|
|
|
|
|
|
def test_partition_image_raises_with_bad_strategy():
    """Expect ``ValueError`` when ``strategy`` is the unknown value ``"fakeroo"``."""
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
    with pytest.raises(ValueError):
        image.partition_image(filename=filename, strategy="fakeroo")
|