2023-05-04 16:23:51 -04:00
|
|
|
import os
|
|
|
|
import pathlib
|
2023-02-27 17:30:54 +01:00
|
|
|
from unittest import mock
|
|
|
|
|
2023-01-13 22:24:13 -06:00
|
|
|
import pytest
|
2023-07-05 11:25:11 -07:00
|
|
|
from PIL import Image
|
2023-04-21 09:41:26 -04:00
|
|
|
from pytesseract import TesseractError
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured_inference.inference import layout
|
2023-01-13 22:24:13 -06:00
|
|
|
|
2023-05-04 16:23:51 -04:00
|
|
|
from unstructured.documents.elements import Title
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured.partition import image, pdf
|
2023-01-13 22:24:13 -06:00
|
|
|
|
2023-05-04 16:23:51 -04:00
|
|
|
# Resolved path of the directory that contains this module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
|
|
|
|
2023-01-13 22:24:13 -06:00
|
|
|
|
|
|
|
class MockResponse:
    """Minimal stand-in for an HTTP response object.

    Stores a status code and a payload; ``json()`` hands the payload back unchanged.
    """

    def __init__(self, status_code, response):
        self.status_code, self.response = status_code, response

    def json(self):
        """Return the payload supplied at construction time."""
        payload = self.response
        return payload
|
|
|
|
|
|
|
|
|
|
|
|
def mock_healthy_get(url, **kwargs):
    """Return a ``MockResponse`` with status 200 and an empty payload; arguments are ignored."""
    empty_payload = {}
    return MockResponse(response=empty_payload, status_code=200)
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unhealthy_get(url, **kwargs):
    """Return a ``MockResponse`` with status 500 and an empty payload; arguments are ignored."""
    empty_payload = {}
    return MockResponse(response=empty_payload, status_code=500)
|
|
|
|
|
|
|
|
|
|
|
|
def mock_unsuccessful_post(url, **kwargs):
    """Return a ``MockResponse`` with status 500 and an empty payload; arguments are ignored."""
    empty_payload = {}
    return MockResponse(response=empty_payload, status_code=500)
|
|
|
|
|
|
|
|
|
|
|
|
def mock_successful_post(url, **kwargs):
    """Return a 200 ``MockResponse`` carrying a two-page payload; arguments are ignored.

    Each page entry holds its ``number`` and a single ``Title`` element.
    """
    page_titles = (
        "Charlie Brown and the Great Pumpkin",
        "A Charlie Brown Christmas",
    )
    pages = [
        {"number": page_number, "elements": [{"type": "Title", "text": title}]}
        for page_number, title in enumerate(page_titles)
    ]
    return MockResponse(status_code=200, response={"pages": pages})
|
|
|
|
|
|
|
|
|
|
|
|
class MockPageLayout(layout.PageLayout):
    """Page layout test double that always exposes one fixed ``Title`` element.

    The parent initializer is not invoked; only ``number`` and ``image`` are set.
    """

    def __init__(self, number: int, image: Image.Image):
        # ``Image`` is the PIL module, so the instance type is ``Image.Image``.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a one-item list holding a ``Title`` layout element."""
        return [
            layout.LayoutElement(
                type="Title",
                x1=0,
                y1=0,
                x2=2,
                y2=2,
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout test double whose ``pages`` is always one ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a single-page list; the page is number 0 with a 1x1 mode-"1" image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=0, image=blank_image)
        return [only_page]
|
|
|
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/example.jpg", None), (None, b"0000")],
)
def test_partition_image_local(monkeypatch, filename, file):
    """Local image partitioning returns the element text supplied by the mocked layout."""

    def fake_layout(*args, **kwargs):
        return MockDocumentLayout()

    # Replace both layout entry points with the stub.
    for entry_point in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, entry_point, fake_layout)

    result = pdf._partition_pdf_or_image_local(filename, file, is_image=True)
    assert result[0].text == "Charlie Brown and the Great Pumpkin"
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
def test_partition_image_local_raises_with_no_filename():
    """An empty filename with no file is expected to raise FileNotFoundError."""
    no_source = {"filename": "", "file": None}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(is_image=True, **no_source)
|
|
|
|
|
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
def test_partition_image_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """With ``strategy="auto"`` the first long ``Title`` element is the paper's title."""
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    elements = image.partition_image(filename=filename, strategy="auto")
    # Keep only Title elements whose text splits into more than ten pieces on spaces.
    long_titles = [
        element
        for element in elements
        if element.category == "Title" and len(element.text.split(" ")) > 10
    ]
    assert long_titles[0].text == expected_title
|
|
|
|
|
|
|
|
|
2023-07-27 13:33:36 -04:00
|
|
|
def test_partition_image_with_table_extraction(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """hi_res partitioning with table inference yields exactly one HTML table."""
    partition_kwargs = {"strategy": "hi_res", "infer_table_structure": True}
    elements = image.partition_image(filename=filename, **partition_kwargs)

    # Collect the truthy ``text_as_html`` values found in element metadata.
    html_candidates = (element.metadata.text_as_html for element in elements)
    html_tables = [html for html in html_candidates if html]

    assert len(html_tables) == 1
    assert "Layouts of history Japanese documents" in html_tables[0]
|
|
|
|
|
|
|
|
|
2023-04-21 09:41:26 -04:00
|
|
|
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
    """``ocr_languages`` given with a filename reaches ``process_file_with_model``."""
    languages = "eng+swe"
    patcher = mock.patch.object(layout, "process_file_with_model", mock.MagicMock())
    with patcher as mock_partition:
        image.partition_image(filename=filename, strategy="hi_res", ocr_languages=languages)

    forwarded_kwargs = mock_partition.call_args.kwargs
    assert forwarded_kwargs.get("ocr_languages") == "eng+swe"
|
|
|
|
|
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
def test_partition_image_from_file_with_language_passed(
    filename="example-docs/example.jpg",
):
    """``ocr_languages`` given with a file object reaches ``process_data_with_model``."""
    model_stub = mock.MagicMock()
    with mock.patch.object(layout, "process_data_with_model", model_stub) as mock_partition:
        with open(filename, "rb") as image_file:
            image.partition_image(file=image_file, strategy="hi_res", ocr_languages="eng+swe")

    forwarded_kwargs = mock_partition.call_args.kwargs
    assert forwarded_kwargs.get("ocr_languages") == "eng+swe"
|
|
|
|
|
|
|
|
|
2023-08-10 13:57:46 -07:00
|
|
|
# NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086
@pytest.mark.skip(reason="Current catching too many tesseract errors")
def test_partition_image_raises_with_invalid_language(
    filename="example-docs/example.jpg",
):
    """An unknown OCR language is expected to raise ``TesseractError``."""
    bad_language_kwargs = {"strategy": "hi_res", "ocr_languages": "fakeroo"}
    with pytest.raises(TesseractError):
        image.partition_image(filename=filename, **bad_language_kwargs)
|
2023-05-04 16:23:51 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_with_ocr_detects_korean():
    """``ocr_only`` with ``eng+kor`` yields the English title and the Korean greeting."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
    filename = os.path.join(*path_parts)
    ocr_kwargs = {"ocr_languages": "eng+kor", "strategy": "ocr_only"}
    elements = image.partition_image(filename=filename, **ocr_kwargs)

    assert elements[0] == Title("RULES AND INSTRUCTIONS")
    # Spaces are removed before checking the Korean prefix.
    korean_text = elements[3].text.replace(" ", "")
    assert korean_text.startswith("안녕하세요")
|
2023-05-04 16:23:51 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_with_ocr_detects_korean_from_file():
    """Same Korean OCR check as the filename variant, but passing an open file object."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
    filename = os.path.join(*path_parts)
    ocr_kwargs = {"ocr_languages": "eng+kor", "strategy": "ocr_only"}
    with open(filename, "rb") as image_file:
        elements = image.partition_image(file=image_file, **ocr_kwargs)

    assert elements[0] == Title("RULES AND INSTRUCTIONS")
    # Spaces are removed before checking the Korean prefix.
    korean_text = elements[3].text.replace(" ", "")
    assert korean_text.startswith("안녕하세요")
|
2023-05-04 16:23:51 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_raises_with_bad_strategy():
    """Passing ``strategy="fakeroo"`` is expected to raise ``ValueError``."""
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
    with pytest.raises(ValueError):
        image.partition_image(strategy="fakeroo", filename=filename)
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
2023-08-02 09:22:20 -07:00
|
|
|
def test_partition_image_default_strategy_hi_res():
    """With no ``strategy`` argument, the first element is the paper title with coordinates."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
    filename = os.path.join(*path_parts)
    with open(filename, "rb") as image_file:
        elements = image.partition_image(file=image_file)

    expected_text = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    leading_element = elements[0]
    assert leading_element.text == expected_text
    assert leading_element.metadata.coordinates is not None
|
|
|
|
|
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
def test_partition_image_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """``last_modified`` on the first element equals the patched date lookup's value."""
    patched_date = "2029-07-05T09:24:28"
    date_lookup = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(date_lookup, return_value=patched_date)

    elements = image.partition_image(filename=filename)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == patched_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """With ``strategy="hi_res"``, ``last_modified`` equals the patched date lookup's value.

    ``unstructured.partition.pdf.get_last_modified_date`` is patched to return a fixed
    timestamp, and the first element's metadata must carry that value.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    # Request hi_res via the ``strategy`` keyword, as the other tests in this file do.
    elements = image.partition_image(filename=filename, strategy="hi_res")

    assert elements[0].metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """An explicit ``metadata_last_modified`` is reported instead of the patched lookup value."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2009-07-05T09:24:28"
    date_lookup = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(date_lookup, return_value=patched_date)

    elements = image.partition_image(filename=filename, metadata_last_modified=custom_date)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == custom_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """With ``strategy="hi_res"``, an explicit ``metadata_last_modified`` is reported.

    ``unstructured.partition.pdf.get_last_modified_date`` is patched to return a different
    timestamp; the first element's metadata must carry the explicitly passed value.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    elements = image.partition_image(
        filename=filename,
        # Request hi_res via the ``strategy`` keyword, as the other tests in this file do.
        strategy="hi_res",
        metadata_last_modified=expected_last_modification_date,
    )

    assert elements[0].metadata.last_modified == expected_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_from_file_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object, ``last_modified`` equals the patched from-file lookup's value."""
    patched_date = "2029-07-05T09:24:28"
    date_lookup = "unstructured.partition.pdf.get_last_modified_date_from_file"
    mocker.patch(date_lookup, return_value=patched_date)

    with open(filename, "rb") as image_file:
        elements = image.partition_image(file=image_file)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == patched_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object with ``strategy="hi_res"``, ``last_modified`` is the patched value.

    ``unstructured.partition.pdf.get_last_modified_date_from_file`` is patched to return a
    fixed timestamp, and the first element's metadata must carry that value.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename, "rb") as f:
        # Request hi_res via the ``strategy`` keyword, as the other tests in this file do.
        elements = image.partition_image(file=f, strategy="hi_res")

    assert elements[0].metadata.last_modified == mocked_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_from_file_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object, an explicit ``metadata_last_modified`` is the value reported."""
    patched_date = "2029-07-05T09:24:28"
    custom_date = "2009-07-05T09:24:28"
    date_lookup = "unstructured.partition.pdf.get_last_modified_date_from_file"
    mocker.patch(date_lookup, return_value=patched_date)

    with open(filename, "rb") as image_file:
        elements = image.partition_image(file=image_file, metadata_last_modified=custom_date)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == custom_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object with ``strategy="hi_res"``, an explicit date is the value reported.

    ``unstructured.partition.pdf.get_last_modified_date_from_file`` is patched to return a
    different timestamp; the first element's metadata must carry the explicitly passed
    ``metadata_last_modified`` value.
    """
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f,
            metadata_last_modified=expected_last_modification_date,
            # Request hi_res via the ``strategy`` keyword, as the other tests in this file do.
            strategy="hi_res",
        )

    assert elements[0].metadata.last_modified == expected_last_modification_date
|