mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-04 21:03:31 +00:00

Closes [#1859](https://github.com/Unstructured-IO/unstructured/issues/1859).

* **Fixes elements partitioned from an image file missing certain metadata.** Metadata for image files, like file type, was being handled differently from other file types. This caused a bug where other metadata, like the file name, was being missed. This change brings metadata handling for image files more in line with the handling for other file types, so that the file name and other metadata fields are captured.

Additionally:

* Added a test to verify that the filename is being captured in metadata
* Cleaned up `CHANGELOG.md` formatting

#### Testing:

The following produces output `None` on `main`, but outputs the filename `layout-parser-paper-fast.jpg` on this branch:

```python
from unstructured.partition.auto import partition

elements = partition("example-docs/layout-parser-paper-fast.jpg")
print(elements[0].metadata.filename)
```
586 lines
19 KiB
Python
586 lines
19 KiB
Python
import os
|
|
import pathlib
|
|
from unittest import mock
|
|
|
|
import pytest
|
|
from PIL import Image
|
|
from pytesseract import TesseractError
|
|
from unstructured_inference.inference import layout
|
|
|
|
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
|
from unstructured.chunking.title import chunk_by_title
|
|
from unstructured.partition import image, ocr, pdf
|
|
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
|
from unstructured.utils import only
|
|
|
|
# Resolved directory containing this test module; tests join it with relative segments to
# locate files under "example-docs".
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
|
|
|
|
|
class MockResponse:
    """Minimal stand-in for an HTTP response object.

    Holds a status code and a payload; ``json()`` hands the payload back unchanged.
    """

    def __init__(self, status_code, response):
        self.status_code, self.response = status_code, response

    def json(self):
        """Return the payload supplied at construction time."""
        payload = self.response
        return payload
|
|
|
|
|
|
def mock_healthy_get(url, **kwargs):
    """Fake GET handler: ignore the request and answer with status 200 and an empty body."""
    empty_payload = {}
    return MockResponse(status_code=200, response=empty_payload)
|
|
|
|
|
|
def mock_unhealthy_get(url, **kwargs):
    """Fake GET handler: ignore the request and answer with status 500 and an empty body."""
    empty_payload = {}
    return MockResponse(status_code=500, response=empty_payload)
|
|
|
|
|
|
def mock_unsuccessful_post(url, **kwargs):
    """Fake POST handler: ignore the request and answer with status 500 and an empty body."""
    empty_payload = {}
    return MockResponse(status_code=500, response=empty_payload)
|
|
|
|
|
|
def mock_successful_post(url, **kwargs):
    """Fake POST handler returning status 200 with a canned two-page payload.

    Each page carries a single ``Title`` element; the request arguments are ignored.
    """
    first_page = {
        "number": 0,
        "elements": [
            {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"},
        ],
    }
    second_page = {
        "number": 1,
        "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
    }
    payload = {"pages": [first_page, second_page]}
    return MockResponse(status_code=200, response=payload)
|
|
|
|
|
|
class MockPageLayout(layout.PageLayout):
    """Page-layout double that always exposes one fixed ``Title`` element.

    ``__init__`` does not call the parent initializer; it only records the page number and
    the image it was given.
    """

    def __init__(self, number: int, image: Image):
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a one-item list holding a hard-coded title element."""
        title_element = layout.LayoutElement.from_coords(
            type="Title",
            x1=0,
            y1=0,
            x2=2,
            y2=2,
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title_element]
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """Document-layout double whose ``pages`` is always a single ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a fresh one-page list; the page wraps a 1x1 one-bit image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=0, image=blank_image)
        return [only_page]
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "file"),
    [
        ("example-docs/example.jpg", None),
        (None, b"0000"),
    ],
)
def test_partition_image_local(monkeypatch, filename, file):
    """Local image partitioning returns the elements produced by the (stubbed) layout pipeline.

    Runs once with a filename and once with in-memory bytes. All model-inference and OCR entry
    points are replaced so the result comes solely from ``MockDocumentLayout``.
    """
    # Stub both the file-path and the in-memory variant of each stage. The original patched
    # ``process_data_with_ocr`` twice and never stubbed ``process_file_with_ocr``, which left
    # the filename case exercising the real OCR code.
    stubbed_entry_points = [
        (layout, "process_data_with_model"),
        (layout, "process_file_with_model"),
        (ocr, "process_data_with_ocr"),
        (ocr, "process_file_with_ocr"),
    ]
    for module, attribute in stubbed_entry_points:
        monkeypatch.setattr(
            module,
            attribute,
            lambda *args, **kwargs: MockDocumentLayout(),
        )

    partition_image_response = pdf._partition_pdf_or_image_local(
        filename,
        file,
        is_image=True,
    )
    assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin"
|
|
|
|
|
|
@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
def test_partition_image_local_raises_with_no_filename():
    """An empty filename with no file object is expected to raise ``FileNotFoundError``."""
    missing_source = {"filename": "", "file": None}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(is_image=True, **missing_source)
|
|
|
|
|
|
def test_partition_image_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """The ``auto`` strategy finds the paper title and reports a float detection probability."""
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    elements = image.partition_image(filename=filename, strategy="auto")

    # Only consider titles longer than ten space-separated words.
    long_titles = [
        element
        for element in elements
        if element.category == "Title" and len(element.text.split(" ")) > 10
    ]
    assert long_titles[0].text == expected_title

    class_prob = elements[3].metadata.detection_class_prob
    assert class_prob is not None
    assert isinstance(class_prob, float)
|
|
|
|
|
|
def test_partition_image_with_table_extraction(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """With ``infer_table_structure`` on, exactly one element carries an HTML table."""
    partition_options = {"strategy": "hi_res", "infer_table_structure": True}
    elements = image.partition_image(filename=filename, **partition_options)

    html_tables = []
    for element in elements:
        if element.metadata.text_as_html:
            html_tables.append(element.metadata.text_as_html)

    assert len(html_tables) == 1
    assert "<table><thead><th>" in html_tables[0]
|
|
|
|
|
|
def test_partition_image_with_multipage_tiff(
    filename="example-docs/layout-parser-paper-combined.tiff",
):
    """The last element of the combined TIFF is reported on page 2."""
    elements = image.partition_image(filename=filename, strategy="auto")
    last_element = elements[-1]
    assert last_element.metadata.page_number == 2
|
|
|
|
|
|
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
    """``ocr_languages`` given with a filename is forwarded to ``process_file_with_ocr``."""
    ocr_stub = mock.MagicMock()
    with mock.patch.object(ocr, "process_file_with_ocr", ocr_stub) as mock_partition:
        image.partition_image(
            filename=filename,
            strategy="hi_res",
            ocr_languages="eng+swe",
        )

    forwarded_kwargs = mock_partition.call_args.kwargs
    assert forwarded_kwargs.get("ocr_languages") == "eng+swe"
|
|
|
|
|
|
def test_partition_image_from_file_with_language_passed(
    filename="example-docs/example.jpg",
):
    """``ocr_languages`` given with a file object is forwarded to ``process_data_with_ocr``."""
    ocr_stub = mock.MagicMock()
    with mock.patch.object(ocr, "process_data_with_ocr", ocr_stub) as mock_partition:
        with open(filename, "rb") as f:
            image.partition_image(file=f, strategy="hi_res", ocr_languages="eng+swe")

    forwarded_kwargs = mock_partition.call_args.kwargs
    assert forwarded_kwargs.get("ocr_languages") == "eng+swe"
|
|
|
|
|
|
# NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086
@pytest.mark.skip(reason="Current catching too many tesseract errors")
def test_partition_image_raises_with_invalid_language(
    filename="example-docs/example.jpg",
):
    """An unknown OCR language code is expected to surface as a ``TesseractError``."""
    bad_language_options = {"strategy": "hi_res", "ocr_languages": "fakeroo"}
    with pytest.raises(TesseractError):
        image.partition_image(filename=filename, **bad_language_options)
|
|
|
|
|
|
def test_partition_image_with_ocr_detects_korean():
    """OCR-only partitioning with ``eng+kor`` reads both the English and the Korean text."""
    path_parts = ("..", "..", "..", "example-docs", "english-and-korean.png")
    filename = os.path.join(DIRECTORY, *path_parts)

    elements = image.partition_image(
        filename=filename,
        ocr_languages="eng+kor",
        strategy="ocr_only",
    )

    english_text = elements[0].text
    # Spaces are stripped before comparing the Korean greeting.
    korean_text = elements[3].text.replace(" ", "")
    assert english_text == "RULES AND INSTRUCTIONS"
    assert korean_text.startswith("안녕하세요")
|
|
|
|
|
|
def test_partition_image_with_ocr_detects_korean_from_file():
    """Same as the filename variant, but the image is supplied as an open binary file."""
    path_parts = ("..", "..", "..", "example-docs", "english-and-korean.png")
    filename = os.path.join(DIRECTORY, *path_parts)

    ocr_options = {"ocr_languages": "eng+kor", "strategy": "ocr_only"}
    with open(filename, "rb") as f:
        elements = image.partition_image(file=f, **ocr_options)

    english_text = elements[0].text
    # Spaces are stripped before comparing the Korean greeting.
    korean_text = elements[3].text.replace(" ", "")
    assert english_text == "RULES AND INSTRUCTIONS"
    assert korean_text.startswith("안녕하세요")
|
|
|
|
|
|
def test_partition_image_raises_with_bad_strategy():
    """An unrecognised ``strategy`` value raises ``ValueError``."""
    path_parts = ("..", "..", "..", "example-docs", "english-and-korean.png")
    filename = os.path.join(DIRECTORY, *path_parts)

    with pytest.raises(ValueError):
        image.partition_image(filename=filename, strategy="fakeroo")
|
|
|
|
|
|
def test_partition_image_default_strategy_hi_res():
    """Partitioning without an explicit strategy yields layout-detection metadata.

    The title element must carry coordinates and a float ``detection_class_prob``; when debug
    metadata is enabled, every element's ``detection_origin`` must be ``"image"``.
    """
    path_parts = ("..", "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
    filename = os.path.join(DIRECTORY, *path_parts)
    with open(filename, "rb") as f:
        elements = image.partition_image(file=f)

    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    title_element = elements[3]
    assert title_element.text == expected_title
    assert title_element.metadata.coordinates is not None
    assert title_element.metadata.detection_class_prob is not None
    assert isinstance(title_element.metadata.detection_class_prob, float)

    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        origins = {element.metadata.detection_origin for element in elements}
        assert origins == {"image"}
|
|
|
|
|
|
def test_partition_image_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """``last_modified`` comes from ``get_last_modified_date`` when partitioning by filename."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = image.partition_image(filename=filename)

    first_element = elements[0]
    assert first_element.metadata.last_modified == mocked_last_modification_date
|
|
|
|
|
|
def test_partition_image_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """With ``strategy="hi_res"``, ``last_modified`` comes from ``get_last_modified_date``."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    # The keyword was previously misspelled "stratefy", so the hi_res strategy named by this
    # test was never actually requested.
    elements = image.partition_image(filename=filename, strategy="hi_res")

    assert elements[0].metadata.last_modified == mocked_last_modification_date
|
|
|
|
|
|
def test_partition_image_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """An explicit ``metadata_last_modified`` wins over the detected modification date."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    elements = image.partition_image(
        filename=filename,
        metadata_last_modified=expected_last_modification_date,
    )

    first_element = elements[0]
    assert first_element.metadata.last_modified == expected_last_modification_date
|
|
|
|
|
|
def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """With ``strategy="hi_res"``, an explicit ``metadata_last_modified`` wins over detection."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    # The keyword was previously misspelled "stratefy", so the hi_res strategy named by this
    # test was never actually requested.
    elements = image.partition_image(
        filename=filename,
        strategy="hi_res",
        metadata_last_modified=expected_last_modification_date,
    )

    assert elements[0].metadata.last_modified == expected_last_modification_date
|
|
|
|
|
|
def test_partition_image_from_file_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For file objects, ``last_modified`` comes from ``get_last_modified_date_from_file``."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    with open(filename, "rb") as f:
        elements = image.partition_image(file=f)

    first_element = elements[0]
    assert first_element.metadata.last_modified == mocked_last_modification_date
|
|
|
|
|
|
def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """File input with ``strategy="hi_res"`` takes ``last_modified`` from the file helper."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename, "rb") as f:
        # The keyword was previously misspelled "stratefy", so the hi_res strategy named by
        # this test was never actually requested.
        elements = image.partition_image(file=f, strategy="hi_res")

    assert elements[0].metadata.last_modified == mocked_last_modification_date
|
|
|
|
|
|
def test_partition_image_from_file_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For file objects, an explicit ``metadata_last_modified`` wins over the detected date."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=mocked_last_modification_date)

    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f,
            metadata_last_modified=expected_last_modification_date,
        )

    first_element = elements[0]
    assert first_element.metadata.last_modified == expected_last_modification_date
|
|
|
|
|
|
def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """File input with ``strategy="hi_res"``: explicit ``metadata_last_modified`` wins."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    with open(filename, "rb") as f:
        # The keyword was previously misspelled "stratefy", so the hi_res strategy named by
        # this test was never actually requested.
        elements = image.partition_image(
            file=f,
            metadata_last_modified=expected_last_modification_date,
            strategy="hi_res",
        )

    assert elements[0].metadata.last_modified == expected_last_modification_date
|
|
|
|
|
|
def test_partition_msg_with_json():
    """Elements partitioned from an example JPEG survive a JSON round trip."""
    # NOTE(review): despite "msg" in the name, this exercises image partitioning.
    source_path = example_doc_path("layout-parser-paper-fast.jpg")
    elements = image.partition_image(source_path, strategy="auto")
    assert_round_trips_through_JSON(elements)
|
|
|
|
|
|
def test_partition_image_with_ocr_has_coordinates_from_filename(
    filename="example-docs/english-and-korean.png",
):
    """The first OCR-only element has the expected (integer-truncated) corner points."""
    elements = image.partition_image(filename=filename, strategy="ocr_only")

    corner_points = elements[0].metadata.coordinates.points
    int_coordinates = []
    for x, y in corner_points:
        int_coordinates.append((int(x), int(y)))

    expected_corners = [(14, 36), (14, 16), (381, 16), (381, 36)]
    assert int_coordinates == expected_corners
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename"),
    [
        ("example-docs/layout-parser-paper-with-table.jpg"),
        ("example-docs/english-and-korean.png"),
        ("example-docs/layout-parser-paper-fast.jpg"),
    ],
)
def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
    filename,
):
    """No OCR-only element with text has a NaN coordinate."""
    import math

    elements = image.partition_image(filename=filename, strategy="ocr_only")
    for element in elements:
        # TODO (jennings) One or multiple elements is an empty string
        # without coordinates. This should be fixed in a new issue
        if element.text:
            box = element.metadata.coordinates.points
            for point in box:
                # ``x is not math.nan`` is an identity test and is true for any NaN produced
                # by a computation, so it could never fail; ``math.isnan`` checks the value.
                assert not math.isnan(point[0])
                assert not math.isnan(point[1])
|
|
|
|
|
|
def test_partition_image_formats_languages_for_tesseract():
    """``languages=["jpn_vert"]`` reaches ``process_file_with_ocr`` as ``ocr_languages``."""
    filename = "example-docs/jpn-vert.jpeg"
    patch_target = "unstructured.partition.ocr.process_file_with_ocr"
    with mock.patch(patch_target) as mock_process_file_with_ocr:
        image.partition_image(filename=filename, strategy="hi_res", languages=["jpn_vert"])
        # Inspect the keyword arguments of the first recorded call.
        first_call_kwargs = mock_process_file_with_ocr.call_args_list[0].kwargs
        assert "ocr_languages" in first_call_kwargs
        assert first_call_kwargs["ocr_languages"] == "jpn_vert"
|
|
|
|
|
|
def test_partition_image_warns_with_ocr_languages(caplog):
    """Passing ``ocr_languages`` logs the deprecation notice."""
    expected_notice = "The ocr_languages kwarg will be deprecated"
    filename = "example-docs/layout-parser-paper-fast.jpg"
    image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng")
    captured_log = caplog.text
    assert expected_notice in captured_log
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_image(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """``chunking_strategy="by_title"`` matches calling ``chunk_by_title`` on plain output."""
    plain_elements = image.partition_image(filename=filename)
    chunked_by_partition = image.partition_image(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(plain_elements)

    assert chunked_by_partition != plain_elements
    assert chunked_by_partition == chunked_manually
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_image_hi_res(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """Same chunking equivalence check, using ``hi_res`` with table-structure inference."""
    hi_res_options = {"strategy": "hi_res", "infer_table_structure": True}

    plain_elements = image.partition_image(filename=filename, **hi_res_options)
    chunked_by_partition = image.partition_image(
        filename,
        chunking_strategy="by_title",
        **hi_res_options,
    )
    chunked_manually = chunk_by_title(plain_elements)

    assert chunked_by_partition != plain_elements
    assert chunked_by_partition == chunked_manually
|
|
|
|
|
|
def test_partition_image_uses_model_name():
    """``model_name`` passed to ``partition_image`` reaches ``_partition_pdf_or_image_local``."""
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
    ) as mockpartition:
        image.partition_image("example-docs/layout-parser-paper-fast.jpg", model_name="test")

    forwarded_kwargs = mockpartition.call_args.kwargs
    assert "model_name" in forwarded_kwargs
    # Check the forwarded value itself; a truthiness check would accept any non-empty
    # model name.
    assert forwarded_kwargs["model_name"] == "test"
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("ocr_mode", "idx_title_element"),
    [
        ("entire_page", 3),
        ("individual_blocks", 1),
    ],
)
def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
    """Each OCR mode places the paper title at its expected element index."""
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    elements = image.partition_image(
        filename="example-docs/layout-parser-paper-fast.jpg",
        ocr_mode=ocr_mode,
        strategy="hi_res",
    )
    # Note(yuming): idx_title_element is different based on xy-cut and ocr mode
    title_element = elements[idx_title_element]
    assert title_element.text == expected_title
|
|
|
|
|
|
def test_partition_image_hi_res_invalid_ocr_mode():
    """An unrecognised ``ocr_mode`` raises ``ValueError``."""
    invalid_options = {"ocr_mode": "invalid_ocr_mode", "strategy": "hi_res"}
    with pytest.raises(ValueError):
        image.partition_image(
            filename="example-docs/layout-parser-paper-fast.jpg",
            **invalid_options,
        )
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("ocr_mode"),
    [
        ("entire_page"),
        ("individual_blocks"),
    ],
)
def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
    """Both OCR modes yield exactly one HTML table containing the expected cell text."""
    elements = image.partition_image(
        filename="example-docs/layout-parser-paper-with-table.jpg",
        ocr_mode=ocr_mode,
        strategy="hi_res",
        infer_table_structure=True,
    )

    html_tables = []
    for element in elements:
        if element.metadata.text_as_html:
            html_tables.append(element.metadata.text_as_html)
    assert len(html_tables) == 1

    expected_fragments = (
        "<table><thead><th>",
        "Layouts of history Japanese documents",
        "Layouts of scanned modern magazines and scientific reports",
    )
    for fragment in expected_fragments:
        assert fragment in html_tables[0]
|
|
|
|
|
|
def test_partition_image_raises_TypeError_for_invalid_languages():
    """Passing ``languages`` as a plain string instead of a list raises ``TypeError``."""
    invalid_options = {"strategy": "hi_res", "languages": "eng"}
    with pytest.raises(TypeError):
        image.partition_image(
            filename="example-docs/layout-parser-paper-fast.jpg",
            **invalid_options,
        )
|
|
|
|
|
|
@pytest.fixture()
def inference_results():
    """Build a one-page ``DocumentLayout`` with a single "hello" element.

    The page image is a ``MagicMock`` whose ``format`` attribute is ``"JPEG"``.
    """
    fake_image = mock.MagicMock(format="JPEG")
    page_region = layout.TextRegion.from_coords(0, 0, 600, 800, text="hello")
    page = layout.PageLayout(number=1, image=fake_image, layout=page_region)
    hello_element = layout.LayoutElement.from_coords(0, 0, 600, 800, text="hello")
    page.elements = [hello_element]
    return layout.DocumentLayout(pages=[page])
|
|
|
|
|
|
def test_partition_image_has_filename(inference_results):
    """The filename given to ``partition_image`` is kept in element metadata.

    Inference is replaced by the ``inference_results`` fixture so the output is known.
    """
    doc_path = "example-docs"
    filename = "layout-parser-paper-fast.jpg"
    full_path = os.path.join(doc_path, filename)
    inference_target = "unstructured_inference.inference.layout.process_file_with_model"

    # Mock inference call with known return results
    with mock.patch(inference_target, return_value=inference_results) as mock_inference_func:
        elements = image.partition_image(filename=full_path, strategy="hi_res")

    # Make sure we actually went down the path we expect.
    mock_inference_func.assert_called_once()
    # Unpack element but also make sure there is only one
    element = only(elements)
    metadata = element.metadata
    # This makes sure we are still getting the filetype metadata (should be translated from
    # the fixtures)
    assert metadata.filetype == "JPEG"
    # This should be kept from the filename we originally gave
    assert metadata.filename == filename
|