2023-08-10 16:28:57 -07:00
|
|
|
import pytest
|
2023-08-24 17:46:19 -07:00
|
|
|
from PIL import Image
|
|
|
|
from unstructured_inference.inference import layout
|
2023-01-19 09:29:28 -05:00
|
|
|
from unstructured_inference.inference.layout import LayoutElement
|
2023-08-24 17:46:19 -07:00
|
|
|
from unstructured_inference.inference.layoutelement import LocationlessLayoutElement
|
2023-01-19 09:29:28 -05:00
|
|
|
|
2023-07-05 11:25:11 -07:00
|
|
|
from unstructured.documents.coordinates import PixelSpace
|
2023-01-20 08:55:11 -05:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
CheckBox,
|
|
|
|
FigureCaption,
|
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
|
|
|
Text,
|
|
|
|
Title,
|
|
|
|
)
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured.partition import common
|
2023-08-24 17:46:19 -07:00
|
|
|
from unstructured.partition.common import (
|
|
|
|
_get_page_image_metadata,
|
|
|
|
contains_emoji,
|
|
|
|
document_to_element_list,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
class MockPageLayout(layout.PageLayout):
    """Minimal stand-in for ``layout.PageLayout`` used by the tests in this module.

    The initializer does not call the parent initializer; it only stores the
    page number and the page image it is handed.
    """

    def __init__(self, number: int, image: Image.Image):
        # ``Image`` is the imported PIL module; image objects are ``Image.Image``.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return one layout element built from only a type and a text."""
        return [
            LocationlessLayoutElement(
                type="Headline",
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """Stand-in for ``layout.DocumentLayout`` that exposes exactly one mock page."""

    @property
    def pages(self):
        """Return a one-item list holding page number 1 backed by a 1x1, mode "1" image."""
        tiny_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=1, image=tiny_image)
        return [only_page]
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_dict():
    """A dict typed "Title" is expected to normalize to an equal ``Title`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Title",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "coordinate_system": None,
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = Title(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_dict_caption():
    """A dict typed "Figure" is expected to normalize to a ``FigureCaption``."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Figure",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = FigureCaption(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
|
|
|
|
|
|
|
|
2023-02-28 10:36:08 -05:00
|
|
|
def test_normalize_layout_element_dict_figure_caption():
    """A dict typed "FigureCaption" is expected to normalize to a ``FigureCaption``."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "FigureCaption",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = FigureCaption(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
|
|
|
|
|
|
|
|
2023-01-19 09:29:28 -05:00
|
|
|
def test_normalize_layout_element_dict_misc():
    """A dict typed "Misc" is expected to normalize to a plain ``Text`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Misc",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    expected = Text(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_layout_element():
    """A ``LayoutElement`` typed "Text" is expected to normalize to ``NarrativeText``.

    The expected coordinates are the four corner points derived from the
    (x1, y1, x2, y2) box given to the layout element.
    """
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement(
        type="Text",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="Some lovely text",
    )

    normalized = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_layout_element_narrative_text():
    """A ``LayoutElement`` typed "NarrativeText" is expected to stay ``NarrativeText``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement(
        type="NarrativeText",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="Some lovely text",
    )

    normalized = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_checked_box():
    """A ``LayoutElement`` typed "Checked" is expected to become a checked ``CheckBox``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement(
        type="Checked",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="",
    )

    normalized = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = CheckBox(
        checked=True,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_unchecked_box():
    """A ``LayoutElement`` typed "Unchecked" is expected to become an unchecked ``CheckBox``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement(
        type="Unchecked",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="",
    )

    normalized = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected = CheckBox(
        checked=False,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected
|
2023-01-20 08:55:11 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_enumerated_list():
    """A numbered "List" element is expected to split into one ``ListItem`` per entry.

    Every expected item carries the same corner coordinates as the source box.
    """
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement(
        type="List",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="1. I'm so cool! 2. You're cool too. 3. We're all cool!",
    )

    normalized = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected_texts = ("I'm so cool!", "You're cool too.", "We're all cool!")
    expected = [
        ListItem(
            text=item_text,
            coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
            coordinate_system=pixel_space,
        )
        for item_text in expected_texts
    ]
    assert normalized == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_bulleted_list():
    """A "*"-bulleted "List" element is expected to split into one ``ListItem`` per bullet.

    Every expected item carries the same corner coordinates as the source box.
    """
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement(
        type="List",
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="* I'm so cool! * You're cool too. * We're all cool!",
    )

    normalized = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    expected_texts = ("I'm so cool!", "You're cool too.", "We're all cool!")
    expected = [
        ListItem(
            text=item_text,
            coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
            coordinate_system=pixel_space,
        )
        for item_text in expected_texts
    ]
    assert normalized == expected
|
2023-06-08 12:33:06 -04:00
|
|
|
|
|
|
|
|
|
|
|
class MockPopenWithError:
    """Stand-in for ``subprocess.Popen`` whose ``communicate`` always reports an error."""

    def __init__(self, *args, **kwargs):
        # Accept and ignore whatever arguments a caller would pass to Popen.
        pass

    def communicate(self, input=None, timeout=None):
        """Return ``(stdout, stderr)`` as bytes: empty stdout and a fixed error message.

        ``input`` and ``timeout`` mirror the ``subprocess.Popen.communicate``
        signature so callers that pass them keep working; both are ignored.
        """
        return b"", b"an error occurred"
|
|
|
|
|
|
|
|
|
|
|
|
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
    """With ``subprocess.Popen`` replaced by the failing mock, the error text should be logged."""
    import subprocess

    monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)

    common.convert_office_doc(
        "no-real.docx",
        "fake-directory",
        target_format="docx",
    )

    logged_output = caplog.text
    assert "an error occurred" in logged_output
|
2023-07-27 11:07:27 -04:00
|
|
|
|
|
|
|
|
|
|
|
class MockDocxEmptyTable:
    """Table stand-in whose ``rows`` attribute is an empty list."""

    def __init__(self):
        # A fresh list per instance, so mocks never share row storage.
        self.rows = list()
|
|
|
|
|
|
|
|
|
|
|
|
def test_convert_ms_office_table_to_text_works_with_empty_tables():
    """A table with no rows should convert to an empty string in both HTML and text modes."""
    empty_table = MockDocxEmptyTable()
    # Same order as before: HTML output first, then plain text.
    for html_flag in (True, False):
        assert common.convert_ms_office_table_to_text(empty_table, as_html=html_flag) == ""
|
2023-08-10 16:28:57 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
        ("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
    ],
)
def test_contains_emoji(text, expected):
    """``contains_emoji`` should return exactly ``True`` or ``False`` for each sample."""
    outcome = contains_emoji(text)
    # Identity check on purpose: a merely truthy/falsy result must not pass.
    assert outcome is expected
|
2023-08-24 17:46:19 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """The first element converted from the mock document should have no coordinates metadata."""
    document = MockDocumentLayout()

    converted = document_to_element_list(document)

    first_metadata = converted[0].metadata
    assert first_metadata.coordinates is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_get_page_image_metadata_and_coordinate_system():
    """``_get_page_image_metadata`` should return a dict for the mock document's first page."""
    first_page = MockDocumentLayout().pages[0]

    page_metadata = _get_page_image_metadata(first_page)

    assert isinstance(page_metadata, dict)
|