John de4d496fcf
Fix bbox coordinates for ocr_only strategy (#1325)
### Summary
Duplicate PR of #1259 because of issues with checks
Closes #1227, which found that `nan` values were present in the
coordinates being generated for some elements.
This breaks logic out from `add_pytesseract_bbox_to_elements` to new
functions `_get_element_box` and
`convert_multiple_coordinates_to_new_system`. It also updates the logic
to check that the current bounding box matches the first character of
the element's text (so as to avoid the `~` characters that
`pytesseract.image_to_boxes` includes but that are not present in
`pytesseract.image_to_string`).

### Testing
```
from unstructured.partition.image import partition_image
from PIL import Image, ImageDraw

filename="example-docs/layout-parser-paper-with-table.jpg"
elements = partition_image(filename=filename, strategy="ocr_only")
image = Image.open(filename)
draw = ImageDraw.Draw(image)
for i, element in enumerate(elements):
    print(i, element.metadata.coordinates)
    if element.metadata.coordinates:
        draw.polygon(element.metadata.coordinates.points, outline="red", width=2)
output = "example-docs/box-layout-parser-paper-with-table.jpg"
image.save(output)
image.close()
```

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
2023-09-15 15:11:16 -05:00

295 lines
11 KiB
Python

import os
from tempfile import SpooledTemporaryFile
import docx
import pytest
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
Address,
ListItem,
NarrativeText,
Text,
Title,
)
from unstructured.partition.common import convert_office_doc
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import partition_docx
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_to_json
@pytest.fixture()
def mock_document():
    """Build an in-memory docx document used as the source for the .doc tests.

    The paragraphs mix headings, bullets, empty paragraphs, prose, a bare
    year, and an address so the partitioner's element classification can be
    compared against the ``expected_elements`` fixture.
    """
    document = docx.Document()

    # (text, style) pairs, added in document order.
    styled_paragraphs = [
        ("These are a few of my favorite things:", "Heading 1"),
        # The leading bullet character should make this a list item.
        ("• Parrots", "Normal"),
        # Empty paragraph: expected to be dropped.
        ("", "Normal"),
        ("Hockey", "List Bullet"),
        # Empty list bullet: expected to be dropped.
        ("", "List Bullet"),
        # Short standalone text: expected to be picked up as a title.
        ("Analysis", "Normal"),
        # Empty paragraph: expected to be dropped.
        ("", "Normal"),
        # Multi-sentence prose: expected to be narrative text.
        ("This is my first thought. This is my second thought.", "Normal"),
        ("This is my third thought.", "Body Text"),
    ]
    for text, style in styled_paragraphs:
        document.add_paragraph(text, style=style)

    # These two use the document's default paragraph style.
    # Expected to be plain text.
    document.add_paragraph("2023")
    # Expected to be recognized as an address.
    document.add_paragraph("DOYLESTOWN, PA 18901")

    return document
@pytest.fixture()
def expected_elements():
    """Return the elements the tests expect from partitioning ``mock_document``."""
    # (element class, text) in the order the elements should be produced.
    element_spec = (
        (Title, "These are a few of my favorite things:"),
        (ListItem, "Parrots"),
        (ListItem, "Hockey"),
        (Title, "Analysis"),
        (NarrativeText, "This is my first thought. This is my second thought."),
        (NarrativeText, "This is my third thought."),
        (Text, "2023"),
        (Address, "DOYLESTOWN, PA 18901"),
    )
    return [element_cls(text) for element_cls, text in element_spec]
def test_partition_doc_from_filename(mock_document, expected_elements, tmpdir, capsys):
    """Partition a .doc by filename and check elements, file metadata, and quiet output.

    The mock document is saved as .docx, passed to ``convert_office_doc`` with
    the "doc" target, and the resulting .doc path is partitioned.
    """
    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, tmpdir.dirname, "doc")

    elements = partition_doc(filename=doc_filename)

    assert elements == expected_elements
    assert elements[0].metadata.filename == "mock_document.doc"
    assert elements[0].metadata.file_directory == tmpdir.dirname

    # readouterr() returns the captured text AND resets the buffers, so both
    # streams must be checked from a single snapshot; calling it a second
    # time would always yield "" and make the stderr check meaningless.
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""
def test_partition_doc_from_filename_with_metadata_filename(
    mock_document,
    expected_elements,
    tmpdir,
):
    """A ``metadata_filename`` override shows up on every element partitioned by filename."""
    work_dir = tmpdir.dirname
    source_docx = os.path.join(work_dir, "mock_document.docx")
    converted_doc = os.path.join(work_dir, "mock_document.doc")
    mock_document.save(source_docx)
    convert_office_doc(source_docx, work_dir, "doc")

    elements = partition_doc(filename=converted_doc, metadata_filename="test")

    assert elements == expected_elements
    for element in elements:
        assert element.metadata.filename == "test"
def test_partition_doc_matches_partition_docx(mock_document, expected_elements, tmpdir):
    """Partitioning the converted .doc yields the same result as the source .docx."""
    work_dir = tmpdir.dirname
    source_docx = os.path.join(work_dir, "mock_document.docx")
    converted_doc = os.path.join(work_dir, "mock_document.doc")
    mock_document.save(source_docx)
    convert_office_doc(source_docx, work_dir, "doc")

    from_doc = partition_doc(filename=converted_doc)
    from_docx = partition_docx(filename=source_docx)
    assert from_doc == from_docx
def test_partition_raises_with_missing_doc(mock_document, expected_elements, tmpdir):
    """Partitioning a path where no .doc was ever written raises ValueError."""
    # Nothing is saved or converted here, so this path does not exist.
    missing_path = os.path.join(tmpdir.dirname, "asdf.doc")
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_doc(filename=missing_path)
def test_partition_doc_from_file_with_filter(mock_document, expected_elements, tmpdir, capsys):
    """Partition a .doc from a file object with an explicit ``libre_office_filter``.

    Checks the elements, that nothing was written to stdout/stderr, and that
    no filename is recorded in the metadata when only a file object is given.
    """
    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, tmpdir.dirname, "doc")

    with open(doc_filename, "rb") as f:
        elements = partition_doc(file=f, libre_office_filter="MS Word 2007 XML")

    assert elements == expected_elements

    # readouterr() resets the capture buffers on every call, so take one
    # snapshot and check both streams from it; a second call would always
    # see an empty stderr.
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""

    for element in elements:
        assert element.metadata.filename is None
def test_partition_doc_from_file_with_no_filter(mock_document, expected_elements, tmpdir, capsys):
    """Partition a .doc from a file object with ``libre_office_filter=None``.

    Checks the elements, that nothing was written to stdout/stderr, and that
    no filename is recorded in the metadata when only a file object is given.
    """
    docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
    mock_document.save(docx_filename)
    convert_office_doc(docx_filename, tmpdir.dirname, "doc")

    with open(doc_filename, "rb") as f:
        elements = partition_doc(file=f, libre_office_filter=None)

    assert elements == expected_elements

    # readouterr() resets the capture buffers on every call, so take one
    # snapshot and check both streams from it; a second call would always
    # see an empty stderr.
    captured = capsys.readouterr()
    assert captured.out == ""
    assert captured.err == ""

    for element in elements:
        assert element.metadata.filename is None
def test_partition_doc_from_file_with_metadata_filename(mock_document, tmpdir):
    """A ``metadata_filename`` override shows up on every element partitioned from a file object."""
    work_dir = tmpdir.dirname
    source_docx = os.path.join(work_dir, "mock_document.docx")
    converted_doc = os.path.join(work_dir, "mock_document.doc")
    mock_document.save(source_docx)
    convert_office_doc(source_docx, work_dir, "doc")

    with open(converted_doc, "rb") as doc_file:
        elements = partition_doc(file=doc_file, metadata_filename="test")

    assert all(element.metadata.filename == "test" for element in elements)
def test_partition_doc_raises_with_both_specified(mock_document, tmpdir):
    """Passing both ``filename`` and ``file`` to partition_doc raises ValueError."""
    work_dir = tmpdir.dirname
    source_docx = os.path.join(work_dir, "mock_document.docx")
    converted_doc = os.path.join(work_dir, "mock_document.doc")
    mock_document.save(source_docx)
    convert_office_doc(source_docx, work_dir, "doc")

    with open(converted_doc, "rb") as doc_file:
        with pytest.raises(ValueError):
            partition_doc(filename=converted_doc, file=doc_file)
def test_partition_doc_raises_with_neither():
    """Calling partition_doc with no arguments at all raises ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_doc()
def test_partition_doc_from_file_exclude_metadata(mock_document, tmpdir):
    """With ``include_metadata=False`` the first element from a file object has blank metadata."""
    work_dir = tmpdir.dirname
    source_docx = os.path.join(work_dir, "mock_document.docx")
    converted_doc = os.path.join(work_dir, "mock_document.doc")
    mock_document.save(source_docx)
    convert_office_doc(source_docx, work_dir, "doc")

    with open(converted_doc, "rb") as doc_file:
        elements = partition_doc(file=doc_file, include_metadata=False)

    first_metadata = elements[0].metadata
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
def test_partition_doc_from_filename_exclude_metadata(mock_document, tmpdir):
    """With ``include_metadata=False`` the first element from a filename has blank metadata."""
    work_dir = tmpdir.dirname
    source_docx = os.path.join(work_dir, "mock_document.docx")
    converted_doc = os.path.join(work_dir, "mock_document.doc")
    mock_document.save(source_docx)
    convert_office_doc(source_docx, work_dir, "doc")

    elements = partition_doc(filename=converted_doc, include_metadata=False)

    first_metadata = elements[0].metadata
    assert first_metadata.filetype is None
    assert first_metadata.page_name is None
    assert first_metadata.filename is None
def test_partition_doc_metadata_date(
    mocker,
    filename="example-docs/fake.doc",
):
    """``last_modified`` comes from the patched ``get_last_modified_date`` when partitioning by filename."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.doc.get_last_modified_date",
        return_value=patched_date,
    )

    elements = partition_doc(filename=filename)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == patched_date
def test_partition_doc_metadata_date_with_custom_metadata(
    mocker,
    filename="example-docs/fake.doc",
):
    """An explicit ``metadata_last_modified`` wins over the patched file date (by filename)."""
    patched_date = "2029-07-05T09:24:28"
    override_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.doc.get_last_modified_date",
        return_value=patched_date,
    )

    elements = partition_doc(filename=filename, metadata_last_modified=override_date)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == override_date
def test_partition_doc_from_file_metadata_date(
    mocker,
    filename="example-docs/fake.doc",
):
    """``last_modified`` comes from the patched ``get_last_modified_date_from_file`` for file objects."""
    patched_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.doc.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename, "rb") as doc_file:
        elements = partition_doc(file=doc_file)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == patched_date
def test_partition_doc_from_file_metadata_date_with_custom_metadata(
    mocker,
    filename="example-docs/fake.doc",
):
    """An explicit ``metadata_last_modified`` wins over the patched file date (file object)."""
    patched_date = "2029-07-05T09:24:28"
    override_date = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.doc.get_last_modified_date_from_file",
        return_value=patched_date,
    )

    with open(filename, "rb") as doc_file:
        elements = partition_doc(file=doc_file, metadata_last_modified=override_date)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == override_date
@pytest.mark.xfail(reason="handling of last_modified for file vs. filename to be refined later")
def test_partition_doc_from_file_without_metadata_date(
    filename="example-docs/fake.doc",
):
    """Test partition_doc() with a file from which no last-modified date can be read.

    The document bytes are copied into a ``SpooledTemporaryFile`` and that
    object is partitioned with an explicit ``metadata_date``. The test is
    currently marked as an expected failure.
    """
    # Manage the spooled file with ``with`` so it is closed even when
    # partition_doc raises (the test is xfail); previously it was never closed.
    with open(filename, "rb") as f, SpooledTemporaryFile() as sf:
        sf.write(f.read())
        # Rewind so partition_doc reads from the start of the copied bytes.
        sf.seek(0)
        elements = partition_doc(file=sf, metadata_date="2020-07-05")

    assert elements[0].metadata.date == "2020-07-05"
def test_partition_doc_with_json(mock_document, expected_elements, tmpdir, capsys):
    """Elements survive a round trip through ``elements_to_json`` and ``partition_json``."""
    work_dir = tmpdir.dirname
    source_docx = os.path.join(work_dir, "mock_document.docx")
    converted_doc = os.path.join(work_dir, "mock_document.doc")
    mock_document.save(source_docx)
    convert_office_doc(source_docx, work_dir, "doc")

    elements = partition_doc(filename=converted_doc)
    serialized = elements_to_json(elements)
    test_elements = partition_json(text=serialized)

    assert len(elements) == len(test_elements)
    assert elements[0].metadata.filename == test_elements[0].metadata.filename

    # Lengths were asserted equal above, so pairing with zip covers every index.
    for original, round_tripped in zip(elements, test_elements):
        assert original == round_tripped
def test_add_chunking_strategy_on_partition_doc(filename="example-docs/fake.doc"):
    """``chunking_strategy="by_title"`` matches calling ``chunk_by_title`` on the plain result."""
    chunked_by_partition = partition_doc(filename, chunking_strategy="by_title")
    plain_elements = partition_doc(filename)
    chunked_manually = chunk_by_title(plain_elements)

    # Chunking must change the output, and must change it the same way.
    assert chunked_by_partition != plain_elements
    assert chunked_by_partition == chunked_manually