Don't instantiate an element with a coordinate system when there isn't a way to get its location (#913)

This commit is contained in:
Emily Chen 2023-07-10 21:47:41 -07:00 committed by GitHub
parent b3936893b8
commit 2635b0be07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 62 additions and 11 deletions

View File

@ -1,4 +1,4 @@
## 0.8.1-dev1
## 0.8.1-dev2
### Enhancements
@ -10,6 +10,7 @@
* Fixed `auto` strategy detecting scanned documents as having extractable text and using the `fast` strategy, resulting in no output.
* Fix list detection in MS Word documents.
* Don't instantiate an element with a coordinate system when there isn't a way to get its location data.
## 0.8.0

View File

@ -4,6 +4,9 @@ import zipfile
import magic
import pytest
from PIL import Image
from unstructured_inference.inference import layout
from unstructured_inference.inference.layoutelement import LocationlessLayoutElement
from unstructured.file_utils import filetype
from unstructured.file_utils.filetype import (
@ -12,6 +15,7 @@ from unstructured.file_utils.filetype import (
_is_text_file_a_csv,
_is_text_file_a_json,
detect_filetype,
document_to_element_list,
)
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
@ -26,6 +30,29 @@ XLSX_MIME_TYPES = [
]
class MockPageLayout(layout.PageLayout):
    """Page layout stand-in whose only element is a ``LocationlessLayoutElement``."""

    # ``Image`` is the PIL module here (``from PIL import Image``); the image
    # class itself is ``Image.Image``, which is what the annotation must name.
    def __init__(self, number: int, image: Image.Image):
        # The parent initializer is intentionally not called: only the two
        # attributes assigned below are populated on the mock.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return a one-item list containing a ``Headline`` layout element."""
        return [
            LocationlessLayoutElement(
                type="Headline",
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stand-in that exposes exactly one mock page."""

    @property
    def pages(self):
        """Return a one-item list: page number 1 backed by a 1x1 mode-"1" image."""
        blank_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=1, image=blank_image)
        return [only_page]
@pytest.mark.parametrize(
("file", "expected"),
[
@ -436,3 +463,9 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
with open(filename, "rb") as f:
assert detect_filetype(file=f) == FileType.CSV
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """The first element converted from the mock document has no coordinates metadata."""
    document = MockDocumentLayout()
    converted = document_to_element_list(document)
    first_element = converted[0]
    assert first_element.metadata.coordinates is None

View File

@ -1 +1 @@
__version__ = "0.8.1-dev1" # pragma: no cover
__version__ = "0.8.1-dev2" # pragma: no cover

View File

@ -451,7 +451,7 @@ def document_to_element_list(
for i, page in enumerate(document.pages):
page_elements: List[Element] = []
for layout_element in page.elements:
if hasattr(page, "image"):
if hasattr(page, "image") and hasattr(layout_element, "coordinates"):
image_format = page.image.format
coordinate_system = PixelSpace(width=page.image.width, height=page.image.height)
else:

View File

@ -326,12 +326,16 @@ def _process_pdfminer_pages(
_text = clean_extra_whitespace(_text)
if _text.strip():
text_segments.append(_text)
element = element_from_text(_text)
coordinate_system = PixelSpace(
width=width,
height=height,
)
points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
element = element_from_text(
_text,
coordinates=points,
coordinate_system=coordinate_system,
)
coordinates_metadata = CoordinatesMetadata(
points=points,
system=coordinate_system,

View File

@ -1,7 +1,8 @@
import re
from typing import IO, Callable, List, Optional
from typing import IO, Callable, List, Optional, Tuple
from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
Address,
Element,
@ -143,14 +144,26 @@ def partition_text(
return elements
def element_from_text(text: str) -> Element:
def element_from_text(
text: str,
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
coordinate_system: Optional[CoordinateSystem] = None,
) -> Element:
if is_bulleted_text(text):
return ListItem(text=clean_bullets(text))
return ListItem(
text=clean_bullets(text),
coordinates=coordinates,
coordinate_system=coordinate_system,
)
elif is_us_city_state_zip(text):
return Address(text=text)
return Address(text=text, coordinates=coordinates, coordinate_system=coordinate_system)
elif is_possible_narrative_text(text):
return NarrativeText(text=text)
return NarrativeText(
text=text,
coordinates=coordinates,
coordinate_system=coordinate_system,
)
elif is_possible_title(text):
return Title(text=text)
return Title(text=text, coordinates=coordinates, coordinate_system=coordinate_system)
else:
return Text(text=text)
return Text(text=text, coordinates=coordinates, coordinate_system=coordinate_system)