Don't instantiate an element with a coordinate system when there isn't a way to get its location (#913)

2025-12-26 06:36:06 +00:00 · 2023-07-10 21:47:41 -07:00 · 2023-07-10 21:47:41 -07:00 · 2635b0be07
commit 2635b0be07
parent b3936893b8
6 changed files with 62 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.8.1-dev1
+## 0.8.1-dev2

 ### Enhancements

@ -10,6 +10,7 @@

 * Fixed `auto` strategy detecting scanned documents as having extractable text and using the `fast` strategy, resulting in no output.
 * Fix list detection in MS Word documents.
+* Don't instantiate an element with a coordinate system when there isn't a way to get its location data.

 ## 0.8.0

--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -4,6 +4,9 @@ import zipfile

 import magic
 import pytest
+from PIL import Image
+from unstructured_inference.inference import layout
+from unstructured_inference.inference.layoutelement import LocationlessLayoutElement

 from unstructured.file_utils import filetype
 from unstructured.file_utils.filetype import (
@ -12,6 +15,7 @@ from unstructured.file_utils.filetype import (
    _is_text_file_a_csv,
    _is_text_file_a_json,
    detect_filetype,
+    document_to_element_list,
 )

 FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
@ -26,6 +30,29 @@ XLSX_MIME_TYPES = [
 ]


+class MockPageLayout(layout.PageLayout):
+    def __init__(self, number: int, image: Image):
+        self.number = number
+        self.image = image
+
+    @property
+    def elements(self):
+        return [
+            LocationlessLayoutElement(
+                type="Headline",
+                text="Charlie Brown and the Great Pumpkin",
+            ),
+        ]
+
+
+class MockDocumentLayout(layout.DocumentLayout):
+    @property
+    def pages(self):
+        return [
+            MockPageLayout(number=1, image=Image.new("1", (1, 1))),
+        ]
+
+
@pytest.mark.parametrize(
    ("file", "expected"),
    [
@ -436,3 +463,9 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):

    with open(filename, "rb") as f:
        assert detect_filetype(file=f) == FileType.CSV
+
+
+def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
+    layout_elem_absent_coordinates = MockDocumentLayout()
+    elements = document_to_element_list(layout_elem_absent_coordinates)
+    assert elements[0].metadata.coordinates is None
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.8.1-dev1"  # pragma: no cover
+__version__ = "0.8.1-dev2"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -451,7 +451,7 @@ def document_to_element_list(
    for i, page in enumerate(document.pages):
        page_elements: List[Element] = []
        for layout_element in page.elements:
-            if hasattr(page, "image"):
+            if hasattr(page, "image") and hasattr(layout_element, "coordinates"):
                image_format = page.image.format
                coordinate_system = PixelSpace(width=page.image.width, height=page.image.height)
            else:
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -326,12 +326,16 @@ def _process_pdfminer_pages(
                _text = clean_extra_whitespace(_text)
                if _text.strip():
                    text_segments.append(_text)
-                    element = element_from_text(_text)
                    coordinate_system = PixelSpace(
                        width=width,
                        height=height,
                    )
                    points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
+                    element = element_from_text(
+                        _text,
+                        coordinates=points,
+                        coordinate_system=coordinate_system,
+                    )
                    coordinates_metadata = CoordinatesMetadata(
                        points=points,
                        system=coordinate_system,
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@ -1,7 +1,8 @@
 import re
-from typing import IO, Callable, List, Optional
+from typing import IO, Callable, List, Optional, Tuple

 from unstructured.cleaners.core import clean_bullets, group_broken_paragraphs
+from unstructured.documents.coordinates import CoordinateSystem
 from unstructured.documents.elements import (
    Address,
    Element,
@ -143,14 +144,26 @@ def partition_text(
    return elements


-def element_from_text(text: str) -> Element:
+def element_from_text(
+    text: str,
+    coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
+    coordinate_system: Optional[CoordinateSystem] = None,
+) -> Element:
    if is_bulleted_text(text):
-        return ListItem(text=clean_bullets(text))
+        return ListItem(
+            text=clean_bullets(text),
+            coordinates=coordinates,
+            coordinate_system=coordinate_system,
+        )
    elif is_us_city_state_zip(text):
-        return Address(text=text)
+        return Address(text=text, coordinates=coordinates, coordinate_system=coordinate_system)
    elif is_possible_narrative_text(text):
-        return NarrativeText(text=text)
+        return NarrativeText(
+            text=text,
+            coordinates=coordinates,
+            coordinate_system=coordinate_system,
+        )
    elif is_possible_title(text):
-        return Title(text=text)
+        return Title(text=text, coordinates=coordinates, coordinate_system=coordinate_system)
    else:
-        return Text(text=text)
+        return Text(text=text, coordinates=coordinates, coordinate_system=coordinate_system)