enhancement: include coords in fast (#626)

Makes the bounding box coordinates available when using the fast strategy.

* Refactored partition_text so that the workflow of categorizing an element purely from its text is available without running the entirety of partition_text.
* Transformed the coordinates from PDF space into pixel space to be consistent with hi_res. We will probably want to revisit the coordinate system soon.
2025-12-27 07:03:52 +00:00 · 2023-05-20 16:26:55 -05:00 · 2023-05-20 16:26:55 -05:00 · 55e5d8ea2f
commit 55e5d8ea2f
parent fda51d6ead
5 changed files with 46 additions and 23 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,7 +1,9 @@
-## 0.6.9-dev0
+## 0.6.9-dev1

 ### Enhancements

+* fast strategy for pdf now keeps element bounding box data
+
 ### Features

 ### Fixes
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -404,7 +404,15 @@ def test_partition_pdf_fast_groups_text_in_text_box():
    filename = os.path.join("example-docs", "chevron-page.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")

-    assert elements[0] == Title("eastern mediterranean")
+    assert elements[0] == Title(
+        "eastern mediterranean",
+        coordinates=(
+            (193.1741, 71.94000000000005),
+            (193.1741, 91.94000000000005),
+            (418.6881, 91.94000000000005),
+            (418.6881, 71.94000000000005),
+        ),
+    )

    assert isinstance(elements[1], NarrativeText)
    assert str(elements[1]).startswith("We")
@ -412,4 +420,10 @@ def test_partition_pdf_fast_groups_text_in_text_box():

    assert elements[3] == Title(
        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
+        coordinates=(
+            (69.4871, 222.4357),
+            (69.4871, 272.1607),
+            (197.8209, 272.1607),
+            (197.8209, 222.4357),
+        ),
    )
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.9-dev0"  # pragma: no cover
+__version__ = "0.6.9-dev1"  # pragma: no cover
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -22,7 +22,7 @@ from unstructured.partition.common import (
    spooled_to_bytes_io_if_needed,
 )
 from unstructured.partition.strategies import determine_pdf_or_image_strategy
-from unstructured.partition.text import partition_text
+from unstructured.partition.text import element_from_text, partition_text
 from unstructured.utils import requires_dependencies


@ -275,11 +275,16 @@ def _process_pdfminer_pages(

    for i, page in enumerate(extract_pages(fp)):  # type: ignore
        metadata = ElementMetadata(filename=filename, page_number=i + 1)
+        height = page.height

        text_segments = []
        for obj in page:
            # NOTE(robinson) - "Figure" is an example of an object type that does
            # not have a get_text method
+            x1, y2, x2, y1 = obj.bbox
+            y1 = height - y1
+            y2 = height - y2
+
            if not hasattr(obj, "get_text"):
                continue
            _text = obj.get_text()
@ -287,13 +292,10 @@ def _process_pdfminer_pages(
            _text = clean_extra_whitespace(_text)
            if _text.strip():
                text_segments.append(_text)
-
-        text = "\n\n".join(text_segments)
-
-        _elements = partition_text(text=text)
-        for element in _elements:
-            element.metadata = metadata
-            elements.append(element)
+                element = element_from_text(_text)
+                element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
+                element.metadata = metadata
+                elements.append(element)

        if include_page_breaks:
            elements.append(PageBreak())
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@ -90,17 +90,22 @@ def partition_text(
    for ctext in file_content:
        ctext = ctext.strip()

-        if ctext == "":
-            continue
-        if is_bulleted_text(ctext):
-            elements.append(ListItem(text=clean_bullets(ctext), metadata=metadata))
-        elif is_us_city_state_zip(ctext):
-            elements.append(Address(text=ctext, metadata=metadata))
-        elif is_possible_narrative_text(ctext):
-            elements.append(NarrativeText(text=ctext, metadata=metadata))
-        elif is_possible_title(ctext):
-            elements.append(Title(text=ctext, metadata=metadata))
-        else:
-            elements.append(Text(text=ctext, metadata=metadata))
+        if ctext:
+            element = element_from_text(ctext)
+            element.metadata = metadata
+            elements.append(element)

    return elements
+
+
+def element_from_text(text: str) -> Element:
+    if is_bulleted_text(text):
+        return ListItem(text=clean_bullets(text))
+    elif is_us_city_state_zip(text):
+        return Address(text=text)
+    elif is_possible_narrative_text(text):
+        return NarrativeText(text=text)
+    elif is_possible_title(text):
+        return Title(text=text)
+    else:
+        return Text(text=text)