Mirror of https://github.com/Unstructured-IO/unstructured.git
Synced 2025-10-10 15:44:31 +00:00
fix: group together text from the same bounding box in partition_pdf with fast strategy (#542)

* switch to using PDF objects
* linting, linting, linting
* couple more tweaks
* added test for chevron-page
* version and changelog
* linting, linting, linting
* now processing 4 files
This commit is contained in:
parent 7e43a25f07
commit aa01cdfc7a
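For callers, the practical effect is that text pdfminer places in a single bounding box now comes back from the fast strategy as one element instead of being split line by line. A minimal usage sketch, mirroring the new test added in this commit (it assumes the unstructured package at this commit and the example-docs/chevron-page.pdf file added below):

# Usage sketch only: assumes unstructured at this commit and the example PDF
# added in this commit are available locally.
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(filename="example-docs/chevron-page.pdf", strategy="fast")

# Each pdfminer text box is now emitted as a single element.
for element in elements[:4]:
    print(type(element).__name__, "->", str(element)[:60])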
CHANGELOG.md

@@ -1,4 +1,4 @@
-## 0.6.3-dev0
+## 0.6.3-dev1
 
 ### Enhancements
 
@@ -10,6 +10,9 @@
 
 ### Fixes
 
+* Updates the grouping logic in the `partition_pdf` fast strategy to group together text
+  in the same bounding box.
+
 ## 0.6.2
 
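The changelog entry above refers to the grouping step inside `_process_pdfminer_pages`: per-box text is joined with blank lines and handed to `partition_text`, which splits on paragraphs, so each bounding box survives as its own element. A small sketch of that hand-off, assuming the unstructured package is installed (the second sample string is invented for illustration):

# Sketch of how per-box segments are handed to partition_text.
from unstructured.partition.text import partition_text

text_segments = [
    "eastern mediterranean",  # a title-like text box
    "We operate in several countries across the region.",  # a narrative box (invented text)
]

# Joining with a blank line keeps each bounding box as its own paragraph.
elements = partition_text(text="\n\n".join(text_segments))
for element in elements:
    print(type(element).__name__, "->", element)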
BIN example-docs/chevron-page.pdf (new file)
Binary file not shown.
test_unstructured/partition/test_pdf.py

@@ -319,3 +319,18 @@ def test_partition_pdf_fails_if_pdf_not_processable(
 
     with pytest.raises(ValueError):
         pdf.partition_pdf(filename=filename)
+
+
+def test_partition_pdf_fast_groups_text_in_text_box():
+    filename = os.path.join("example-docs", "chevron-page.pdf")
+    elements = pdf.partition_pdf(filename=filename, strategy="fast")
+
+    assert elements[0] == Title("eastern mediterranean")
+
+    assert isinstance(elements[1], NarrativeText)
+    assert str(elements[1]).startswith("We")
+    assert str(elements[1]).endswith("Jordan and Egypt.")
+
+    assert elements[3] == Title(
+        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
+    )
@@ -15,8 +15,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 
 set +e
 
-if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then
+if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 4 ]; then
     echo
-    echo "3 files should have been created."
+    echo "4 files should have been created."
     exit 1
 fi
unstructured/__version__.py

@@ -1 +1 @@
-__version__ = "0.6.3-dev0"  # pragma: no cover
+__version__ = "0.6.3-dev1"  # pragma: no cover
unstructured/partition/pdf.py

@@ -1,12 +1,15 @@
+import re
 import warnings
-from io import StringIO
 from typing import BinaryIO, List, Optional, cast
 
+from pdfminer.high_level import extract_pages
 from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 from pdfminer.utils import open_filename
 
+from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Element, ElementMetadata, PageBreak
 from unstructured.logger import logger
+from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.partition import _partition_via_api
 from unstructured.partition.common import (
     add_element_metadata,

@@ -285,31 +288,29 @@ def _process_pdfminer_pages(
     include_page_breaks: bool = False,
 ):
     """Uses PDF miner to split a document into pages and process them."""
-    from pdfminer.converter import TextConverter
-    from pdfminer.layout import LAParams
-    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-
-    rsrcmgr = PDFResourceManager(caching=False)
-    laparams = LAParams()
-
     elements: List[Element] = []
 
-    for i, page in enumerate(PDFPage.get_pages(fp, check_extractable=True)):
+    for i, page in enumerate(extract_pages(fp)):  # type: ignore
         metadata = ElementMetadata(filename=filename, page_number=i + 1)
-        with StringIO() as output_string:
-            device = TextConverter(
-                rsrcmgr,
-                output_string,
-                codec=encoding,
-                laparams=laparams,
-            )
-            interpreter = PDFPageInterpreter(rsrcmgr, device)
-            interpreter.process_page(page)
-            text = output_string.getvalue()
-            _elements = partition_text(text=text)
-            for element in _elements:
-                element.metadata = metadata
-                elements.append(element)
+        text_segments = []
+        for obj in page:
+            # NOTE(robinson) - "Figure" is an example of an object type that does
+            # not have a get_text method
+            if not hasattr(obj, "get_text"):
+                continue
+            _text = obj.get_text()
+            _text = re.sub(PARAGRAPH_PATTERN, " ", _text)
+            _text = clean_extra_whitespace(_text)
+            if _text.strip():
+                text_segments.append(_text)
+
+        text = "\n\n".join(text_segments)
+
+        _elements = partition_text(text=text)
+        for element in _elements:
+            element.metadata = metadata
+            elements.append(element)
 
         if include_page_breaks:
             elements.append(PageBreak())
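The heart of the change is that pdfminer's extract_pages yields one layout object per text box, so obj.get_text() returns everything inside a single bounding box; joining those per-box strings with blank lines lets partition_text treat each box as one unit. A standalone sketch of that grouping idea, assuming only pdfminer.six and the example PDF (the helper name text_boxes_per_page is illustrative, not part of the library):

# Standalone sketch of grouping text by pdfminer bounding box.
# Assumes pdfminer.six is installed; the helper name is illustrative.
from typing import List

from pdfminer.high_level import extract_pages


def text_boxes_per_page(filename: str) -> List[List[str]]:
    """Return one string per text box (bounding box) for each page."""
    pages = []
    for page_layout in extract_pages(filename):
        boxes = []
        for obj in page_layout:
            # Layout objects such as LTFigure or LTLine have no get_text method.
            if not hasattr(obj, "get_text"):
                continue
            text = obj.get_text().strip()
            if text:
                boxes.append(text)
        pages.append(boxes)
    return pages


if __name__ == "__main__":
    for i, boxes in enumerate(text_boxes_per_page("example-docs/chevron-page.pdf"), start=1):
        print(f"page {i}: {len(boxes)} text boxes")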