fix: group together text from the same bounding box in partition_pdf with fast strategy (#542)

* switch to using PDF objects
* linting, linting, linting
* couple more tweaks
* added test for chevron-page
* version and changelog
* linting, linting, linting
* now processing 4 files
2025-12-18 02:34:13 +00:00 · 2023-05-03 18:33:24 -04:00 · 2023-05-03 18:33:24 -04:00 · aa01cdfc7a
commit aa01cdfc7a
parent 7e43a25f07
6 changed files with 46 additions and 27 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.6.3-dev0
+## 0.6.3-dev1
 ### Enhancements
@ -10,6 +10,9 @@
 ### Fixes
 * Updates the grouping logic in the `partition_pdf` fast strategy to group together text
  in the same bounding box.
 ## 0.6.2
--- a/example-docs/chevron-page.pdf
+++ b/example-docs/chevron-page.pdf
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -319,3 +319,18 @@ def test_partition_pdf_fails_if_pdf_not_processable(
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
 def test_partition_pdf_fast_groups_text_in_text_box():
    filename = os.path.join("example-docs", "chevron-page.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
    assert elements[0] == Title("eastern mediterranean")
    assert isinstance(elements[1], NarrativeText)
    assert str(elements[1]).startswith("We")
    assert str(elements[1]).endswith("Jordan and Egypt.")
    assert elements[3] == Title(
        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
    )
--- a/test_unstructured_ingest/test-ingest-against-api.sh
+++ b/test_unstructured_ingest/test-ingest-against-api.sh
@ -15,8 +15,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 set +e
-if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then
+if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 4 ]; then
   echo
-   echo "3 files should have been created."
+   echo "4 files should have been created."
   exit 1
 fi
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.3-dev0"  # pragma: no cover
+__version__ = "0.6.3-dev1"  # pragma: no cover
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -1,12 +1,15 @@
 import re
 import warnings
 from io import StringIO
 from typing import BinaryIO, List, Optional, cast
 from pdfminer.high_level import extract_pages
 from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 from pdfminer.utils import open_filename
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Element, ElementMetadata, PageBreak
 from unstructured.logger import logger
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.partition import _partition_via_api
 from unstructured.partition.common import (
    add_element_metadata,
@ -285,31 +288,29 @@ def _process_pdfminer_pages(
    include_page_breaks: bool = False,
 ):
    """Uses PDF miner to split a document into pages and process them."""
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    rsrcmgr = PDFResourceManager(caching=False)
    laparams = LAParams()
    elements: List[Element] = []
-    for i, page in enumerate(PDFPage.get_pages(fp, check_extractable=True)):
+    for i, page in enumerate(extract_pages(fp)):  # type: ignore
        metadata = ElementMetadata(filename=filename, page_number=i + 1)
-        with StringIO() as output_string:
+
-            device = TextConverter(
+        text_segments = []
-                rsrcmgr,
+        for obj in page:
-                output_string,
+            # NOTE(robinson) - "Figure" is an example of an object type that does
-                codec=encoding,
+            # not have a get_text method
-                laparams=laparams,
+            if not hasattr(obj, "get_text"):
-            )
+                continue
-            interpreter = PDFPageInterpreter(rsrcmgr, device)
+            _text = obj.get_text()
-            interpreter.process_page(page)
+            _text = re.sub(PARAGRAPH_PATTERN, " ", _text)
-            text = output_string.getvalue()
+            _text = clean_extra_whitespace(_text)
-            _elements = partition_text(text=text)
+            if _text.strip():
-            for element in _elements:
+                text_segments.append(_text)
-                element.metadata = metadata
+
-                elements.append(element)
+        text = "\n\n".join(text_segments)
        _elements = partition_text(text=text)
        for element in _elements:
            element.metadata = metadata
            elements.append(element)
        if include_page_breaks:
            elements.append(PageBreak())
`@ -1 +1 @@`
	`__version__ = "0.6.3-dev0" # pragma: no cover`	`__version__ = "0.6.3-dev1" # pragma: no cover`