fix: enable partition_pdf to recursively grab text with fast strategy (#796)

* initial pass on text in figures
* refactor text extraction
* update tests
* fix title test
* add test for docs that require recursive text grab
* version and changelog
* ingest-test-fixtures-update
* there are 8 pdf files now
2025-12-14 08:44:29 +00:00 · 2023-06-22 11:19:54 -04:00 · 2023-06-22 11:19:54 -04:00 · 8683e2695c
commit 8683e2695c
parent 3b472cb7df
5 changed files with 54 additions and 26 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,15 +2,16 @@

 ### Enhancements

-* Adds recursive functionality to all fsspec connectors
-* Adds generic --recursive ingest flag
-
 ### Features

 * Adds Google Cloud Service connector

 ### Fixes

+* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively extract text from PDF objects whose text sits in a sub-container, such as figures
+* Adds recursive functionality to all fsspec connectors
+* Adds generic --recursive ingest flag
+
 ## 0.7.7

 ### Enhancements
--- a/example-docs/reliance.pdf
+++ b/example-docs/reliance.pdf
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -243,11 +243,10 @@ def test_partition_pdf_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
 ):
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
-    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
    title = "LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis"
-    assert titles[0].text == title
-    assert titles[0].metadata.filename == "layout-parser-paper-fast.pdf"
-    assert titles[0].metadata.file_directory == "example-docs"
+    assert elements[0].text == title
+    assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
+    assert elements[0].metadata.file_directory == "example-docs"


 def test_partition_pdf_with_page_breaks(
@ -430,6 +429,13 @@ def test_partition_pdf_with_copy_protection():
    assert {element.metadata.page_number for element in elements} == {1, 2}


+def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
+    elements = pdf.partition_pdf(filename=filename, strategy="fast")
+    assert len(elements) > 50
+    assert elements[0].metadata.page_number == 1
+    assert elements[-1].metadata.page_number == 3
+
+
 def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
--- a/test_unstructured_ingest/test-ingest-against-api.sh
+++ b/test_unstructured_ingest/test-ingest-against-api.sh
@ -16,8 +16,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \

 set +e

-if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 7 ]; then
+if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 8 ]; then
   echo
-   echo "7 files should have been created."
+   echo "8 files should have been created."
   exit 1
 fi
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -5,6 +5,7 @@ from typing import BinaryIO, List, Optional, Union, cast

 import pdf2image
 from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename
 from PIL import Image

@ -264,6 +265,25 @@ def _partition_pdf_with_pdfminer(
    return elements


+def _extract_text(item: LTItem) -> str:
+    """Recursively extracts text from PDFMiner objects to account
+    for scenarios where the text is in a sub-container."""
+    if hasattr(item, "get_text"):
+        return item.get_text()
+
+    elif isinstance(item, LTContainer):
+        text = ""
+        for child in item:
+            text += _extract_text(child) or ""
+        return text
+
+    elif isinstance(item, (LTTextBox, LTImage)):
+        # TODO(robinson) - Support pulling text out of images
+        # https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/image.py#L90
+        return "\n"
+    return "\n"
+
+
 def _process_pdfminer_pages(
    fp: BinaryIO,
    filename: str = "",
@ -283,12 +303,13 @@ def _process_pdfminer_pages(
            y1 = height - y1
            y2 = height - y2

-            # NOTE(robinson) - "Figure" is an example of an object type that does
-            # not have a get_text method
-            if not hasattr(obj, "get_text"):
-                continue
-            _text = obj.get_text()
-            _text = re.sub(PARAGRAPH_PATTERN, " ", _text)
+            if hasattr(obj, "get_text"):
+                _text_snippets = [obj.get_text()]
+            else:
+                _text = _extract_text(obj)
+                _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
+
+            for _text in _text_snippets:
                _text = clean_extra_whitespace(_text)
                if _text.strip():
                    text_segments.append(_text)