fix: enable partition_pdf to recursively grab text with fast strategy (#796)

* initial pass on text in figures * refactor text extraction * update tests * fix title test * add test for docs that require recursive text grab * version and changelog * ingest-test-fixtures-update * there are 8 pdf files now
2025-12-16 09:47:18 +00:00 · 2023-06-22 11:19:54 -04:00 · 2023-06-22 11:19:54 -04:00 · 8683e2695c
commit 8683e2695c
parent 3b472cb7df
5 changed files with 54 additions and 26 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,15 +2,16 @@
 ### Enhancements
 * Adds recursive functionality to all fsspec connectors
 * Adds generic --recursive ingest flag
 ### Features
 * Adds Google Cloud Service connector
 ### Fixes
 * Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively extract text from sub-containers
 * Adds recursive functionality to all fsspec connectors
 * Adds generic --recursive ingest flag
 ## 0.7.7
 ### Enhancements
--- a/example-docs/reliance.pdf
+++ b/example-docs/reliance.pdf
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -243,11 +243,10 @@ def test_partition_pdf_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.pdf",
 ):
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
    title = "LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis"
-    assert titles[0].text == title
+    assert elements[0].text == title
-    assert titles[0].metadata.filename == "layout-parser-paper-fast.pdf"
+    assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
-    assert titles[0].metadata.file_directory == "example-docs"
+    assert elements[0].metadata.file_directory == "example-docs"
 def test_partition_pdf_with_page_breaks(
@ -430,6 +429,13 @@ def test_partition_pdf_with_copy_protection():
    assert {element.metadata.page_number for element in elements} == {1, 2}
 def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
    """Check that the "fast" strategy yields elements for a document that
    needs the recursive text grab, spanning page 1 through page 3."""
    parsed = pdf.partition_pdf(strategy="fast", filename=filename)

    # More than 50 elements are expected from this document.
    assert len(parsed) > 50

    # The first element is on page 1 and the last one on page 3.
    first_element = parsed[0]
    assert first_element.metadata.page_number == 1
    final_element = parsed[-1]
    assert final_element.metadata.page_number == 3
 def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="fast")
--- a/test_unstructured_ingest/test-ingest-against-api.sh
+++ b/test_unstructured_ingest/test-ingest-against-api.sh
@ -16,8 +16,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 set +e
-if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 7 ]; then
+if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 8 ]; then
   echo
-   echo "7 files should have been created."
+   echo "8 files should have been created."
   exit 1
 fi
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -5,6 +5,7 @@ from typing import BinaryIO, List, Optional, Union, cast
 import pdf2image
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename
 from PIL import Image
@ -264,6 +265,25 @@ def _partition_pdf_with_pdfminer(
    return elements
 def _extract_text(item: LTItem) -> str:
    """Return the text held by a PDFMiner layout item, descending into
    containers so that text stored in a sub-container is not lost.

    Items that expose ``get_text`` are returned as-is. Containers yield the
    concatenation of the text of each child, in iteration order. Every other
    item contributes a single newline.
    """
    # Direct case: the item knows how to report its own text.
    if hasattr(item, "get_text"):
        return item.get_text()

    # Container case: gather the text of every child recursively.
    if isinstance(item, LTContainer):
        return "".join(_extract_text(child) for child in item)

    if isinstance(item, (LTTextBox, LTImage)):
        # TODO(robinson) - Support pulling text out of images
        # https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/image.py#L90
        return "\n"

    # Fallback for any other item type: only a line break.
    return "\n"
 def _process_pdfminer_pages(
    fp: BinaryIO,
    filename: str = "",
@ -283,12 +303,13 @@ def _process_pdfminer_pages(
            y1 = height - y1
            y2 = height - y2
-            # NOTE(robinson) - "Figure" is an example of an object type that does
+            if hasattr(obj, "get_text"):
-            # not have a get_text method
+                _text_snippets = [obj.get_text()]
-            if not hasattr(obj, "get_text"):
+            else:
-                continue
+                _text = _extract_text(obj)
-            _text = obj.get_text()
+                _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
-            _text = re.sub(PARAGRAPH_PATTERN, " ", _text)
+
            for _text in _text_snippets:
                _text = clean_extra_whitespace(_text)
                if _text.strip():
                    text_segments.append(_text)