fix: enable partition_pdf to recursively grab text with fast strategy (#796)

* initial pass on text in figures

* refactor text extraction

* update tests

* fix title test

* add test for docs that require recursive text grab

* version and changelog

* ingest-test-fixtures-update

* there are 8 pdf files now
This commit is contained in:
Matt Robinson 2023-06-22 11:19:54 -04:00 committed by GitHub
parent 3b472cb7df
commit 8683e2695c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 54 additions and 26 deletions

View File

@ -2,15 +2,16 @@
### Enhancements ### Enhancements
* Adds recursive functionality to all fsspec connectors
* Adds generic --recursive ingest flag
### Features ### Features
* Adds Google Cloud Service connector * Adds Google Cloud Service connector
### Fixes ### Fixes
* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively extract text that sits in a sub-container
* Adds recursive functionality to all fsspec connectors
* Adds generic --recursive ingest flag
## 0.7.7 ## 0.7.7
### Enhancements ### Enhancements

BIN
example-docs/reliance.pdf Normal file

Binary file not shown.

View File

@ -243,11 +243,10 @@ def test_partition_pdf_with_auto_strategy(
filename="example-docs/layout-parser-paper-fast.pdf", filename="example-docs/layout-parser-paper-fast.pdf",
): ):
elements = pdf.partition_pdf(filename=filename, strategy="auto") elements = pdf.partition_pdf(filename=filename, strategy="auto")
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert titles[0].text == title assert elements[0].text == title
assert titles[0].metadata.filename == "layout-parser-paper-fast.pdf" assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
assert titles[0].metadata.file_directory == "example-docs" assert elements[0].metadata.file_directory == "example-docs"
def test_partition_pdf_with_page_breaks( def test_partition_pdf_with_page_breaks(
@ -430,6 +429,13 @@ def test_partition_pdf_with_copy_protection():
assert {element.metadata.page_number for element in elements} == {1, 2} assert {element.metadata.page_number for element in elements} == {1, 2}
def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
    """Partition a PDF with the "fast" strategy and check the element count
    and the page numbers of the first and last elements."""
    parsed = pdf.partition_pdf(strategy="fast", filename=filename)

    # Checked first so an unexpectedly short result fails on the count,
    # not on the indexing below.
    assert len(parsed) > 50

    opening_element = parsed[0]
    assert opening_element.metadata.page_number == 1

    closing_element = parsed[-1]
    assert closing_element.metadata.page_number == 3
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog): def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
filename = os.path.join("example-docs", "copy-protected.pdf") filename = os.path.join("example-docs", "copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy="fast") elements = pdf.partition_pdf(filename=filename, strategy="fast")

View File

@ -16,8 +16,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e set +e
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 7 ]; then if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 8 ]; then
echo echo
echo "7 files should have been created." echo "8 files should have been created."
exit 1 exit 1
fi fi

View File

@ -5,6 +5,7 @@ from typing import BinaryIO, List, Optional, Union, cast
import pdf2image import pdf2image
from pdfminer.high_level import extract_pages from pdfminer.high_level import extract_pages
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
from pdfminer.utils import open_filename from pdfminer.utils import open_filename
from PIL import Image from PIL import Image
@ -264,6 +265,25 @@ def _partition_pdf_with_pdfminer(
return elements return elements
def _extract_text(item: LTItem) -> str:
    """Return the text of a PDFMiner layout item, descending into children.

    An item that exposes ``get_text`` is read directly. An ``LTContainer``
    without that method yields the concatenation of whatever this function
    returns for each of its children. Any other item yields a single newline.
    """
    if hasattr(item, "get_text"):
        return item.get_text()

    if isinstance(item, LTContainer):
        # Falsy child results are replaced with "" so the join never sees None.
        return "".join(_extract_text(nested) or "" for nested in item)

    if isinstance(item, (LTTextBox, LTImage)):
        # TODO(robinson) - Support pulling text out of images
        # https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/image.py#L90
        return "\n"

    # Anything unrecognised contributes only a line break.
    return "\n"
def _process_pdfminer_pages( def _process_pdfminer_pages(
fp: BinaryIO, fp: BinaryIO,
filename: str = "", filename: str = "",
@ -283,12 +303,13 @@ def _process_pdfminer_pages(
y1 = height - y1 y1 = height - y1
y2 = height - y2 y2 = height - y2
# NOTE(robinson) - "Figure" is an example of an object type that does if hasattr(obj, "get_text"):
# not have a get_text method _text_snippets = [obj.get_text()]
if not hasattr(obj, "get_text"): else:
continue _text = _extract_text(obj)
_text = obj.get_text() _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
_text = re.sub(PARAGRAPH_PATTERN, " ", _text)
for _text in _text_snippets:
_text = clean_extra_whitespace(_text) _text = clean_extra_whitespace(_text)
if _text.strip(): if _text.strip():
text_segments.append(_text) text_segments.append(_text)