mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 04:34:54 +00:00
fix: enable partition_pdf
to recursively grab text with fast strategy (#796)
* initial pass on text in figures * refactor text extraction * update tests * fix title test * add test for docs that require recursive text grab * version and changelog * ingest-test-fixtures-update * there are 8 pdf files now
This commit is contained in:
parent
3b472cb7df
commit
8683e2695c
@ -2,15 +2,16 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Adds recursive functionality to all fsspec connectors
|
||||
* Adds generic --recursive ingest flag
|
||||
|
||||
### Features
|
||||
|
||||
* Adds Google Cloud Service connector
|
||||
|
||||
### Fixes
|
||||
|
||||
* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively grab text from sub-containers
|
||||
* Adds recursive functionality to all fsspec connectors
|
||||
* Adds generic --recursive ingest flag
|
||||
|
||||
## 0.7.7
|
||||
|
||||
### Enhancements
|
||||
|
BIN
example-docs/reliance.pdf
Normal file
BIN
example-docs/reliance.pdf
Normal file
Binary file not shown.
@ -243,11 +243,10 @@ def test_partition_pdf_with_auto_strategy(
|
||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
||||
):
|
||||
elements = pdf.partition_pdf(filename=filename, strategy="auto")
|
||||
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
|
||||
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
|
||||
assert titles[0].text == title
|
||||
assert titles[0].metadata.filename == "layout-parser-paper-fast.pdf"
|
||||
assert titles[0].metadata.file_directory == "example-docs"
|
||||
assert elements[0].text == title
|
||||
assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
|
||||
assert elements[0].metadata.file_directory == "example-docs"
|
||||
|
||||
|
||||
def test_partition_pdf_with_page_breaks(
|
||||
@ -430,6 +429,13 @@ def test_partition_pdf_with_copy_protection():
|
||||
assert {element.metadata.page_number for element in elements} == {1, 2}
|
||||
|
||||
|
||||
def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
    """Check that the "fast" strategy partitions a document whose text must be
    grabbed recursively, yielding more than 50 elements that start on page 1
    and end on page 3."""
    partitioned = pdf.partition_pdf(filename=filename, strategy="fast")

    # Check the count first so an empty result fails on the assertion rather
    # than on the indexing below.
    assert len(partitioned) > 50

    first_element, last_element = partitioned[0], partitioned[-1]
    assert first_element.metadata.page_number == 1
    assert last_element.metadata.page_number == 3
|
||||
|
||||
|
||||
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
|
||||
filename = os.path.join("example-docs", "copy-protected.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, strategy="fast")
|
||||
|
@ -16,8 +16,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
set +e
|
||||
|
||||
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 7 ]; then
|
||||
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 8 ]; then
|
||||
echo
|
||||
echo "7 files should have been created."
|
||||
echo "8 files should have been created."
|
||||
exit 1
|
||||
fi
|
||||
|
@ -5,6 +5,7 @@ from typing import BinaryIO, List, Optional, Union, cast
|
||||
|
||||
import pdf2image
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
|
||||
from pdfminer.utils import open_filename
|
||||
from PIL import Image
|
||||
|
||||
@ -264,6 +265,25 @@ def _partition_pdf_with_pdfminer(
|
||||
return elements
|
||||
|
||||
|
||||
def _extract_text(item: LTItem) -> str:
    """Recursively extracts text from PDFMiner objects to account
    for scenarios where the text is in a sub-container.

    Args:
        item: The PDFMiner layout object to pull text from.

    Returns:
        The result of ``item.get_text()`` when the object exposes that method.
        Otherwise, for an ``LTContainer``, the concatenation of the text
        extracted recursively from each child in iteration order. For any
        other object, a single newline character.
    """
    if hasattr(item, "get_text"):
        return item.get_text()

    if isinstance(item, LTContainer):
        # The container carries no text of its own, so collect it from the
        # children. `or ""` keeps a falsy child result from breaking the join.
        return "".join(_extract_text(child) or "" for child in item)

    # Everything else (images included) currently contributes no text and is
    # represented by a newline placeholder.
    # TODO(robinson) - Support pulling text out of images
    # https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/image.py#L90
    return "\n"
|
||||
|
||||
|
||||
def _process_pdfminer_pages(
|
||||
fp: BinaryIO,
|
||||
filename: str = "",
|
||||
@ -283,12 +303,13 @@ def _process_pdfminer_pages(
|
||||
y1 = height - y1
|
||||
y2 = height - y2
|
||||
|
||||
# NOTE(robinson) - "Figure" is an example of an object type that does
|
||||
# not have a get_text method
|
||||
if not hasattr(obj, "get_text"):
|
||||
continue
|
||||
_text = obj.get_text()
|
||||
_text = re.sub(PARAGRAPH_PATTERN, " ", _text)
|
||||
if hasattr(obj, "get_text"):
|
||||
_text_snippets = [obj.get_text()]
|
||||
else:
|
||||
_text = _extract_text(obj)
|
||||
_text_snippets = re.split(PARAGRAPH_PATTERN, _text)
|
||||
|
||||
for _text in _text_snippets:
|
||||
_text = clean_extra_whitespace(_text)
|
||||
if _text.strip():
|
||||
text_segments.append(_text)
|
||||
|
Loading…
x
Reference in New Issue
Block a user