Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-08-15 20:27:37 +00:00)
fix: enable partition_pdf to recursively grab text with fast strategy (#796)

* initial pass on text in figures
* refactor text extraction
* update tests
* fix title test
* add test for docs that require recursive text grab
* version and changelog
* ingest-test-fixtures-update
* there are 8 pdf files now
This commit is contained in: parent 3b472cb7df, commit 8683e2695c
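In user-facing terms, the fix means that `partition_pdf` run with the `"fast"` (pdfminer-based) strategy now picks up text that sits inside figure containers instead of skipping layout objects that lack a top-level `get_text` method. A minimal usage sketch against the `example-docs/reliance.pdf` fixture added in this commit (any text-based PDF whose text lives inside figures would do):

```python
from unstructured.partition.pdf import partition_pdf

# With this fix, the "fast" strategy recurses into pdfminer sub-containers
# (e.g. figures) instead of dropping objects without a get_text method.
elements = partition_pdf(filename="example-docs/reliance.pdf", strategy="fast")

for element in elements:
    print(element.metadata.page_number, element.category, element.text[:80])
```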
CHANGELOG.md
@@ -2,15 +2,16 @@
 ### Enhancements
 
-* Adds recursive functionality to all fsspec connectors
-* Adds generic --recursive ingest flag
 
 ### Features
 
 * Adds Google Cloud Service connector
 
 ### Fixes
 
+* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively
+* Adds recursive functionality to all fsspec connectors
+* Adds generic --recursive ingest flag
 
 ## 0.7.7
 
 ### Enhancements
BIN example-docs/reliance.pdf (new file, binary content not shown)
test_unstructured/partition/test_pdf.py
@@ -243,11 +243,10 @@ def test_partition_pdf_with_auto_strategy(
     filename="example-docs/layout-parser-paper-fast.pdf",
 ):
     elements = pdf.partition_pdf(filename=filename, strategy="auto")
-    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
     title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
-    assert titles[0].text == title
-    assert titles[0].metadata.filename == "layout-parser-paper-fast.pdf"
-    assert titles[0].metadata.file_directory == "example-docs"
+    assert elements[0].text == title
+    assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf"
+    assert elements[0].metadata.file_directory == "example-docs"
 
 
 def test_partition_pdf_with_page_breaks(
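The updated assertions check `elements[0]` directly, so the `Title`-filtering helper line is dropped from the test. The same filter can still be reproduced at the call site if needed, using only the attributes that appear in the removed line (`category` and `text`); a small sketch:

```python
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
    strategy="auto",
)

# Recreate the filter from the removed test line: keep Title elements whose
# text is reasonably long (more than ten whitespace-separated tokens).
titles = [
    el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10
]
if titles:
    print(titles[0].text)
```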
test_unstructured/partition/test_pdf.py (continued)
@@ -430,6 +429,13 @@ def test_partition_pdf_with_copy_protection():
     assert {element.metadata.page_number for element in elements} == {1, 2}
 
 
+def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
+    elements = pdf.partition_pdf(filename=filename, strategy="fast")
+    assert len(elements) > 50
+    assert elements[0].metadata.page_number == 1
+    assert elements[-1].metadata.page_number == 3
+
+
 def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
     filename = os.path.join("example-docs", "copy-protected.pdf")
     elements = pdf.partition_pdf(filename=filename, strategy="fast")
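The new regression test keys off `metadata.page_number` to confirm that the output spans the fixture's pages (first element on page 1, last on page 3). The same metadata makes it easy to group partitioned output by page; a rough sketch, again assuming the `reliance.pdf` fixture from this commit:

```python
from collections import defaultdict

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(filename="example-docs/reliance.pdf", strategy="fast")

# Bucket elements by the page they were extracted from, mirroring the
# page_number assertions in the new test.
by_page = defaultdict(list)
for element in elements:
    by_page[element.metadata.page_number].append(element)

for page, page_elements in sorted(by_page.items()):
    print(f"page {page}: {len(page_elements)} elements")
```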
test_unstructured_ingest ingest-against-API script (output file count updated for the new fixture)
@@ -16,8 +16,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 
 set +e
 
-if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 7 ]; then
+if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 8 ]; then
     echo
-    echo "7 files should have been created."
+    echo "8 files should have been created."
     exit 1
 fi
unstructured/partition/pdf.py
@@ -5,6 +5,7 @@ from typing import BinaryIO, List, Optional, Union, cast
 
 import pdf2image
 from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename
 from PIL import Image
 
@@ -264,6 +265,25 @@ def _partition_pdf_with_pdfminer(
     return elements
 
 
+def _extract_text(item: LTItem) -> str:
+    """Recursively extracts text from PDFMiner objects to account
+    for scenarios where the text is in a sub-container."""
+    if hasattr(item, "get_text"):
+        return item.get_text()
+
+    elif isinstance(item, LTContainer):
+        text = ""
+        for child in item:
+            text += _extract_text(child) or ""
+        return text
+
+    elif isinstance(item, (LTTextBox, LTImage)):
+        # TODO(robinson) - Support pulling text out of images
+        # https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/image.py#L90
+        return "\n"
+    return "\n"
+
+
 def _process_pdfminer_pages(
     fp: BinaryIO,
     filename: str = "",
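As a standalone illustration of the recursion above, the same traversal can be exercised directly against pdfminer.six layout objects. This sketch re-implements the helper locally (rather than importing the private `_extract_text`) and prints what each top-level layout object yields; the file path is a placeholder:

```python
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox


def extract_text_recursively(item: LTItem) -> str:
    # Leaf objects such as LTTextLine/LTTextBox expose get_text directly.
    if hasattr(item, "get_text"):
        return item.get_text()
    # Containers such as LTFigure keep their text in child objects, so walk
    # the children and concatenate whatever text they yield.
    if isinstance(item, LTContainer):
        return "".join(extract_text_recursively(child) or "" for child in item)
    # Images and other text-less leaves contribute only a line break.
    if isinstance(item, (LTTextBox, LTImage)):
        return "\n"
    return "\n"


for page_layout in extract_pages("example-docs/reliance.pdf"):
    for obj in page_layout:
        print(type(obj).__name__, repr(extract_text_recursively(obj))[:60])
```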
@@ -283,23 +303,24 @@ def _process_pdfminer_pages(
             y1 = height - y1
             y2 = height - y2
 
-            # NOTE(robinson) - "Figure" is an example of an object type that does
-            # not have a get_text method
-            if not hasattr(obj, "get_text"):
-                continue
-            _text = obj.get_text()
-            _text = re.sub(PARAGRAPH_PATTERN, " ", _text)
-            _text = clean_extra_whitespace(_text)
-            if _text.strip():
-                text_segments.append(_text)
-                element = element_from_text(_text)
-                element._coordinate_system = PixelSpace(
-                    width=width,
-                    height=height,
-                )
-                element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
-                element.metadata = metadata
-                page_elements.append(element)
+            if hasattr(obj, "get_text"):
+                _text_snippets = [obj.get_text()]
+            else:
+                _text = _extract_text(obj)
+                _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
+
+            for _text in _text_snippets:
+                _text = clean_extra_whitespace(_text)
+                if _text.strip():
+                    text_segments.append(_text)
+                    element = element_from_text(_text)
+                    element._coordinate_system = PixelSpace(
+                        width=width,
+                        height=height,
+                    )
+                    element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
+                    element.metadata = metadata
+                    page_elements.append(element)
 
         sorted_page_elements = sorted(
             page_elements,
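Because the recursively gathered text can span several paragraphs (one chunk per child container, plus a bare newline for images), the new loop splits it on `PARAGRAPH_PATTERN` and cleans each snippet before building elements. A rough sketch of that post-processing step; the pattern value below is a stand-in, not necessarily the constant unstructured actually defines:

```python
import re

from unstructured.cleaners.core import clean_extra_whitespace

# Stand-in for unstructured's PARAGRAPH_PATTERN constant (assumption): treat a
# newline, optionally padded with whitespace, as a paragraph boundary.
PARAGRAPH_PATTERN = r"\s*\n\s*"

raw_text = "First snippet from a\nfigure child.\n\nSecond   snippet here.\n"

snippets = re.split(PARAGRAPH_PATTERN, raw_text)
cleaned = [clean_extra_whitespace(s) for s in snippets if s.strip()]
print(cleaned)  # ['First snippet from a', 'figure child.', 'Second snippet here.']
```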