diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d9ec4f3b..acdf7ba8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,15 +2,16 @@ ### Enhancements -* Adds recursive functionality to all fsspec connectors -* Adds generic --recursive ingest flag - ### Features * Adds Google Cloud Service connector ### Fixes +* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively extract text from PDF objects whose text is in a sub-container +* Adds recursive functionality to all fsspec connectors +* Adds generic --recursive ingest flag + ## 0.7.7 ### Enhancements diff --git a/example-docs/reliance.pdf b/example-docs/reliance.pdf new file mode 100644 index 000000000..53514cfbc Binary files /dev/null and b/example-docs/reliance.pdf differ diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index f854c4dc3..c6bb66aa4 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -243,11 +243,10 @@ def test_partition_pdf_with_auto_strategy( filename="example-docs/layout-parser-paper-fast.pdf", ): elements = pdf.partition_pdf(filename=filename, strategy="auto") - titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10] title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" - assert titles[0].text == title - assert titles[0].metadata.filename == "layout-parser-paper-fast.pdf" - assert titles[0].metadata.file_directory == "example-docs" + assert elements[0].text == title + assert elements[0].metadata.filename == "layout-parser-paper-fast.pdf" + assert elements[0].metadata.file_directory == "example-docs" def test_partition_pdf_with_page_breaks( @@ -430,6 +429,13 @@ def test_partition_pdf_with_copy_protection(): assert {element.metadata.page_number for element in elements} == {1, 2} +def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"): + elements = pdf.partition_pdf(filename=filename, strategy="fast") + assert len(elements) > 50 + 
assert elements[0].metadata.page_number == 1 + assert elements[-1].metadata.page_number == 3 + + def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog): filename = os.path.join("example-docs", "copy-protected.pdf") elements = pdf.partition_pdf(filename=filename, strategy="fast") diff --git a/test_unstructured_ingest/test-ingest-against-api.sh b/test_unstructured_ingest/test-ingest-against-api.sh index 926429cd3..6f1c2ff40 100755 --- a/test_unstructured_ingest/test-ingest-against-api.sh +++ b/test_unstructured_ingest/test-ingest-against-api.sh @@ -16,8 +16,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ set +e -if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 7 ]; then +if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 8 ]; then echo - echo "7 files should have been created." + echo "8 files should have been created." exit 1 fi diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 38486b7c4..59da9331a 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -5,6 +5,7 @@ from typing import BinaryIO, List, Optional, Union, cast import pdf2image from pdfminer.high_level import extract_pages +from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox from pdfminer.utils import open_filename from PIL import Image @@ -264,6 +265,25 @@ def _partition_pdf_with_pdfminer( return elements +def _extract_text(item: LTItem) -> str: + """Recursively extracts text from PDFMiner objects to account + for scenarios where the text is in a sub-container.""" + if hasattr(item, "get_text"): + return item.get_text() + + elif isinstance(item, LTContainer): + text = "" + for child in item: + text += _extract_text(child) or "" + return text + + elif isinstance(item, (LTTextBox, LTImage)): + # TODO(robinson) - Support pulling text out of images + # https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/image.py#L90 + return "\n" + return "\n" + + def _process_pdfminer_pages( 
fp: BinaryIO, filename: str = "", @@ -283,23 +303,24 @@ def _process_pdfminer_pages( y1 = height - y1 y2 = height - y2 - # NOTE(robinson) - "Figure" is an example of an object type that does - # not have a get_text method - if not hasattr(obj, "get_text"): - continue - _text = obj.get_text() - _text = re.sub(PARAGRAPH_PATTERN, " ", _text) - _text = clean_extra_whitespace(_text) - if _text.strip(): - text_segments.append(_text) - element = element_from_text(_text) - element._coordinate_system = PixelSpace( - width=width, - height=height, - ) - element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) - element.metadata = metadata - page_elements.append(element) + if hasattr(obj, "get_text"): + _text_snippets = [obj.get_text()] + else: + _text = _extract_text(obj) + _text_snippets = re.split(PARAGRAPH_PATTERN, _text) + + for _text in _text_snippets: + _text = clean_extra_whitespace(_text) + if _text.strip(): + text_segments.append(_text) + element = element_from_text(_text) + element._coordinate_system = PixelSpace( + width=width, + height=height, + ) + element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) + element.metadata = metadata + page_elements.append(element) sorted_page_elements = sorted( page_elements,