diff --git a/CHANGELOG.md b/CHANGELOG.md index e199c99c6..bef0a33c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.6.3-dev0 +## 0.6.3-dev1 ### Enhancements @@ -10,6 +10,9 @@ ### Fixes +* Updates the grouping logic in the `partition_pdf` fast strategy to group together text + in the same bounding box. + ## 0.6.2 diff --git a/example-docs/chevron-page.pdf b/example-docs/chevron-page.pdf new file mode 100644 index 000000000..a53aa2356 Binary files /dev/null and b/example-docs/chevron-page.pdf differ diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index cc203ca24..44edb2353 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -319,3 +319,18 @@ def test_partition_pdf_fails_if_pdf_not_processable( with pytest.raises(ValueError): pdf.partition_pdf(filename=filename) + + +def test_partition_pdf_fast_groups_text_in_text_box(): + filename = os.path.join("example-docs", "chevron-page.pdf") + elements = pdf.partition_pdf(filename=filename, strategy="fast") + + assert elements[0] == Title("eastern mediterranean") + + assert isinstance(elements[1], NarrativeText) + assert str(elements[1]).startswith("We") + assert str(elements[1]).endswith("Jordan and Egypt.") + + assert elements[3] == Title( + "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022", + ) diff --git a/test_unstructured_ingest/test-ingest-against-api.sh b/test_unstructured_ingest/test-ingest-against-api.sh index 6caea334a..909ed1e43 100755 --- a/test_unstructured_ingest/test-ingest-against-api.sh +++ b/test_unstructured_ingest/test-ingest-against-api.sh @@ -15,8 +15,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ set +e -if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then +if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 4 ]; then echo - echo "3 files should have been created." + echo "4 files should have been created." exit 1 fi diff --git a/unstructured/__version__.py b/unstructured/__version__.py index de1f54595..8bc59ac32 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.3-dev0" # pragma: no cover +__version__ = "0.6.3-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 84b95bd0c..3863890d9 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -1,12 +1,15 @@ +import re import warnings -from io import StringIO from typing import BinaryIO, List, Optional, cast +from pdfminer.high_level import extract_pages from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed from pdfminer.utils import open_filename +from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import Element, ElementMetadata, PageBreak from unstructured.logger import logger +from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition import _partition_via_api from unstructured.partition.common import ( add_element_metadata, @@ -285,31 +288,29 @@ def _process_pdfminer_pages( include_page_breaks: bool = False, ): """Uses PDF miner to split a document into pages and process them.""" - from pdfminer.converter import TextConverter - from pdfminer.layout import LAParams - from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager - - rsrcmgr = PDFResourceManager(caching=False) - laparams = LAParams() - elements: List[Element] = [] - for i, page in enumerate(PDFPage.get_pages(fp, check_extractable=True)): + for i, page in enumerate(extract_pages(fp)): # type: ignore metadata = ElementMetadata(filename=filename, page_number=i + 1) - with StringIO() as output_string: - device = TextConverter( - rsrcmgr, - output_string, - codec=encoding, - laparams=laparams, - ) - interpreter = PDFPageInterpreter(rsrcmgr, device) - interpreter.process_page(page) - text = output_string.getvalue() - _elements = partition_text(text=text) - for element in _elements: - element.metadata = metadata - elements.append(element) + + text_segments = [] + for obj in page: + # NOTE(robinson) - "Figure" is an example of an object type that does + # not have a get_text method + if not hasattr(obj, "get_text"): + continue + _text = obj.get_text() + _text = re.sub(PARAGRAPH_PATTERN, " ", _text) + _text = clean_extra_whitespace(_text) + if _text.strip(): + text_segments.append(_text) + + text = "\n\n".join(text_segments) + + _elements = partition_text(text=text) + for element in _elements: + element.metadata = metadata + elements.append(element) if include_page_breaks: elements.append(PageBreak())