fix: group together text from the same bounding box in partition_pdf with fast strategy (#542)

* switch to using PDF objects

* linting, linting, linting

* couple more tweaks

* added test for chevron-page

* version and changelog

* linting, linting, linting

* now processing 4 files
This commit is contained in:
Matt Robinson 2023-05-03 18:33:24 -04:00 committed by GitHub
parent 7e43a25f07
commit aa01cdfc7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 46 additions and 27 deletions

View File

@ -1,4 +1,4 @@
## 0.6.3-dev0 ## 0.6.3-dev1
### Enhancements ### Enhancements
@ -10,6 +10,9 @@
### Fixes ### Fixes
* Updates the grouping logic in the `partition_pdf` fast strategy to group together text
in the same bounding box.
## 0.6.2 ## 0.6.2

Binary file not shown.

View File

@ -319,3 +319,18 @@ def test_partition_pdf_fails_if_pdf_not_processable(
with pytest.raises(ValueError): with pytest.raises(ValueError):
pdf.partition_pdf(filename=filename) pdf.partition_pdf(filename=filename)
def test_partition_pdf_fast_groups_text_in_text_box():
    """Partition chevron-page.pdf with the "fast" strategy and check the leading elements.

    The first element must be the section title, the second a single
    NarrativeText spanning from "We" through "Jordan and Egypt.", and the
    fourth the carbon-intensity caption as one Title element.
    """
    pdf_path = os.path.join("example-docs", "chevron-page.pdf")
    parsed = pdf.partition_pdf(filename=pdf_path, strategy="fast")

    # Section heading comes through as its own Title element.
    assert parsed[0] == Title("eastern mediterranean")

    # The paragraph must arrive as one NarrativeText element, not split up.
    paragraph = parsed[1]
    assert isinstance(paragraph, NarrativeText)
    paragraph_text = str(paragraph)
    assert paragraph_text.startswith("We")
    assert paragraph_text.endswith("Jordan and Egypt.")

    # The caption is expected as a single Title element.
    expected_caption = Title(
        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
    )
    assert parsed[3] == expected_caption

View File

@ -15,8 +15,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e set +e
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 4 ]; then
echo echo
echo "3 files should have been created." echo "4 files should have been created."
exit 1 exit 1
fi fi

View File

@ -1 +1 @@
__version__ = "0.6.3-dev0" # pragma: no cover __version__ = "0.6.3-dev1" # pragma: no cover

View File

@ -1,12 +1,15 @@
import re
import warnings import warnings
from io import StringIO
from typing import BinaryIO, List, Optional, cast from typing import BinaryIO, List, Optional, cast
from pdfminer.high_level import extract_pages
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.utils import open_filename from pdfminer.utils import open_filename
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Element, ElementMetadata, PageBreak from unstructured.documents.elements import Element, ElementMetadata, PageBreak
from unstructured.logger import logger from unstructured.logger import logger
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition import _partition_via_api from unstructured.partition import _partition_via_api
from unstructured.partition.common import ( from unstructured.partition.common import (
add_element_metadata, add_element_metadata,
@ -285,31 +288,29 @@ def _process_pdfminer_pages(
include_page_breaks: bool = False, include_page_breaks: bool = False,
): ):
"""Uses PDF miner to split a document into pages and process them.""" """Uses PDF miner to split a document into pages and process them."""
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
rsrcmgr = PDFResourceManager(caching=False)
laparams = LAParams()
elements: List[Element] = [] elements: List[Element] = []
for i, page in enumerate(PDFPage.get_pages(fp, check_extractable=True)): for i, page in enumerate(extract_pages(fp)): # type: ignore
metadata = ElementMetadata(filename=filename, page_number=i + 1) metadata = ElementMetadata(filename=filename, page_number=i + 1)
with StringIO() as output_string:
device = TextConverter( text_segments = []
rsrcmgr, for obj in page:
output_string, # NOTE(robinson) - "Figure" is an example of an object type that does
codec=encoding, # not have a get_text method
laparams=laparams, if not hasattr(obj, "get_text"):
) continue
interpreter = PDFPageInterpreter(rsrcmgr, device) _text = obj.get_text()
interpreter.process_page(page) _text = re.sub(PARAGRAPH_PATTERN, " ", _text)
text = output_string.getvalue() _text = clean_extra_whitespace(_text)
_elements = partition_text(text=text) if _text.strip():
for element in _elements: text_segments.append(_text)
element.metadata = metadata
elements.append(element) text = "\n\n".join(text_segments)
_elements = partition_text(text=text)
for element in _elements:
element.metadata = metadata
elements.append(element)
if include_page_breaks: if include_page_breaks:
elements.append(PageBreak()) elements.append(PageBreak())