Mirror of https://github.com/Unstructured-IO/unstructured.git
Synced 2025-10-10 15:44:31 +00:00
fix: group together text from the same bounding box in partition_pdf with fast strategy (#542)

* switch to using PDF objects
* linting, linting, linting
* couple more tweaks
* added test for chevron-page
* version and changelog
* linting, linting, linting
* now processing 4 files
This commit is contained in:
parent 7e43a25f07
commit aa01cdfc7a
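For callers, the practical effect is that text pdfminer places in a single bounding box now comes back from the fast strategy as one element instead of being split line by line. A minimal usage sketch, mirroring the new test added in this commit (it assumes the unstructured package at this commit and the example-docs/chevron-page.pdf file added below):

# Usage sketch only: assumes unstructured at this commit and the example PDF
# added in this commit are available locally.
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(filename="example-docs/chevron-page.pdf", strategy="fast")

# Each pdfminer text box is now emitted as a single element.
for element in elements[:4]:
    print(type(element).__name__, "->", str(element)[:60])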
CHANGELOG.md

@@ -1,4 +1,4 @@
-## 0.6.3-dev0
+## 0.6.3-dev1
 
 ### Enhancements
 
@@ -10,6 +10,9 @@
 
 ### Fixes
 
+* Updates the grouping logic in the `partition_pdf` fast strategy to group together text
+  in the same bounding box.
+
 ## 0.6.2
 
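The changelog entry above refers to the grouping step inside `_process_pdfminer_pages`: per-box text is joined with blank lines and handed to `partition_text`, which splits on paragraphs, so each bounding box survives as its own element. A small sketch of that hand-off, assuming the unstructured package is installed (the second sample string is invented for illustration):

# Sketch of how per-box segments are handed to partition_text.
from unstructured.partition.text import partition_text

text_segments = [
    "eastern mediterranean",  # a title-like text box
    "We operate in several countries across the region.",  # a narrative box (invented text)
]

# Joining with a blank line keeps each bounding box as its own paragraph.
elements = partition_text(text="\n\n".join(text_segments))
for element in elements:
    print(type(element).__name__, "->", element)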
BIN example-docs/chevron-page.pdf (new file)
Binary file not shown.
test_unstructured/partition/test_pdf.py

@@ -319,3 +319,18 @@ def test_partition_pdf_fails_if_pdf_not_processable(
 
     with pytest.raises(ValueError):
         pdf.partition_pdf(filename=filename)
+
+
+def test_partition_pdf_fast_groups_text_in_text_box():
+    filename = os.path.join("example-docs", "chevron-page.pdf")
+    elements = pdf.partition_pdf(filename=filename, strategy="fast")
+
+    assert elements[0] == Title("eastern mediterranean")
+
+    assert isinstance(elements[1], NarrativeText)
+    assert str(elements[1]).startswith("We")
+    assert str(elements[1]).endswith("Jordan and Egypt.")
+
+    assert elements[3] == Title(
+        "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
+    )
@@ -15,8 +15,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 
 set +e
 
-if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then
+if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 4 ]; then
     echo
-    echo "3 files should have been created."
+    echo "4 files should have been created."
     exit 1
 fi
unstructured/__version__.py

@@ -1 +1 @@
-__version__ = "0.6.3-dev0"  # pragma: no cover
+__version__ = "0.6.3-dev1"  # pragma: no cover
unstructured/partition/pdf.py

@@ -1,12 +1,15 @@
+import re
 import warnings
-from io import StringIO
 from typing import BinaryIO, List, Optional, cast
 
+from pdfminer.high_level import extract_pages
 from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 from pdfminer.utils import open_filename
 
+from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Element, ElementMetadata, PageBreak
 from unstructured.logger import logger
+from unstructured.nlp.patterns import PARAGRAPH_PATTERN
 from unstructured.partition import _partition_via_api
 from unstructured.partition.common import (
     add_element_metadata,

@@ -285,31 +288,29 @@ def _process_pdfminer_pages(
     include_page_breaks: bool = False,
 ):
     """Uses PDF miner to split a document into pages and process them."""
-    from pdfminer.converter import TextConverter
-    from pdfminer.layout import LAParams
-    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-
-    rsrcmgr = PDFResourceManager(caching=False)
-    laparams = LAParams()
-
     elements: List[Element] = []
 
-    for i, page in enumerate(PDFPage.get_pages(fp, check_extractable=True)):
+    for i, page in enumerate(extract_pages(fp)):  # type: ignore
         metadata = ElementMetadata(filename=filename, page_number=i + 1)
-        with StringIO() as output_string:
-            device = TextConverter(
-                rsrcmgr,
-                output_string,
-                codec=encoding,
-                laparams=laparams,
-            )
-            interpreter = PDFPageInterpreter(rsrcmgr, device)
-            interpreter.process_page(page)
-            text = output_string.getvalue()
-            _elements = partition_text(text=text)
-            for element in _elements:
-                element.metadata = metadata
-                elements.append(element)
+        text_segments = []
+        for obj in page:
+            # NOTE(robinson) - "Figure" is an example of an object type that does
+            # not have a get_text method
+            if not hasattr(obj, "get_text"):
+                continue
+            _text = obj.get_text()
+            _text = re.sub(PARAGRAPH_PATTERN, " ", _text)
+            _text = clean_extra_whitespace(_text)
+            if _text.strip():
+                text_segments.append(_text)
+
+        text = "\n\n".join(text_segments)
+
+        _elements = partition_text(text=text)
+        for element in _elements:
+            element.metadata = metadata
+            elements.append(element)
 
         if include_page_breaks:
             elements.append(PageBreak())
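The heart of the change is that pdfminer's extract_pages yields one layout object per text box, so obj.get_text() returns everything inside a single bounding box; joining those per-box strings with blank lines lets partition_text treat each box as one unit. A standalone sketch of that grouping idea, assuming only pdfminer.six and the example PDF (the helper name text_boxes_per_page is illustrative, not part of the library):

# Standalone sketch of grouping text by pdfminer bounding box.
# Assumes pdfminer.six is installed; the helper name is illustrative.
from typing import List

from pdfminer.high_level import extract_pages


def text_boxes_per_page(filename: str) -> List[List[str]]:
    """Return one string per text box (bounding box) for each page."""
    pages = []
    for page_layout in extract_pages(filename):
        boxes = []
        for obj in page_layout:
            # Layout objects such as LTFigure or LTLine have no get_text method.
            if not hasattr(obj, "get_text"):
                continue
            text = obj.get_text().strip()
            if text:
                boxes.append(text)
        pages.append(boxes)
    return pages


if __name__ == "__main__":
    for i, boxes in enumerate(text_boxes_per_page("example-docs/chevron-page.pdf"), start=1):
        print(f"page {i}: {len(boxes)} text boxes")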