mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-11 16:13:24 +00:00
fix: group together text from the same bounding box in partition_pdf
with fast strategy (#542)
* switch to using PDF objects
* linting, linting, linting
* couple more tweaks
* added test for chevron-page
* version and changelog
* linting, linting, linting
* now processing 4 files
This commit is contained in:
parent
7e43a25f07
commit
aa01cdfc7a
@ -1,4 +1,4 @@
|
|||||||
## 0.6.3-dev0
|
## 0.6.3-dev1
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -10,6 +10,9 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* Updates the grouping logic in the `partition_pdf` fast strategy to group together text
|
||||||
|
in the same bounding box.
|
||||||
|
|
||||||
|
|
||||||
## 0.6.2
|
## 0.6.2
|
||||||
|
|
||||||
|
BIN
example-docs/chevron-page.pdf
Normal file
BIN
example-docs/chevron-page.pdf
Normal file
Binary file not shown.
@ -319,3 +319,18 @@ def test_partition_pdf_fails_if_pdf_not_processable(
|
|||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
pdf.partition_pdf(filename=filename)
|
pdf.partition_pdf(filename=filename)
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_pdf_fast_groups_text_in_text_box():
|
||||||
|
filename = os.path.join("example-docs", "chevron-page.pdf")
|
||||||
|
elements = pdf.partition_pdf(filename=filename, strategy="fast")
|
||||||
|
|
||||||
|
assert elements[0] == Title("eastern mediterranean")
|
||||||
|
|
||||||
|
assert isinstance(elements[1], NarrativeText)
|
||||||
|
assert str(elements[1]).startswith("We")
|
||||||
|
assert str(elements[1]).endswith("Jordan and Egypt.")
|
||||||
|
|
||||||
|
assert elements[3] == Title(
|
||||||
|
"kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
|
||||||
|
)
|
||||||
|
@ -15,8 +15,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
set +e
|
set +e
|
||||||
|
|
||||||
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 3 ]; then
|
if [ "$(find 'api-ingest-output' -type f -printf '.' | wc -c)" != 4 ]; then
|
||||||
echo
|
echo
|
||||||
echo "3 files should have been created."
|
echo "4 files should have been created."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.6.3-dev0" # pragma: no cover
|
__version__ = "0.6.3-dev1" # pragma: no cover
|
||||||
|
@ -1,12 +1,15 @@
|
|||||||
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from io import StringIO
|
|
||||||
from typing import BinaryIO, List, Optional, cast
|
from typing import BinaryIO, List, Optional, cast
|
||||||
|
|
||||||
|
from pdfminer.high_level import extract_pages
|
||||||
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
||||||
from pdfminer.utils import open_filename
|
from pdfminer.utils import open_filename
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
from unstructured.documents.elements import Element, ElementMetadata, PageBreak
|
from unstructured.documents.elements import Element, ElementMetadata, PageBreak
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
|
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||||
from unstructured.partition import _partition_via_api
|
from unstructured.partition import _partition_via_api
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common import (
|
||||||
add_element_metadata,
|
add_element_metadata,
|
||||||
@ -285,27 +288,25 @@ def _process_pdfminer_pages(
|
|||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
):
|
):
|
||||||
"""Uses PDF miner to split a document into pages and process them."""
|
"""Uses PDF miner to split a document into pages and process them."""
|
||||||
from pdfminer.converter import TextConverter
|
|
||||||
from pdfminer.layout import LAParams
|
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
|
|
||||||
|
|
||||||
rsrcmgr = PDFResourceManager(caching=False)
|
|
||||||
laparams = LAParams()
|
|
||||||
|
|
||||||
elements: List[Element] = []
|
elements: List[Element] = []
|
||||||
|
|
||||||
for i, page in enumerate(PDFPage.get_pages(fp, check_extractable=True)):
|
for i, page in enumerate(extract_pages(fp)): # type: ignore
|
||||||
metadata = ElementMetadata(filename=filename, page_number=i + 1)
|
metadata = ElementMetadata(filename=filename, page_number=i + 1)
|
||||||
with StringIO() as output_string:
|
|
||||||
device = TextConverter(
|
text_segments = []
|
||||||
rsrcmgr,
|
for obj in page:
|
||||||
output_string,
|
# NOTE(robinson) - "Figure" is an example of an object type that does
|
||||||
codec=encoding,
|
# not have a get_text method
|
||||||
laparams=laparams,
|
if not hasattr(obj, "get_text"):
|
||||||
)
|
continue
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
_text = obj.get_text()
|
||||||
interpreter.process_page(page)
|
_text = re.sub(PARAGRAPH_PATTERN, " ", _text)
|
||||||
text = output_string.getvalue()
|
_text = clean_extra_whitespace(_text)
|
||||||
|
if _text.strip():
|
||||||
|
text_segments.append(_text)
|
||||||
|
|
||||||
|
text = "\n\n".join(text_segments)
|
||||||
|
|
||||||
_elements = partition_text(text=text)
|
_elements = partition_text(text=text)
|
||||||
for element in _elements:
|
for element in _elements:
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
|
Loading…
x
Reference in New Issue
Block a user