refactor: embedded text processing modules (#2535)

This PR is similar to the OCR module refactoring PR:
https://github.com/Unstructured-IO/unstructured/pull/2492.

### Summary
- Refactor the modules related to "embedded text extraction" so that functions
which require external libraries are decorated with `@requires_dependencies`
and import those libraries inside the function body instead of at module
level.
- Add missing test cases for the `pdf_image_utils.py` module to improve
average test coverage.

### Testing
CI should pass.
This commit is contained in:
Christine Straub 2024-02-13 21:19:07 -08:00 committed by GitHub
parent d9f8467187
commit d11a83ce65
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 314 additions and 134 deletions

View File

@ -31,7 +31,7 @@
* **Fix `partition_pdf()` not working when using chipper model with `file`** * **Fix `partition_pdf()` not working when using chipper model with `file`**
* **Handle common incorrect arguments for `languages` and `ocr_languages`** Users are regularly receiving errors on the API because they are defining `ocr_languages` or `languages` with additional quotationmarks, brackets, and similar mistakes. This update handles common incorrect arguments and raises an appropriate warning. * **Handle common incorrect arguments for `languages` and `ocr_languages`** Users are regularly receiving errors on the API because they are defining `ocr_languages` or `languages` with additional quotationmarks, brackets, and similar mistakes. This update handles common incorrect arguments and raises an appropriate warning.
* **Default `hi_res_model_name` now relies on `unstructured-inference`** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image` the default model is picked by `unstructured-inference`'s settings or os env variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; it now returns the same model name regardless of `infer_table_structure`'s value; this function will be deprecated in the future and the default model name will simply rely on `unstructured-inference` and will not consider os env in a future release. * **Default `hi_res_model_name` now relies on `unstructured-inference`** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image` the default model is picked by `unstructured-inference`'s settings or os env variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; it now returns the same model name regardless of `infer_table_structure`'s value; this function will be deprecated in the future and the default model name will simply rely on `unstructured-inference` and will not consider os env in a future release.
* **Fix remove Vectara requirements from setup.py - there are no dependencies ** * **Fix remove Vectara requirements from setup.py - there are no dependencies**
* **Add missing dependency files to package manifest**. Updates the file path for the ingest * **Add missing dependency files to package manifest**. Updates the file path for the ingest
dependencies and adds missing extra dependencies. dependencies and adds missing extra dependencies.
* **Fix remove Vectara requirements from setup.py - there are no dependencies ** * **Fix remove Vectara requirements from setup.py - there are no dependencies **

View File

@ -1,5 +1,6 @@
import os import os
import tempfile import tempfile
from unittest.mock import MagicMock, patch
import numpy as np import numpy as np
import pytest import pytest
@ -60,46 +61,66 @@ def test_convert_pdf_to_image(
assert isinstance(images[0], PILImg.Image) assert isinstance(images[0], PILImg.Image)
def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")):
    """`path_only=True` without an `output_folder` must raise a ValueError with a fixed message."""
    expected_message = "output_folder must be specified if path_only is true"
    conversion_kwargs = {"filename": filename, "path_only": True, "output_folder": None}

    with pytest.raises(ValueError) as raised:
        pdf_image_utils.convert_pdf_to_image(**conversion_kwargs)

    assert expected_message == str(raised.value)
@pytest.mark.parametrize(
("filename", "is_image"),
[
(example_doc_path("layout-parser-paper-fast.pdf"), False),
(example_doc_path("layout-parser-paper-fast.jpg"), True),
],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) @pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_save_elements( def test_save_elements(
element_category_to_save, element_category_to_save,
extract_image_block_to_payload, extract_image_block_to_payload,
filename=example_doc_path("layout-parser-paper-fast.pdf"), filename,
is_image,
): ):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
elements = [ elements = [
Image( Image(
text="3", text="Image Text 1",
coordinates=((78, 86), (78, 519), (512, 519), (512, 86)), coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
coordinate_system=PixelSpace(width=1575, height=1166), coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1), metadata=ElementMetadata(page_number=1),
), ),
Image( Image(
text="4", text="Image Text 2",
coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)), coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
coordinate_system=PixelSpace(width=1575, height=1166), coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1), metadata=ElementMetadata(page_number=1),
), ),
Image( Image(
text="5", text="Table 1",
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)), coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166), coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1), metadata=ElementMetadata(page_number=1),
), ),
]
if not is_image:
# add a page 2 element
elements.append(
Table( Table(
text="Sample Table", text="Table 2",
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)), coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166), coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=2), metadata=ElementMetadata(page_number=2),
), ),
] )
pdf_image_utils.save_elements( pdf_image_utils.save_elements(
elements=elements, elements=elements,
element_category_to_save=element_category_to_save, element_category_to_save=element_category_to_save,
pdf_image_dpi=200, pdf_image_dpi=200,
filename=filename, filename=filename,
is_image=is_image,
output_dir_path=str(tmpdir), output_dir_path=str(tmpdir),
extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_to_payload=extract_image_block_to_payload,
) )
@ -122,6 +143,30 @@ def test_save_elements(
assert not el.metadata.image_mime_type assert not el.metadata.image_mime_type
def test_save_elements_with_output_dir_path_none():
    """With `output_dir_path=None`, `save_elements` must create `figures/` under the cwd.

    Image loading, image writing and PDF conversion are patched out, so only the
    directory-creation side effect is exercised. The test runs inside a temporary
    working directory so that nothing is left behind in the real cwd.
    """
    with (
        patch("PIL.Image.open"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"),
        tempfile.TemporaryDirectory() as tmpdir,
    ):
        original_cwd = os.getcwd()
        os.chdir(tmpdir)
        try:
            pdf_image_utils.save_elements(
                elements=[],
                element_category_to_save="",
                pdf_image_dpi=200,
                filename="dummy.pdf",
                output_dir_path=None,
            )

            # Verify that the images are saved in the expected directory
            expected_output_dir = os.path.join(tmpdir, "figures")
            assert os.path.exists(expected_output_dir)
            assert os.path.isdir(expected_output_dir)
        finally:
            # Always restore the cwd, even when the call or an assertion fails: otherwise
            # every later test would run inside a directory that is about to be deleted,
            # and the temporary directory cleanup itself can fail on some platforms.
            os.chdir(original_cwd)
def test_write_image_raises_error(): def test_write_image_raises_error():
with pytest.raises(ValueError): with pytest.raises(ValueError):
pdf_image_utils.write_image("invalid_type", "test_image.jpg") pdf_image_utils.write_image("invalid_type", "test_image.jpg")
@ -141,3 +186,126 @@ def test_pad_bbox():
result = pdf_image_utils.pad_bbox(bbox, padding) result = pdf_image_utils.pad_bbox(bbox, padding)
assert result == expected assert result == expected
@pytest.mark.parametrize(
    ("input_types", "expected"),
    [
        (None, []),
        (["table", "image"], ["Table", "Image"]),
        (["unknown"], ["Unknown"]),
        (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]),
    ],
)
def test_check_element_types_to_extract(input_types, expected):
    """Element type names are capitalized regardless of input casing; `None` yields `[]`."""
    normalized_types = pdf_image_utils.check_element_types_to_extract(input_types)

    assert normalized_types == expected
def test_check_element_types_to_extract_raises_error():
    """A non-list argument must be rejected with a TypeError mentioning "must be a list"."""
    not_a_list = "not a list"

    with pytest.raises(TypeError) as raised:
        pdf_image_utils.check_element_types_to_extract(not_a_list)

    error_text = str(raised.value)
    assert "must be a list" in error_text
class MockPageLayout:
    """Minimal stand-in for a page layout that exposes only `annotate`."""

    def annotate(self, colors):
        """Return a fixed placeholder instead of rendering an annotated page image."""
        return "mock_image"


class MockDocumentLayout:
    """Minimal stand-in for a document layout holding two mock pages."""

    # Every entry must be an *instance*: storing the bare class would make
    # `page.annotate(colors=...)` fail with a missing-`self` TypeError.
    pages = [MockPageLayout(), MockPageLayout()]
def test_annotate_layout_elements_with_image():
    """Annotated images are written for the inferred layout and, when given, the extracted one.

    `write_image` is patched, so the test only inspects the destination paths passed to it.
    """
    output_basename = "test_page"
    page_number = 1
    inferred_name = f"{output_basename}_{page_number}_inferred.jpg"
    extracted_name = f"{output_basename}_{page_number}_extracted.jpg"

    def collect_written_paths(extracted_page_layout):
        """Run the annotation once and return the second positional arg of each write."""
        with (
            tempfile.TemporaryDirectory() as tmpdir,
            patch(
                "unstructured.partition.pdf_image.pdf_image_utils.write_image"
            ) as mock_write_image,
        ):
            pdf_image_utils.annotate_layout_elements_with_image(
                inferred_page_layout=MockPageLayout(),
                extracted_page_layout=extracted_page_layout,
                output_dir_path=str(tmpdir),
                output_f_basename=output_basename,
                page_number=page_number,
            )
            return [recorded.args[1] for recorded in mock_write_image.call_args_list]

    # Check if images for both layouts were saved
    paths_with_both = collect_written_paths(MockPageLayout())
    for wanted_name in (inferred_name, extracted_name):
        assert any(wanted_name in written_path for written_path in paths_with_both)

    # Check if only the inferred layout image was saved if extracted layout is None
    paths_inferred_only = collect_written_paths(None)
    assert any(inferred_name in written_path for written_path in paths_inferred_only)
    assert len(paths_inferred_only) == 1  # Only one image should be saved
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("layout-parser-paper-fast.pdf"), False),
        (example_doc_path("layout-parser-paper-fast.jpg"), True),
    ],
)
def test_annotate_layout_elements(filename, is_image):
    """An image input is annotated once; a PDF is annotated once per converted page image.

    Image opening, PDF-to-image conversion and the per-page annotation helper are all
    patched, so only the dispatch logic of `annotate_layout_elements` is exercised.
    """
    # Pass a layout *instance* (not the class object) so the test mirrors how a real
    # document layout is handed to the function under test.
    inferred_document_layout = MockDocumentLayout()
    extracted_layout = [MagicMock(), MagicMock()]

    with (
        patch("PIL.Image.open"),
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image",
            return_value=["/path/to/image1.jpg", "/path/to/image2.jpg"],
        ) as mock_pdf2image,
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image"
        ) as mock_annotate_layout_elements_with_image,
    ):
        pdf_image_utils.annotate_layout_elements(
            inferred_document_layout=inferred_document_layout,
            extracted_layout=extracted_layout,
            filename=filename,
            output_dir_path="/output",
            pdf_image_dpi=200,
            is_image=is_image,
        )

        if is_image:
            mock_annotate_layout_elements_with_image.assert_called_once()
        else:
            # One annotation call is expected for each page image the converter returned.
            assert mock_annotate_layout_elements_with_image.call_count == len(
                mock_pdf2image.return_value
            )
def test_annotate_layout_elements_file_not_found_error():
    """Annotating a non-existent image file must surface a FileNotFoundError."""
    call_kwargs = {
        "inferred_document_layout": MagicMock(),
        "extracted_layout": [],
        "filename": "nonexistent.jpg",
        "output_dir_path": "/output",
        "pdf_image_dpi": 200,
        "is_image": True,
    }

    with pytest.raises(FileNotFoundError):
        pdf_image_utils.annotate_layout_elements(**call_kwargs)

View File

@ -227,6 +227,112 @@ def partition_pdf(
) )
def partition_pdf_or_image(
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
    is_image: bool = False,
    include_page_breaks: bool = False,
    strategy: str = PartitionStrategy.AUTO,
    infer_table_structure: bool = False,
    ocr_languages: Optional[str] = None,
    languages: Optional[List[str]] = None,
    metadata_last_modified: Optional[str] = None,
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
    extract_image_block_types: Optional[List[str]] = None,
    extract_image_block_output_dir: Optional[str] = None,
    extract_image_block_to_payload: bool = False,
    **kwargs,
) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements.

    The requested strategy is validated, embedded text is extracted first (PDFs only),
    the effective strategy is then resolved, and the document is dispatched to the
    matching partitioner.

    Args:
        filename: Path of the document; forwarded to every helper called here.
        file: Alternative in-memory/file-like input; forwarded to every helper called here.
        is_image: When True the embedded-text extraction pass is skipped.
        include_page_breaks: Forwarded to the extraction and partitioning helpers.
        strategy: Requested strategy; validated, then replaced by the value returned from
            `determine_pdf_or_image_strategy`.
        infer_table_structure: Used for strategy resolution and forwarded to the hi-res
            partitioner.
        ocr_languages: Accepted for interface compatibility; not referenced in this body.
        languages: Forwarded to the extraction and partitioning helpers.
        metadata_last_modified: Takes precedence over the detected modification date
            when truthy.
        hi_res_model_name: Forwarded to the hi-res partitioner only.
        extract_images_in_pdf: Used for strategy resolution and forwarded to the hi-res
            partitioner.
        extract_image_block_types: Used for strategy resolution and forwarded to the
            hi-res partitioner.
        extract_image_block_output_dir: Forwarded to the hi-res partitioner only.
        extract_image_block_to_payload: Forwarded to the hi-res partitioner only.
        **kwargs: Forwarded unchanged to the extraction and partitioning helpers.

    Returns:
        For the FAST strategy, the elements produced by `extractable_elements` as-is.
        For HI_RES and OCR_ONLY, the partitioner output after
        `_process_uncategorized_text_elements`.
    """
    # TODO(alan): Extract information about the filetype to be processed from the template
    # route. Decoding the routing should probably be handled by a single function designed for
    # that task so as routing design changes, those changes are implemented in a single
    # function.

    # init ability to process .heic files
    register_heif_opener()

    validate_strategy(strategy, is_image)

    last_modification_date = get_the_last_modification_date_pdf_or_img(
        file=file,
        filename=filename,
    )

    extracted_elements = []
    pdf_text_extractable = False
    if not is_image:
        # Best-effort pass: any failure is logged and the function carries on with
        # `extracted_elements == []` and `pdf_text_extractable == False`.
        try:
            extracted_elements = extractable_elements(
                filename=filename,
                file=spooled_to_bytes_io_if_needed(file),
                include_page_breaks=include_page_breaks,
                languages=languages,
                metadata_last_modified=metadata_last_modified or last_modification_date,
                **kwargs,
            )
            # Extractable means at least one Text element with non-whitespace content.
            pdf_text_extractable = any(
                isinstance(el, Text) and el.text.strip() for el in extracted_elements
            )
        except Exception as e:
            logger.error(e)
            logger.warning("PDF text extraction failed, skip text extraction...")

    strategy = determine_pdf_or_image_strategy(
        strategy,
        is_image=is_image,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
        extract_images_in_pdf=extract_images_in_pdf,
        extract_image_block_types=extract_image_block_types,
    )

    if file is not None:
        # Rewind before handing the input to the partitioner below; presumably the
        # extraction pass above may have advanced the stream position.
        # NOTE(review): the annotation allows `bytes`, which has no `seek` - confirm
        # callers never pass raw bytes here.
        file.seek(0)

    if strategy == PartitionStrategy.HI_RES:
        # NOTE(robinson): Catches a UserWarning that occurs when detectron is called
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            elements = _partition_pdf_or_image_local(
                filename=filename,
                file=spooled_to_bytes_io_if_needed(file),
                is_image=is_image,
                infer_table_structure=infer_table_structure,
                include_page_breaks=include_page_breaks,
                languages=languages,
                metadata_last_modified=metadata_last_modified or last_modification_date,
                hi_res_model_name=hi_res_model_name,
                pdf_text_extractable=pdf_text_extractable,
                extract_images_in_pdf=extract_images_in_pdf,
                extract_image_block_types=extract_image_block_types,
                extract_image_block_output_dir=extract_image_block_output_dir,
                extract_image_block_to_payload=extract_image_block_to_payload,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)

    elif strategy == PartitionStrategy.FAST:
        # The already-extracted embedded text is the result; it is returned without
        # going through `_process_uncategorized_text_elements`.
        return extracted_elements

    elif strategy == PartitionStrategy.OCR_ONLY:
        # NOTE(robinson): Catches file conversion warnings when running with PDFs
        # NOTE(review): unlike the HI_RES branch, no `simplefilter("ignore")` is installed
        # here, so this context manager only saves/restores the warning filter state.
        # `file` is also passed as-is rather than via `spooled_to_bytes_io_if_needed`.
        with warnings.catch_warnings():
            elements = _partition_pdf_or_image_with_ocr(
                filename=filename,
                file=file,
                include_page_breaks=include_page_breaks,
                languages=languages,
                is_image=is_image,
                metadata_last_modified=metadata_last_modified or last_modification_date,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)

    # NOTE(review): `out_elements` is only bound in the HI_RES and OCR_ONLY branches;
    # presumably the strategy helpers above guarantee no other value reaches this point.
    return out_elements
def extractable_elements( def extractable_elements(
filename: str = "", filename: str = "",
file: Optional[Union[bytes, IO[bytes]]] = None, file: Optional[Union[bytes, IO[bytes]]] = None,
@ -471,112 +577,6 @@ def _partition_pdf_or_image_local(
return out_elements return out_elements
def partition_pdf_or_image(
filename: str = "",
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
is_image: bool = False,
include_page_breaks: bool = False,
strategy: str = PartitionStrategy.AUTO,
infer_table_structure: bool = False,
ocr_languages: Optional[str] = None,
languages: Optional[List[str]] = None,
metadata_last_modified: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
# TODO(alan): Extract information about the filetype to be processed from the template
# route. Decoding the routing should probably be handled by a single function designed for
# that task so as routing design changes, those changes are implemented in a single
# function.
# init ability to process .heic files
register_heif_opener()
validate_strategy(strategy, is_image)
last_modification_date = get_the_last_modification_date_pdf_or_img(
file=file,
filename=filename,
)
extracted_elements = []
pdf_text_extractable = False
if not is_image:
try:
extracted_elements = extractable_elements(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
**kwargs,
)
pdf_text_extractable = any(
isinstance(el, Text) and el.text.strip() for el in extracted_elements
)
except Exception as e:
logger.error(e)
logger.warning("PDF text extraction failed, skip text extraction...")
strategy = determine_pdf_or_image_strategy(
strategy,
is_image=is_image,
pdf_text_extractable=pdf_text_extractable,
infer_table_structure=infer_table_structure,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
)
if file is not None:
file.seek(0)
if strategy == PartitionStrategy.HI_RES:
# NOTE(robinson): Catches a UserWarning that occurs when detectron is called
with warnings.catch_warnings():
warnings.simplefilter("ignore")
elements = _partition_pdf_or_image_local(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
is_image=is_image,
infer_table_structure=infer_table_structure,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
hi_res_model_name=hi_res_model_name,
pdf_text_extractable=pdf_text_extractable,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
elif strategy == PartitionStrategy.FAST:
return extracted_elements
elif strategy == PartitionStrategy.OCR_ONLY:
# NOTE(robinson): Catches file conversion warnings when running with PDFs
with warnings.catch_warnings():
elements = _partition_pdf_or_image_with_ocr(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
languages=languages,
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modification_date,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
return out_elements
def _process_uncategorized_text_elements(elements: List[Element]): def _process_uncategorized_text_elements(elements: List[Element]):
"""Processes a list of elements, creating a new list where elements with the """Processes a list of elements, creating a new list where elements with the
category `UncategorizedText` are replaced with corresponding category `UncategorizedText` are replaced with corresponding
@ -594,7 +594,6 @@ def _process_uncategorized_text_elements(elements: List[Element]):
return out_elements return out_elements
@requires_dependencies("pdfminer", "local-inference")
def _partition_pdf_with_pdfminer( def _partition_pdf_with_pdfminer(
filename: str, filename: str,
file: Optional[IO[bytes]], file: Optional[IO[bytes]],
@ -673,6 +672,7 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
return wrapped(resources) return wrapped(resources)
@requires_dependencies("pdfminer")
def _process_pdfminer_pages( def _process_pdfminer_pages(
fp: BinaryIO, fp: BinaryIO,
filename: str, filename: str,
@ -683,6 +683,7 @@ def _process_pdfminer_pages(
**kwargs, **kwargs,
): ):
"""Uses PDFMiner to split a document into pages and process them.""" """Uses PDFMiner to split a document into pages and process them."""
elements: List[Element] = [] elements: List[Element] = []
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)): for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):

View File

@ -1,16 +1,6 @@
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
from pdfminer.utils import open_filename from pdfminer.utils import open_filename
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
ImageTextRegion,
TextRegion,
)
from unstructured_inference.inference.layoutelement import (
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
)
from unstructured_inference.inference.ordering import order_layout
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
from unstructured.partition.pdf_image.pdfminer_utils import ( from unstructured.partition.pdf_image.pdfminer_utils import (
get_images_from_pdf_element, get_images_from_pdf_element,
@ -19,15 +9,17 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
) )
from unstructured.partition.utils.constants import Source from unstructured.partition.utils.constants import Source
from unstructured.partition.utils.sorting import sort_text_regions from unstructured.partition.utils.sorting import sort_text_regions
from unstructured.utils import requires_dependencies
if TYPE_CHECKING: if TYPE_CHECKING:
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.inference.layout import DocumentLayout
def process_file_with_pdfminer( def process_file_with_pdfminer(
filename: str = "", filename: str = "",
dpi: int = 200, dpi: int = 200,
) -> List[List[TextRegion]]: ) -> List[List["TextRegion"]]:
with open_filename(filename, "rb") as fp: with open_filename(filename, "rb") as fp:
fp = cast(BinaryIO, fp) fp = cast(BinaryIO, fp)
extracted_layout = process_data_with_pdfminer( extracted_layout = process_data_with_pdfminer(
@ -37,13 +29,20 @@ def process_file_with_pdfminer(
return extracted_layout return extracted_layout
@requires_dependencies("unstructured_inference")
def process_data_with_pdfminer( def process_data_with_pdfminer(
file: Optional[Union[bytes, BinaryIO]] = None, file: Optional[Union[bytes, BinaryIO]] = None,
dpi: int = 200, dpi: int = 200,
) -> List[List[TextRegion]]: ) -> List[List["TextRegion"]]:
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
pdf pages using pdf2image""" pdf pages using pdf2image"""
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
ImageTextRegion,
)
from unstructured_inference.inference.ordering import order_layout
layouts = [] layouts = []
# Coefficient to rescale bounding box to be compatible with images # Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72 coef = dpi / 72
@ -89,10 +88,18 @@ def process_data_with_pdfminer(
return layouts return layouts
@requires_dependencies("unstructured_inference")
def merge_inferred_with_extracted_layout( def merge_inferred_with_extracted_layout(
inferred_document_layout: "DocumentLayout", inferred_document_layout: "DocumentLayout",
extracted_layout: List[List[TextRegion]], extracted_layout: List[List["TextRegion"]],
) -> "DocumentLayout": ) -> "DocumentLayout":
"""Merge an inferred layout with an extracted layout"""
from unstructured_inference.inference.layoutelement import (
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
)
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
inferred_pages = inferred_document_layout.pages inferred_pages = inferred_document_layout.pages
for i, (inferred_page, extracted_page_layout) in enumerate( for i, (inferred_page, extracted_page_layout) in enumerate(
zip(inferred_pages, extracted_layout) zip(inferred_pages, extracted_layout)
@ -120,7 +127,7 @@ def merge_inferred_with_extracted_layout(
) )
elements = inferred_page.get_elements_from_layout( elements = inferred_page.get_elements_from_layout(
layout=cast(List[TextRegion], merged_layout), layout=cast(List["TextRegion"], merged_layout),
pdf_objects=extracted_page_layout, pdf_objects=extracted_page_layout,
) )

View File

@ -1,7 +1,6 @@
import tempfile import tempfile
from typing import Any, BinaryIO, List, Tuple from typing import Any, BinaryIO, List, Tuple
import pikepdf
from pdfminer.converter import PDFPageAggregator from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTImage from pdfminer.layout import LAParams, LTContainer, LTImage
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
@ -9,7 +8,7 @@ from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PSSyntaxError from pdfminer.pdfparser import PSSyntaxError
from unstructured.logger import logger from unstructured.logger import logger
from unstructured.partition.pdf_image.pypdf_utils import get_page_data from unstructured.utils import requires_dependencies
def init_pdfminer(): def init_pdfminer():
@ -79,11 +78,16 @@ def rect_to_bbox(
return (x1, y1, x2, y2) return (x1, y1, x2, y2)
@requires_dependencies(["pikepdf", "pypdf"])
def open_pdfminer_pages_generator( def open_pdfminer_pages_generator(
fp: BinaryIO, fp: BinaryIO,
): ):
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
import pikepdf
from unstructured.partition.pdf_image.pypdf_utils import get_page_data
device, interpreter = init_pdfminer() device, interpreter = init_pdfminer()
try: try:
pages = PDFPage.get_pages(fp) pages = PDFPage.get_pages(fp)