refactor: embedded text processing modules (#2535)

This PR is similar to the OCR module refactoring PR:
https://github.com/Unstructured-IO/unstructured/pull/2492.

### Summary
- Refactor the "embedded text extraction" related modules to use the
`@requires_dependencies` decorator on functions that require external
libraries, and import those libraries inside those functions instead of at
module level (see the sketch below).
- Add missing test cases for the `pdf_image_utils.py` module to improve
average test coverage.
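
For context, here is a minimal sketch of the pattern this refactor applies (the function name below is hypothetical; the real call sites are in the diff, e.g. `process_data_with_pdfminer` and `open_pdfminer_pages_generator`):

```python
from typing import Any, List

from unstructured.utils import requires_dependencies


@requires_dependencies("unstructured_inference")
def order_extracted_pages(extracted_layout: List[List[Any]]) -> List[List[Any]]:
    # The heavy dependency is imported lazily, inside the function, so merely
    # importing this module no longer requires `unstructured_inference`.
    from unstructured_inference.inference.ordering import order_layout

    return [order_layout(page) for page in extracted_layout]
```

The decorator verifies that the listed package is importable when the function is called and raises an error pointing at the missing extra otherwise, which keeps module import lightweight for users who never hit these code paths.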

### Testing
CI should pass.
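
Locally, a quick sanity check of the lazy-import behavior could look like the following sketch (assuming `open_pdfminer_pages_generator` lives in `unstructured.partition.pdf_image.pdfminer_utils`, as in the current repo layout):

```python
import importlib

# Importing the module should now succeed even when optional extras such as
# pikepdf/pypdf are not installed, because those imports were moved inside
# the decorated function.
mod = importlib.import_module("unstructured.partition.pdf_image.pdfminer_utils")
assert hasattr(mod, "open_pdfminer_pages_generator")
```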
Christine Straub 2024-02-13 21:19:07 -08:00 committed by GitHub
parent d9f8467187
commit d11a83ce65
5 changed files with 314 additions and 134 deletions

View File

@@ -31,7 +31,7 @@
* **Fix `partition_pdf()` not working when using chipper model with `file`**
* **Handle common incorrect arguments for `languages` and `ocr_languages`** Users are regularly receiving errors on the API because they are defining `ocr_languages` or `languages` with additional quotation marks, brackets, and similar mistakes. This update handles common incorrect arguments and raises an appropriate warning.
* **Default `hi_res_model_name` now relies on `unstructured-inference`** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image` the default model is picked by `unstructured-inference`'s settings or os env variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; it now returns the same model name regardless of `infer_table_structure`'s value; this function will be deprecated in the future and the default model name will simply rely on `unstructured-inference` and will not consider os env in a future release.
* **Fix remove Vectara requirements from setup.py - there are no dependencies **
* **Fix remove Vectara requirements from setup.py - there are no dependencies**
* **Add missing dependency files to package manifest**. Updates the file path for the ingest
dependencies and adds missing extra dependencies.
* **Fix remove Vectara requirements from setup.py - there are no dependencies **

View File

@@ -1,5 +1,6 @@
import os
import tempfile
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
@@ -60,46 +61,66 @@ def test_convert_pdf_to_image(
assert isinstance(images[0], PILImg.Image)
def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")):
with pytest.raises(ValueError) as exc_info:
pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)
assert str(exc_info.value) == "output_folder must be specified if path_only is true"
@pytest.mark.parametrize(
("filename", "is_image"),
[
(example_doc_path("layout-parser-paper-fast.pdf"), False),
(example_doc_path("layout-parser-paper-fast.jpg"), True),
],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_save_elements(
element_category_to_save,
extract_image_block_to_payload,
filename=example_doc_path("layout-parser-paper-fast.pdf"),
filename,
is_image,
):
with tempfile.TemporaryDirectory() as tmpdir:
elements = [
Image(
text="3",
text="Image Text 1",
coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1),
),
Image(
text="4",
text="Image Text 2",
coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1),
),
Image(
text="5",
text="Table 1",
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1),
),
Table(
text="Sample Table",
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=2),
),
]
if not is_image:
# add a page 2 element
elements.append(
Table(
text="Table 2",
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=2),
),
)
pdf_image_utils.save_elements(
elements=elements,
element_category_to_save=element_category_to_save,
pdf_image_dpi=200,
filename=filename,
is_image=is_image,
output_dir_path=str(tmpdir),
extract_image_block_to_payload=extract_image_block_to_payload,
)
@@ -122,6 +143,30 @@ def test_save_elements(
assert not el.metadata.image_mime_type
def test_save_elements_with_output_dir_path_none():
with (
patch("PIL.Image.open"),
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"),
tempfile.TemporaryDirectory() as tmpdir,
):
original_cwd = os.getcwd()
os.chdir(tmpdir)
pdf_image_utils.save_elements(
elements=[],
element_category_to_save="",
pdf_image_dpi=200,
filename="dummy.pdf",
output_dir_path=None,
)
# Verify that the images are saved in the expected directory
expected_output_dir = os.path.join(tmpdir, "figures")
assert os.path.exists(expected_output_dir)
assert os.path.isdir(expected_output_dir)
os.chdir(original_cwd)
def test_write_image_raises_error():
with pytest.raises(ValueError):
pdf_image_utils.write_image("invalid_type", "test_image.jpg")
@@ -141,3 +186,126 @@ def test_pad_bbox():
result = pdf_image_utils.pad_bbox(bbox, padding)
assert result == expected
@pytest.mark.parametrize(
("input_types", "expected"),
[
(None, []),
(["table", "image"], ["Table", "Image"]),
(["unknown"], ["Unknown"]),
(["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]),
],
)
def test_check_element_types_to_extract(input_types, expected):
assert pdf_image_utils.check_element_types_to_extract(input_types) == expected
def test_check_element_types_to_extract_raises_error():
with pytest.raises(TypeError) as exc_info:
pdf_image_utils.check_element_types_to_extract("not a list")
assert "must be a list" in str(exc_info.value)
class MockPageLayout:
def annotate(self, colors):
return "mock_image"
class MockDocumentLayout:
pages = [MockPageLayout(), MockPageLayout]
def test_annotate_layout_elements_with_image():
inferred_layout = MockPageLayout()
extracted_layout = MockPageLayout()
output_basename = "test_page"
page_number = 1
# Check if images for both layouts were saved
with (
tempfile.TemporaryDirectory() as tmpdir,
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image,
):
pdf_image_utils.annotate_layout_elements_with_image(
inferred_page_layout=inferred_layout,
extracted_page_layout=extracted_layout,
output_dir_path=str(tmpdir),
output_f_basename=output_basename,
page_number=page_number,
)
expected_filenames = [
f"{output_basename}_{page_number}_inferred.jpg",
f"{output_basename}_{page_number}_extracted.jpg",
]
actual_calls = [call.args[1] for call in mock_write_image.call_args_list]
for expected_filename in expected_filenames:
assert any(expected_filename in actual_call for actual_call in actual_calls)
# Check if only the inferred layout image was saved if extracted layout is None
with (
tempfile.TemporaryDirectory() as tmpdir,
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image,
):
pdf_image_utils.annotate_layout_elements_with_image(
inferred_page_layout=inferred_layout,
extracted_page_layout=None,
output_dir_path=str(tmpdir),
output_f_basename=output_basename,
page_number=page_number,
)
expected_filename = f"{output_basename}_{page_number}_inferred.jpg"
actual_calls = [call.args[1] for call in mock_write_image.call_args_list]
assert any(expected_filename in actual_call for actual_call in actual_calls)
assert len(actual_calls) == 1 # Only one image should be saved
@pytest.mark.parametrize(
("filename", "is_image"),
[
(example_doc_path("layout-parser-paper-fast.pdf"), False),
(example_doc_path("layout-parser-paper-fast.jpg"), True),
],
)
def test_annotate_layout_elements(filename, is_image):
inferred_document_layout = MockDocumentLayout
extracted_layout = [MagicMock(), MagicMock()]
with (
patch("PIL.Image.open"),
patch(
"unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image",
return_value=["/path/to/image1.jpg", "/path/to/image2.jpg"],
) as mock_pdf2image,
patch(
"unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image"
) as mock_annotate_layout_elements_with_image,
):
pdf_image_utils.annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
filename=filename,
output_dir_path="/output",
pdf_image_dpi=200,
is_image=is_image,
)
if is_image:
mock_annotate_layout_elements_with_image.assert_called_once()
else:
assert mock_annotate_layout_elements_with_image.call_count == len(
mock_pdf2image.return_value
)
def test_annotate_layout_elements_file_not_found_error():
with pytest.raises(FileNotFoundError):
pdf_image_utils.annotate_layout_elements(
inferred_document_layout=MagicMock(),
extracted_layout=[],
filename="nonexistent.jpg",
output_dir_path="/output",
pdf_image_dpi=200,
is_image=True,
)

View File

@@ -227,6 +227,112 @@ def partition_pdf(
)
def partition_pdf_or_image(
filename: str = "",
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
is_image: bool = False,
include_page_breaks: bool = False,
strategy: str = PartitionStrategy.AUTO,
infer_table_structure: bool = False,
ocr_languages: Optional[str] = None,
languages: Optional[List[str]] = None,
metadata_last_modified: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
# TODO(alan): Extract information about the filetype to be processed from the template
# route. Decoding the routing should probably be handled by a single function designed for
# that task so as routing design changes, those changes are implemented in a single
# function.
# init ability to process .heic files
register_heif_opener()
validate_strategy(strategy, is_image)
last_modification_date = get_the_last_modification_date_pdf_or_img(
file=file,
filename=filename,
)
extracted_elements = []
pdf_text_extractable = False
if not is_image:
try:
extracted_elements = extractable_elements(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
**kwargs,
)
pdf_text_extractable = any(
isinstance(el, Text) and el.text.strip() for el in extracted_elements
)
except Exception as e:
logger.error(e)
logger.warning("PDF text extraction failed, skip text extraction...")
strategy = determine_pdf_or_image_strategy(
strategy,
is_image=is_image,
pdf_text_extractable=pdf_text_extractable,
infer_table_structure=infer_table_structure,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
)
if file is not None:
file.seek(0)
if strategy == PartitionStrategy.HI_RES:
# NOTE(robinson): Catches a UserWarning that occurs when detectron is called
with warnings.catch_warnings():
warnings.simplefilter("ignore")
elements = _partition_pdf_or_image_local(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
is_image=is_image,
infer_table_structure=infer_table_structure,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
hi_res_model_name=hi_res_model_name,
pdf_text_extractable=pdf_text_extractable,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
elif strategy == PartitionStrategy.FAST:
return extracted_elements
elif strategy == PartitionStrategy.OCR_ONLY:
# NOTE(robinson): Catches file conversion warnings when running with PDFs
with warnings.catch_warnings():
elements = _partition_pdf_or_image_with_ocr(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
languages=languages,
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modification_date,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
return out_elements
def extractable_elements(
filename: str = "",
file: Optional[Union[bytes, IO[bytes]]] = None,
@@ -471,112 +577,6 @@ def _partition_pdf_or_image_local(
return out_elements
def partition_pdf_or_image(
filename: str = "",
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
is_image: bool = False,
include_page_breaks: bool = False,
strategy: str = PartitionStrategy.AUTO,
infer_table_structure: bool = False,
ocr_languages: Optional[str] = None,
languages: Optional[List[str]] = None,
metadata_last_modified: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
# TODO(alan): Extract information about the filetype to be processed from the template
# route. Decoding the routing should probably be handled by a single function designed for
# that task so as routing design changes, those changes are implemented in a single
# function.
# init ability to process .heic files
register_heif_opener()
validate_strategy(strategy, is_image)
last_modification_date = get_the_last_modification_date_pdf_or_img(
file=file,
filename=filename,
)
extracted_elements = []
pdf_text_extractable = False
if not is_image:
try:
extracted_elements = extractable_elements(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
**kwargs,
)
pdf_text_extractable = any(
isinstance(el, Text) and el.text.strip() for el in extracted_elements
)
except Exception as e:
logger.error(e)
logger.warning("PDF text extraction failed, skip text extraction...")
strategy = determine_pdf_or_image_strategy(
strategy,
is_image=is_image,
pdf_text_extractable=pdf_text_extractable,
infer_table_structure=infer_table_structure,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
)
if file is not None:
file.seek(0)
if strategy == PartitionStrategy.HI_RES:
# NOTE(robinson): Catches a UserWarning that occurs when detectron is called
with warnings.catch_warnings():
warnings.simplefilter("ignore")
elements = _partition_pdf_or_image_local(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
is_image=is_image,
infer_table_structure=infer_table_structure,
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
hi_res_model_name=hi_res_model_name,
pdf_text_extractable=pdf_text_extractable,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
elif strategy == PartitionStrategy.FAST:
return extracted_elements
elif strategy == PartitionStrategy.OCR_ONLY:
# NOTE(robinson): Catches file conversion warnings when running with PDFs
with warnings.catch_warnings():
elements = _partition_pdf_or_image_with_ocr(
filename=filename,
file=file,
include_page_breaks=include_page_breaks,
languages=languages,
is_image=is_image,
metadata_last_modified=metadata_last_modified or last_modification_date,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)
return out_elements
def _process_uncategorized_text_elements(elements: List[Element]):
"""Processes a list of elements, creating a new list where elements with the
category `UncategorizedText` are replaced with corresponding
@@ -594,7 +594,6 @@ def _process_uncategorized_text_elements(elements: List[Element]):
return out_elements
@requires_dependencies("pdfminer", "local-inference")
def _partition_pdf_with_pdfminer(
filename: str,
file: Optional[IO[bytes]],
@@ -673,6 +672,7 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
return wrapped(resources)
@requires_dependencies("pdfminer")
def _process_pdfminer_pages(
fp: BinaryIO,
filename: str,
@@ -683,6 +683,7 @@ def _process_pdfminer_pages(
**kwargs,
):
"""Uses PDFMiner to split a document into pages and process them."""
elements: List[Element] = []
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):

View File

@@ -1,16 +1,6 @@
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
from pdfminer.utils import open_filename
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
ImageTextRegion,
TextRegion,
)
from unstructured_inference.inference.layoutelement import (
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
)
from unstructured_inference.inference.ordering import order_layout
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
from unstructured.partition.pdf_image.pdfminer_utils import (
get_images_from_pdf_element,
@@ -19,15 +9,17 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
)
from unstructured.partition.utils.constants import Source
from unstructured.partition.utils.sorting import sort_text_regions
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layout import DocumentLayout
def process_file_with_pdfminer(
filename: str = "",
dpi: int = 200,
) -> List[List[TextRegion]]:
) -> List[List["TextRegion"]]:
with open_filename(filename, "rb") as fp:
fp = cast(BinaryIO, fp)
extracted_layout = process_data_with_pdfminer(
@@ -37,13 +29,20 @@ def process_file_with_pdfminer(
return extracted_layout
@requires_dependencies("unstructured_inference")
def process_data_with_pdfminer(
file: Optional[Union[bytes, BinaryIO]] = None,
dpi: int = 200,
) -> List[List[TextRegion]]:
) -> List[List["TextRegion"]]:
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
pdf pages using pdf2image"""
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
ImageTextRegion,
)
from unstructured_inference.inference.ordering import order_layout
layouts = []
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
@@ -89,10 +88,18 @@ def process_data_with_pdfminer(
return layouts
@requires_dependencies("unstructured_inference")
def merge_inferred_with_extracted_layout(
inferred_document_layout: "DocumentLayout",
extracted_layout: List[List[TextRegion]],
extracted_layout: List[List["TextRegion"]],
) -> "DocumentLayout":
"""Merge an inferred layout with an extracted layout"""
from unstructured_inference.inference.layoutelement import (
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
)
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
inferred_pages = inferred_document_layout.pages
for i, (inferred_page, extracted_page_layout) in enumerate(
zip(inferred_pages, extracted_layout)
@@ -120,7 +127,7 @@ def merge_inferred_with_extracted_layout(
)
elements = inferred_page.get_elements_from_layout(
layout=cast(List[TextRegion], merged_layout),
layout=cast(List["TextRegion"], merged_layout),
pdf_objects=extracted_page_layout,
)

View File

@@ -1,7 +1,6 @@
import tempfile
from typing import Any, BinaryIO, List, Tuple
import pikepdf
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTImage
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
@@ -9,7 +8,7 @@ from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PSSyntaxError
from unstructured.logger import logger
from unstructured.partition.pdf_image.pypdf_utils import get_page_data
from unstructured.utils import requires_dependencies
def init_pdfminer():
@@ -79,11 +78,16 @@ def rect_to_bbox(
return (x1, y1, x2, y2)
@requires_dependencies(["pikepdf", "pypdf"])
def open_pdfminer_pages_generator(
fp: BinaryIO,
):
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
import pikepdf
from unstructured.partition.pdf_image.pypdf_utils import get_page_data
device, interpreter = init_pdfminer()
try:
pages = PDFPage.get_pages(fp)