Refactor: support layout analysis (#2273)

### Summary

This PR is the second part of the "layout analysis" refactor, which moves layout analysis from the `unstructured-inference` repo to the `unstructured` repo; the first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/305. This PR adds logic to support annotating `inferred` and `extracted` elements.

### Testing

```
PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type>
```

e.g.

```
PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf
```
2025-12-10 14:42:24 +00:00 · 2023-12-18 22:21:56 -08:00 · 2023-12-18 22:21:56 -08:00 · 096d23bc28
commit 096d23bc28
parent 09f86f28fb
6 changed files with 230 additions and 78 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,7 +1,9 @@
-## 0.11.6-dev1
+## 0.11.6-dev2

 ### Enhancements

+* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements.
+
 ### Features

 ### Fixes
--- a/examples/layout-analysis/visualization.py
+++ b/examples/layout-analysis/visualization.py
@ -3,7 +3,8 @@ import pathlib
 import sys

 import pdf2image
-from unstructured_inference.inference.elements import Rectangle
+from PIL import Image
+from unstructured_inference.inference.elements import TextRegion
 from unstructured_inference.visualize import draw_bbox

 from unstructured.documents.elements import PageBreak
@ -29,11 +30,14 @@ def extract_element_coordinates(elements):
    return elements_coordinates


-def run_partition_pdf(f_path, strategy, images, output_dir):
+def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image):
    elements = partition_pdf(
        f_path,
        strategy=strategy,
+        is_image=is_image,
        include_page_breaks=True,
+        analysis=True,
+        analyzed_image_output_dir_path=output_dir,
    )

    elements_coordinates = extract_element_coordinates(elements)
@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir):
            points = coordinate.points
            x1, y1 = points[0]
            x2, y2 = points[2]
-            rect = Rectangle(x1, y1, x2, y2)
-            img = draw_bbox(img, rect, color="red")
+            el = TextRegion.from_coords(x1, y1, x2, y2)
+            img = draw_bbox(img, el, color="red")

-        output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg")
+        output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg")
+        img.save(output_image_path)
        print(f"output_image_path: {output_image_path}")

-        img.save(output_image_path)

-
-def run(f_path, strategy):
+def run(f_path, strategy, document_type):
    f_basename = os.path.splitext(os.path.basename(f_path))[0]
    output_dir_path = os.path.join(output_basedir_path, f_basename)
    os.makedirs(output_dir_path, exist_ok=True)

-    images = pdf2image.convert_from_path(f_path)
-    run_partition_pdf(f_path, strategy, images, output_dir_path)
+    is_image = document_type == "image"
+    if is_image:
+        with Image.open(f_path) as img:
+            img = img.convert("RGB")
+            images = [img]
+    else:
+        images = pdf2image.convert_from_path(f_path)
+
+    run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image)


 if __name__ == "__main__":
@ -74,7 +84,11 @@ if __name__ == "__main__":
        print("Invalid strategy")
        sys.exit(1)

+    if sys.argv[3] not in ["pdf", "image"]:
+        print("Invalid document type")
+        sys.exit(1)
+
    output_basedir_path = os.path.join(CUR_DIR, "output")
    os.makedirs(output_basedir_path, exist_ok=True)

-    run(f_path=sys.argv[1], strategy=sys.argv[2])
+    run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3])
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.11.6-dev1"  # pragma: no cover
+__version__ = "0.11.6-dev2"  # pragma: no cover
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -74,9 +74,13 @@ from unstructured.partition.lang import (
    prepare_languages_for_tesseract,
 )
 from unstructured.partition.pdf_image.pdf_image_utils import (
+    annotate_layout_elements,
    check_element_types_to_extract,
    save_elements,
 )
+from unstructured.partition.pdf_image.pdfminer_processing import (
+    merge_inferred_with_extracted_layout,
+)
 from unstructured.partition.pdf_image.pdfminer_utils import (
    open_pdfminer_pages_generator,
    rect_to_bbox,
@ -247,6 +251,8 @@ def _partition_pdf_or_image_local(
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
    pdf_image_dpi: Optional[int] = None,
+    analysis: bool = False,
+    analyzed_image_output_dir_path: Optional[str] = None,
    **kwargs,
 ) -> List[Element]:
    """Partition using package installed locally"""
@ -286,14 +292,27 @@ def _partition_pdf_or_image_local(
            pdf_image_dpi=pdf_image_dpi,
        )

-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_file_with_pdfminer(
-                inferred_document_layout,
-                filename,
+        extracted_layout = (
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            if pdf_text_extractable
+            else []
+        )
+
+        if analysis:
+            annotate_layout_elements(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+                filename=filename,
+                output_dir_path=analyzed_image_output_dir_path,
+                pdf_image_dpi=pdf_image_dpi,
+                is_image=is_image,
            )
-        else:
-            merged_document_layout = inferred_document_layout
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )

        if model_name.startswith("chipper"):
            # NOTE(alan): We shouldn't do OCR with chipper
@ -317,14 +336,16 @@ def _partition_pdf_or_image_local(
        )
        if hasattr(file, "seek"):
            file.seek(0)
-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_data_with_pdfminer(
-                inferred_document_layout,
-                file,
-            )
-        else:
-            merged_document_layout = inferred_document_layout
+
+        extracted_layout = (
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
+        )
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )

        if model_name.startswith("chipper"):
            # NOTE(alan): We shouldn't do OCR with chipper
@ -655,7 +676,7 @@ def _process_pdfminer_pages(
                    urls_metadata.append(map_bbox_and_index(words, annot))

            if hasattr(obj, "get_text"):
-                _text_snippets: List[str | Any] = [obj.get_text()]  # type: ignore
+                _text_snippets: List = [obj.get_text()]
            else:
                _text = _extract_text(obj)
                _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@ -13,6 +13,8 @@ from unstructured.logger import logger
 from unstructured.partition.common import convert_to_bytes

 if TYPE_CHECKING:
+    from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
+
    from unstructured.documents.elements import Element


@ -159,3 +161,118 @@ def valid_text(text: str) -> bool:
    if not text:
        return False
    return "(cid:" not in text
+
+
+def annotate_layout_elements_with_image(
+    inferred_page_layout: "PageLayout",
+    extracted_page_layout: Optional["PageLayout"],
+    output_dir_path: str,
+    output_f_basename: str,
+    page_number: int,
+):
+    """
+    Annotates a page image with the inferred and, when available, extracted layout elements.
+
+    This function takes the layout elements of a single page, either extracted from or inferred
+    for the document, and annotates them on the page image. It creates one annotated image per
+    set of layout elements: 'inferred' always, and 'extracted' only when that layout is given.
+    These annotated images are saved to the specified directory.
+    """
+
+    layout_map = {"inferred": {"layout": inferred_page_layout, "color": "blue"}}
+    if extracted_page_layout:
+        layout_map["extracted"] = {"layout": extracted_page_layout, "color": "green"}
+
+    for label, layout_data in layout_map.items():
+        page_layout = layout_data.get("layout")
+        color = layout_data.get("color")
+
+        img = page_layout.annotate(colors=color)
+        output_f_path = os.path.join(
+            output_dir_path, f"{output_f_basename}_{page_number}_{label}.jpg"
+        )
+        write_image(img, output_f_path)
+        print(f"output_image_path: {output_f_path}")
+
+
+def annotate_layout_elements(
+    inferred_document_layout: "DocumentLayout",
+    extracted_layout: List["TextRegion"],
+    filename: str,
+    output_dir_path: str,
+    pdf_image_dpi: int,
+    is_image: bool = False,
+) -> None:
+    """
+    Annotates layout elements on images extracted from a PDF or an image file.
+
+    This function processes a given document (PDF or image) and annotates layout elements based
+    on the inferred and extracted layout information.
+    It handles both PDF documents and standalone image files. For PDFs, it converts each page
+    into an image, whereas for image files, it processes the single image.
+    """
+
+    from unstructured_inference.inference.layout import PageLayout
+
+    output_f_basename = os.path.splitext(os.path.basename(filename))[0]
+    images = []
+    try:
+        if is_image:
+            with Image.open(filename) as img:
+                img = img.convert("RGB")
+                images.append(img)
+
+                extracted_page_layout = None
+                if extracted_layout:
+                    extracted_page_layout = PageLayout(
+                        number=1,
+                        image=img,
+                    )
+                    extracted_page_layout.elements = extracted_layout[0]
+
+                inferred_page_layout = inferred_document_layout.pages[0]
+                inferred_page_layout.image = img
+
+                annotate_layout_elements_with_image(
+                    inferred_page_layout=inferred_document_layout.pages[0],
+                    extracted_page_layout=extracted_page_layout,
+                    output_dir_path=output_dir_path,
+                    output_f_basename=output_f_basename,
+                    page_number=1,
+                )
+        else:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                _image_paths = pdf2image.convert_from_path(
+                    filename,
+                    dpi=pdf_image_dpi,
+                    output_folder=temp_dir,
+                    paths_only=True,
+                )
+                image_paths = cast(List[str], _image_paths)
+                for i, image_path in enumerate(image_paths):
+                    with Image.open(image_path) as img:
+                        page_number = i + 1
+
+                        extracted_page_layout = None
+                        if extracted_layout:
+                            extracted_page_layout = PageLayout(
+                                number=page_number,
+                                image=img,
+                            )
+                            extracted_page_layout.elements = extracted_layout[i]
+
+                        inferred_page_layout = inferred_document_layout.pages[i]
+                        inferred_page_layout.image = img
+
+                        annotate_layout_elements_with_image(
+                            inferred_page_layout=inferred_document_layout.pages[i],
+                            extracted_page_layout=extracted_page_layout,
+                            output_dir_path=output_dir_path,
+                            output_f_basename=output_f_basename,
+                            page_number=page_number,
+                        )
+    except Exception as e:
+        if os.path.isdir(filename) or os.path.isfile(filename):
+            raise e
+        else:
+            raise FileNotFoundError(f'File "{filename}" not found!') from e
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@ -7,7 +7,7 @@ from unstructured_inference.inference.elements import (
    TextRegion,
 )
 from unstructured_inference.inference.layoutelement import (
-    merge_inferred_layout_with_extracted_layout,
+    merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
 )
 from unstructured_inference.inference.ordering import order_layout
 from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
@ -25,62 +25,20 @@ if TYPE_CHECKING:


 def process_file_with_pdfminer(
-    inferred_document_layout: "DocumentLayout",
    filename: str = "",
-) -> "DocumentLayout":
+    dpi: int = 200,
+) -> List[List[TextRegion]]:
    with open_filename(filename, "rb") as fp:
        fp = cast(BinaryIO, fp)
-        inferred_document_layout = process_data_with_pdfminer(
-            inferred_document_layout=inferred_document_layout,
+        extracted_layout = process_data_with_pdfminer(
            file=fp,
+            dpi=dpi,
        )
-        return inferred_document_layout
+        return extracted_layout


 def process_data_with_pdfminer(
-    inferred_document_layout: "DocumentLayout",
    file: Optional[Union[bytes, BinaryIO]] = None,
-) -> "DocumentLayout":
-    """Process document data using PDFMiner to extract layout information."""
-
-    extracted_layouts = get_regions_by_pdfminer(file)
-
-    inferred_pages = inferred_document_layout.pages
-    for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)):
-        inferred_layout = inferred_page.elements
-        image_metadata = inferred_page.image_metadata
-        w = image_metadata.get("width")
-        h = image_metadata.get("height")
-        image_size = (w, h)
-
-        threshold_kwargs = {}
-        # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
-        # In other case the default values for the functions are used
-        if (
-            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
-            and "R_50" not in inferred_page.detection_model.model_path
-        ):
-            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
-
-        merged_layout = merge_inferred_layout_with_extracted_layout(
-            inferred_layout=inferred_layout,
-            extracted_layout=extracted_layout,
-            page_image_size=image_size,
-            **threshold_kwargs,
-        )
-
-        elements = inferred_page.get_elements_from_layout(
-            layout=cast(List[TextRegion], merged_layout),
-            pdf_objects=extracted_layout,
-        )
-
-        inferred_page.elements[:] = elements
-
-    return inferred_document_layout
-
-
-def get_regions_by_pdfminer(
-    fp: Optional[Union[bytes, BinaryIO]],
    dpi: int = 200,
 ) -> List[List[TextRegion]]:
    """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
@ -89,7 +47,7 @@ def get_regions_by_pdfminer(
    layouts = []
    # Coefficient to rescale bounding box to be compatible with images
    coef = dpi / 72
-    for page, page_layout in open_pdfminer_pages_generator(fp):
+    for page, page_layout in open_pdfminer_pages_generator(file):
        height = page_layout.height

        layout: List["TextRegion"] = []
@ -129,3 +87,43 @@ def get_regions_by_pdfminer(
        layouts.append(layout)

    return layouts
+
+
+def merge_inferred_with_extracted_layout(
+    inferred_document_layout: "DocumentLayout",
+    extracted_layout: List[List[TextRegion]],
+) -> "DocumentLayout":
+    inferred_pages = inferred_document_layout.pages
+    for i, (inferred_page, extracted_page_layout) in enumerate(
+        zip(inferred_pages, extracted_layout)
+    ):
+        inferred_layout = inferred_page.elements
+        image_metadata = inferred_page.image_metadata
+        w = image_metadata.get("width")
+        h = image_metadata.get("height")
+        image_size = (w, h)
+
+        threshold_kwargs = {}
+        # NOTE(Benjamin): With this the thresholds are only changed for detectron2_mask_rcnn
+        # In other cases the default values for the functions are used
+        if (
+            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
+            and "R_50" not in inferred_page.detection_model.model_path
+        ):
+            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
+
+        merged_layout = merge_inferred_with_extracted_page(
+            inferred_layout=inferred_layout,
+            extracted_layout=extracted_page_layout,
+            page_image_size=image_size,
+            **threshold_kwargs,
+        )
+
+        elements = inferred_page.get_elements_from_layout(
+            layout=cast(List[TextRegion], merged_layout),
+            pdf_objects=extracted_page_layout,
+        )
+
+        inferred_page.elements[:] = elements
+
+    return inferred_document_layout