Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-09-01 04:43:59 +00:00)
Refactor: support layout analysis (#2273)
### Summary

This PR is the second part of the "layout analysis" refactor, moving it from the unstructured-inference repo to the unstructured repo; the first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/305. This PR adds logic to support annotating `inferred` and `extracted` elements.

### Testing

```
PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type>
```

e.g.

```
PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf
```
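For orientation, here is a minimal sketch of exercising the new behavior directly through `partition_pdf`, mirroring the call the updated example script makes in the diff below; the document path and output directory are just example values:

```python
import os

from unstructured.partition.pdf import partition_pdf

# Example inputs; any local PDF and writable directory will do.
f_path = "example-docs/layout-parser-paper-fast.pdf"
output_dir = "examples/layout-analysis/output/layout-parser-paper-fast"
os.makedirs(output_dir, exist_ok=True)

elements = partition_pdf(
    f_path,
    strategy="hi_res",
    is_image=False,
    include_page_breaks=True,
    analysis=True,  # new: also dump annotated "inferred"/"extracted" page images
    analyzed_image_output_dir_path=output_dir,
)
```

With `analysis=True`, annotated page images named like `<basename>_<page>_inferred.jpg` and `<basename>_<page>_extracted.jpg` are written to `analyzed_image_output_dir_path`; the example script additionally saves `<basename>_<page>_final.jpg` images for the final elements it draws itself.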
This commit is contained in:
parent 09f86f28fb
commit 096d23bc28
`CHANGELOG.md`:

```diff
@@ -1,7 +1,9 @@
-## 0.11.6-dev1
+## 0.11.6-dev2

 ### Enhancements

+* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements.
+
 ### Features

 ### Fixes
```
`examples/layout-analysis/visualization.py`:

```diff
@@ -3,7 +3,8 @@ import pathlib
 import sys

 import pdf2image
-from unstructured_inference.inference.elements import Rectangle
+from PIL import Image
+from unstructured_inference.inference.elements import TextRegion
 from unstructured_inference.visualize import draw_bbox

 from unstructured.documents.elements import PageBreak
@@ -29,11 +30,14 @@ def extract_element_coordinates(elements):
     return elements_coordinates


-def run_partition_pdf(f_path, strategy, images, output_dir):
+def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image):
     elements = partition_pdf(
         f_path,
         strategy=strategy,
+        is_image=is_image,
         include_page_breaks=True,
+        analysis=True,
+        analyzed_image_output_dir_path=output_dir,
     )

     elements_coordinates = extract_element_coordinates(elements)
@@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir):
             points = coordinate.points
             x1, y1 = points[0]
             x2, y2 = points[2]
-            rect = Rectangle(x1, y1, x2, y2)
-            img = draw_bbox(img, rect, color="red")
+            el = TextRegion.from_coords(x1, y1, x2, y2)
+            img = draw_bbox(img, el, color="red")

-        output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg")
+        output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg")
+        img.save(output_image_path)
         print(f"output_image_path: {output_image_path}")

-        img.save(output_image_path)
-

-def run(f_path, strategy):
+def run(f_path, strategy, document_type):
     f_basename = os.path.splitext(os.path.basename(f_path))[0]
     output_dir_path = os.path.join(output_basedir_path, f_basename)
     os.makedirs(output_dir_path, exist_ok=True)

-    images = pdf2image.convert_from_path(f_path)
-    run_partition_pdf(f_path, strategy, images, output_dir_path)
+    is_image = document_type == "image"
+    if is_image:
+        with Image.open(f_path) as img:
+            img = img.convert("RGB")
+            images = [img]
+    else:
+        images = pdf2image.convert_from_path(f_path)

+    run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image)


 if __name__ == "__main__":
@@ -74,7 +84,11 @@ if __name__ == "__main__":
         print("Invalid strategy")
         sys.exit(1)

+    if sys.argv[3] not in ["pdf", "image"]:
+        print("Invalid document type")
+        sys.exit(1)
+
     output_basedir_path = os.path.join(CUR_DIR, "output")
     os.makedirs(output_basedir_path, exist_ok=True)

-    run(f_path=sys.argv[1], strategy=sys.argv[2])
+    run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3])
```
`unstructured/__version__.py`:

```diff
@@ -1 +1 @@
-__version__ = "0.11.6-dev1"  # pragma: no cover
+__version__ = "0.11.6-dev2"  # pragma: no cover
```
`unstructured/partition/pdf.py`:

```diff
@@ -74,9 +74,13 @@ from unstructured.partition.lang import (
     prepare_languages_for_tesseract,
 )
 from unstructured.partition.pdf_image.pdf_image_utils import (
+    annotate_layout_elements,
     check_element_types_to_extract,
     save_elements,
 )
+from unstructured.partition.pdf_image.pdfminer_processing import (
+    merge_inferred_with_extracted_layout,
+)
 from unstructured.partition.pdf_image.pdfminer_utils import (
     open_pdfminer_pages_generator,
     rect_to_bbox,
@@ -247,6 +251,8 @@ def _partition_pdf_or_image_local(
     extract_element_types: Optional[List[str]] = None,
     image_output_dir_path: Optional[str] = None,
     pdf_image_dpi: Optional[int] = None,
+    analysis: bool = False,
+    analyzed_image_output_dir_path: Optional[str] = None,
     **kwargs,
 ) -> List[Element]:
     """Partition using package installed locally"""
@@ -286,14 +292,27 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
         )

-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_file_with_pdfminer(
-                inferred_document_layout,
-                filename,
-            )
-        else:
-            merged_document_layout = inferred_document_layout
+        extracted_layout = (
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            if pdf_text_extractable
+            else []
+        )
+
+        if analysis:
+            annotate_layout_elements(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+                filename=filename,
+                output_dir_path=analyzed_image_output_dir_path,
+                pdf_image_dpi=pdf_image_dpi,
+                is_image=is_image,
+            )
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )

         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -317,14 +336,16 @@ def _partition_pdf_or_image_local(
         )
         if hasattr(file, "seek"):
             file.seek(0)
-        if pdf_text_extractable is True:
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = process_data_with_pdfminer(
-                inferred_document_layout,
-                file,
-            )
-        else:
-            merged_document_layout = inferred_document_layout
+
+        extracted_layout = (
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
+        )
+
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+        )

         if model_name.startswith("chipper"):
             # NOTE(alan): We shouldn't do OCR with chipper
@@ -655,7 +676,7 @@ def _process_pdfminer_pages(
                     urls_metadata.append(map_bbox_and_index(words, annot))

             if hasattr(obj, "get_text"):
-                _text_snippets: List[str | Any] = [obj.get_text()]  # type: ignore
+                _text_snippets: List = [obj.get_text()]
             else:
                 _text = _extract_text(obj)
                 _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
```
`unstructured/partition/pdf_image/pdf_image_utils.py`:

```diff
@@ -13,6 +13,8 @@ from unstructured.logger import logger
 from unstructured.partition.common import convert_to_bytes

 if TYPE_CHECKING:
+    from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
+
     from unstructured.documents.elements import Element


@@ -159,3 +161,118 @@ def valid_text(text: str) -> bool:
     if not text:
         return False
     return "(cid:" not in text
+
+
+def annotate_layout_elements_with_image(
+    inferred_page_layout: "PageLayout",
+    extracted_page_layout: Optional["PageLayout"],
+    output_dir_path: str,
+    output_f_basename: str,
+    page_number: int,
+):
+    """
+    Annotates a page image with both inferred and extracted layout elements.
+
+    This function takes the layout elements of a single page, either extracted from or inferred
+    for the document, and annotates them on the page image. It creates two separate annotated
+    images, one for each set of layout elements: 'inferred' and 'extracted'.
+    These annotated images are saved to a specified directory.
+    """
+
+    layout_map = {"inferred": {"layout": inferred_page_layout, "color": "blue"}}
+    if extracted_page_layout:
+        layout_map["extracted"] = {"layout": extracted_page_layout, "color": "green"}
+
+    for label, layout_data in layout_map.items():
+        page_layout = layout_data.get("layout")
+        color = layout_data.get("color")
+
+        img = page_layout.annotate(colors=color)
+        output_f_path = os.path.join(
+            output_dir_path, f"{output_f_basename}_{page_number}_{label}.jpg"
+        )
+        write_image(img, output_f_path)
+        print(f"output_image_path: {output_f_path}")
+
+
+def annotate_layout_elements(
+    inferred_document_layout: "DocumentLayout",
+    extracted_layout: List["TextRegion"],
+    filename: str,
+    output_dir_path: str,
+    pdf_image_dpi: int,
+    is_image: bool = False,
+) -> None:
+    """
+    Annotates layout elements on images extracted from a PDF or an image file.
+
+    This function processes a given document (PDF or image) and annotates layout elements based
+    on the inferred and extracted layout information.
+    It handles both PDF documents and standalone image files. For PDFs, it converts each page
+    into an image, whereas for image files, it processes the single image.
+    """
+
+    from unstructured_inference.inference.layout import PageLayout
+
+    output_f_basename = os.path.splitext(os.path.basename(filename))[0]
+    images = []
+    try:
+        if is_image:
+            with Image.open(filename) as img:
+                img = img.convert("RGB")
+                images.append(img)
+
+                extracted_page_layout = None
+                if extracted_layout:
+                    extracted_page_layout = PageLayout(
+                        number=1,
+                        image=img,
+                    )
+                    extracted_page_layout.elements = extracted_layout[0]
+
+                inferred_page_layout = inferred_document_layout.pages[0]
+                inferred_page_layout.image = img
+
+                annotate_layout_elements_with_image(
+                    inferred_page_layout=inferred_document_layout.pages[0],
+                    extracted_page_layout=extracted_page_layout,
+                    output_dir_path=output_dir_path,
+                    output_f_basename=output_f_basename,
+                    page_number=1,
+                )
+        else:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                _image_paths = pdf2image.convert_from_path(
+                    filename,
+                    dpi=pdf_image_dpi,
+                    output_folder=temp_dir,
+                    paths_only=True,
+                )
+                image_paths = cast(List[str], _image_paths)
+                for i, image_path in enumerate(image_paths):
+                    with Image.open(image_path) as img:
+                        page_number = i + 1
+
+                        extracted_page_layout = None
+                        if extracted_layout:
+                            extracted_page_layout = PageLayout(
+                                number=page_number,
+                                image=img,
+                            )
+                            extracted_page_layout.elements = extracted_layout[i]
+
+                        inferred_page_layout = inferred_document_layout.pages[i]
+                        inferred_page_layout.image = img
+
+                        annotate_layout_elements_with_image(
+                            inferred_page_layout=inferred_document_layout.pages[i],
+                            extracted_page_layout=extracted_page_layout,
+                            output_dir_path=output_dir_path,
+                            output_f_basename=output_f_basename,
+                            page_number=page_number,
+                        )
+    except Exception as e:
+        if os.path.isdir(filename) or os.path.isfile(filename):
+            raise e
+        else:
+            raise FileNotFoundError(f'File "{filename}" not found!') from e
```
`unstructured/partition/pdf_image/pdfminer_processing.py`:

```diff
@@ -7,7 +7,7 @@ from unstructured_inference.inference.elements import (
     TextRegion,
 )
 from unstructured_inference.inference.layoutelement import (
-    merge_inferred_layout_with_extracted_layout,
+    merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
 )
 from unstructured_inference.inference.ordering import order_layout
 from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
@@ -25,62 +25,20 @@ if TYPE_CHECKING:


 def process_file_with_pdfminer(
-    inferred_document_layout: "DocumentLayout",
     filename: str = "",
-) -> "DocumentLayout":
+    dpi: int = 200,
+) -> List[List[TextRegion]]:
     with open_filename(filename, "rb") as fp:
         fp = cast(BinaryIO, fp)
-        inferred_document_layout = process_data_with_pdfminer(
-            inferred_document_layout=inferred_document_layout,
+        extracted_layout = process_data_with_pdfminer(
             file=fp,
+            dpi=dpi,
         )
-        return inferred_document_layout
+        return extracted_layout


 def process_data_with_pdfminer(
-    inferred_document_layout: "DocumentLayout",
     file: Optional[Union[bytes, BinaryIO]] = None,
-) -> "DocumentLayout":
-    """Process document data using PDFMiner to extract layout information."""
-
-    extracted_layouts = get_regions_by_pdfminer(file)
-
-    inferred_pages = inferred_document_layout.pages
-    for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)):
-        inferred_layout = inferred_page.elements
-        image_metadata = inferred_page.image_metadata
-        w = image_metadata.get("width")
-        h = image_metadata.get("height")
-        image_size = (w, h)
-
-        threshold_kwargs = {}
-        # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
-        # In other case the default values for the functions are used
-        if (
-            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
-            and "R_50" not in inferred_page.detection_model.model_path
-        ):
-            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
-
-        merged_layout = merge_inferred_layout_with_extracted_layout(
-            inferred_layout=inferred_layout,
-            extracted_layout=extracted_layout,
-            page_image_size=image_size,
-            **threshold_kwargs,
-        )
-
-        elements = inferred_page.get_elements_from_layout(
-            layout=cast(List[TextRegion], merged_layout),
-            pdf_objects=extracted_layout,
-        )
-
-        inferred_page.elements[:] = elements
-
-    return inferred_document_layout
-
-
-def get_regions_by_pdfminer(
-    fp: Optional[Union[bytes, BinaryIO]],
     dpi: int = 200,
 ) -> List[List[TextRegion]]:
     """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
@@ -89,7 +47,7 @@ def get_regions_by_pdfminer(
     layouts = []
     # Coefficient to rescale bounding box to be compatible with images
     coef = dpi / 72
-    for page, page_layout in open_pdfminer_pages_generator(fp):
+    for page, page_layout in open_pdfminer_pages_generator(file):
         height = page_layout.height

         layout: List["TextRegion"] = []
@@ -129,3 +87,43 @@ def get_regions_by_pdfminer(
         layouts.append(layout)

     return layouts
+
+
+def merge_inferred_with_extracted_layout(
+    inferred_document_layout: "DocumentLayout",
+    extracted_layout: List[List[TextRegion]],
+) -> "DocumentLayout":
+    inferred_pages = inferred_document_layout.pages
+    for i, (inferred_page, extracted_page_layout) in enumerate(
+        zip(inferred_pages, extracted_layout)
+    ):
+        inferred_layout = inferred_page.elements
+        image_metadata = inferred_page.image_metadata
+        w = image_metadata.get("width")
+        h = image_metadata.get("height")
+        image_size = (w, h)
+
+        threshold_kwargs = {}
+        # NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
+        # In other case the default values for the functions are used
+        if (
+            isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
+            and "R_50" not in inferred_page.detection_model.model_path
+        ):
+            threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
+
+        merged_layout = merge_inferred_with_extracted_page(
+            inferred_layout=inferred_layout,
+            extracted_layout=extracted_page_layout,
+            page_image_size=image_size,
+            **threshold_kwargs,
+        )
+
+        elements = inferred_page.get_elements_from_layout(
+            layout=cast(List[TextRegion], merged_layout),
+            pdf_objects=extracted_page_layout,
+        )
+
+        inferred_page.elements[:] = elements
+
+    return inferred_document_layout
```
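For readers wiring these pieces up outside `partition_pdf`, here is a rough sketch of how the refactored `pdfminer_processing` functions compose; the helper name is hypothetical, and the inferred `DocumentLayout` is assumed to already come from the usual unstructured-inference hi_res model step:

```python
from typing import List

from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layout import DocumentLayout

from unstructured.partition.pdf_image.pdfminer_processing import (
    merge_inferred_with_extracted_layout,
    process_file_with_pdfminer,
)


def merge_pdfminer_into_inferred(  # hypothetical helper, not part of this PR
    filename: str,
    inferred_document_layout: DocumentLayout,
    pdf_image_dpi: int = 200,
) -> DocumentLayout:
    """Enrich an already-inferred layout with text regions extracted by pdfminer."""
    extracted_layout: List[List[TextRegion]] = process_file_with_pdfminer(
        filename=filename,
        dpi=pdf_image_dpi,
    )
    return merge_inferred_with_extracted_layout(
        inferred_document_layout=inferred_document_layout,
        extracted_layout=extracted_layout,
    )
```

This mirrors the flow in `_partition_pdf_or_image_local` above: the pdfminer extraction is now kept as a separate `List[List[TextRegion]]` and only merged into the inferred layout as a final step, which is what makes annotating `inferred` and `extracted` elements separately possible.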