unstructured/examples/custom-layout-order/evaluate_natural_reading_order.py

import os
import sys

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pdf2image
from PIL import Image

from unstructured.documents.elements import PageBreak
from unstructured.partition.pdf_image.pdf import partition_pdf
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import (
    bbox2points,
    recursive_xy_cut,
    vis_polygons_with_index,
)


def show_plot(image, desired_width=None):
    """Display ``image`` in a matplotlib figure.

    Args:
        image: Array whose ``shape`` unpacks into exactly three values; the
            first two are read as height and width.
        desired_width: Optional figure width handed to matplotlib's
            ``figsize``. When truthy, the figure height is derived from it so
            the figure keeps the image's width/height ratio. When falsy,
            matplotlib's default figure size is used.
    """
    rows, cols, _ = image.shape

    subplot_kwargs = {}
    if desired_width:
        # height = width / (cols / rows) keeps the figure proportional to the image.
        subplot_kwargs["figsize"] = (desired_width, desired_width / (cols / rows))

    _, axes = plt.subplots(**subplot_kwargs)
    axes.imshow(image)
    plt.show()


def extract_element_coordinates(elements):
    """Group element coordinates by page, using ``PageBreak`` as the separator.

    A ``PageBreak`` only closes the page collected so far; it never
    contributes coordinates of its own.

    Args:
        elements: Iterable of elements. Every element that is not a
            ``PageBreak`` must expose ``metadata.coordinates``.

    Returns:
        A list with one inner list per page. Each inner list holds the
        ``metadata.coordinates`` value of that page's elements, in input order
        (values are stored as-is, so they may be falsy for elements without
        coordinate data). A page closed by a ``PageBreak`` is kept even when it
        has no elements, so the position of each inner list matches the page
        position. Elements after the last ``PageBreak`` form a final page only
        when there is at least one of them.
    """
    pages = []
    current_page = []

    for el in elements:
        if isinstance(el, PageBreak):
            # Close the page unconditionally. Skipping empty pages (or letting
            # the break fall through and be recorded as content) would shift
            # every following page by one relative to the page images.
            pages.append(current_page)
            current_page = []
        else:
            current_page.append(el.metadata.coordinates)

    # Input that does not end with a PageBreak leaves a pending last page.
    if current_page:
        pages.append(current_page)

    return pages


def convert_coordinates_to_boxes(coordinates, image):
    """Scale element coordinates into pixel bounding boxes for ``image``.

    Args:
        coordinates: Iterable of coordinate objects. Each truthy entry must
            expose ``points`` (index 0 is read as ``(left, top)`` and index 2
            as ``(right, bottom)``) and ``system`` with ``width`` and
            ``height``. Falsy entries (e.g. ``None``) are skipped.
        image: Array whose first two ``shape`` entries are (height, width).
            Both 2-D (single-channel) and 3-D (multi-channel) arrays work.

    Returns:
        A list of ``[left, top, right, bottom]`` boxes, one per entry that has
        coordinate data, rescaled from the coordinate system's size to the
        image's size and truncated to ``int``.
    """
    # Loop-invariant, so read it once. Slicing instead of a 3-way unpack also
    # accepts 2-D grayscale arrays, which have no channel axis.
    image_height, image_width = image.shape[:2]

    boxes = []
    for coordinate in coordinates:
        if not coordinate:
            continue

        points = coordinate.points
        _left, _top = points[0]
        _right, _bottom = points[2]
        w = coordinate.system.width
        h = coordinate.system.height

        # Multiply before dividing, exactly as before, so truncation results
        # are unchanged.
        left = _left * image_width / w
        right = _right * image_width / w
        top = _top * image_height / h
        bottom = _bottom * image_height / h
        boxes.append([int(left), int(top), int(right), int(bottom)])

    return boxes


def order_boxes(boxes):
    """Return ``boxes`` reordered according to ``recursive_xy_cut``.

    Args:
        boxes: Sequence (or array) of boxes, each a sequence of numbers.
            Elsewhere in this file boxes are built as
            ``[left, top, right, bottom]``.

    Returns:
        The boxes as nested Python lists, reordered by the indices that
        ``recursive_xy_cut`` writes into its result list. Box values keep
        their original (un-cast) form. Empty input returns ``[]``.
    """
    if len(boxes) == 0:
        # Nothing to order. Without this guard the index array built from an
        # empty result list would be float-typed, which NumPy rejects as an
        # index.
        return []

    # Convert once; the integer cast is only for the cut algorithm's input.
    boxes_array = np.asarray(boxes)
    res = []
    recursive_xy_cut(boxes_array.astype(int), np.arange(len(boxes_array)), res)
    return boxes_array[np.array(res)].tolist()


def draw_boxes(image, boxes, output_dir, base_name, page_num, output_type, label):
    """Annotate ``image`` with ``boxes`` and plot and/or save the result.

    Args:
        image: Page image array passed to ``vis_polygons_with_index``.
        boxes: Iterable of boxes; each is converted with ``bbox2points``.
        output_dir: Directory that receives the annotated image file.
        base_name: First component of the saved file's name.
        page_num: Page number used in the printed header and the file name.
        output_type: ``"plot"`` shows the image, ``"image"`` writes
            ``{base_name}_{page_num}_{label}.jpg`` into ``output_dir``, and
            ``"all"`` does both. Any other value does neither.
        label: Tag used in the printed header and the file name.
    """
    polygons = list(map(bbox2points, boxes))
    annotated_image = vis_polygons_with_index(image, polygons)

    wants_plot = output_type in ("plot", "all")
    wants_file = output_type in ("image", "all")

    if wants_plot:
        print(f"{label} elements - Page: {page_num}")
        show_plot(annotated_image, desired_width=20)

    if wants_file:
        # NOTE(review): cv2.imwrite treats 3-channel arrays as BGR; confirm the
        # channel order of annotated_image matches.
        file_name = f"{base_name}_{page_num}_{label}.jpg"
        cv2.imwrite(os.path.join(output_dir, file_name), annotated_image)


def draw_elements(elements, images, output_type, output_dir, base_name, label):
    """Draw each page's element boxes on the matching page image.

    Coordinates are grouped per page with ``extract_element_coordinates``,
    scaled to each image with ``convert_coordinates_to_boxes`` and rendered
    with ``draw_boxes``. Pages are numbered from 1. For every page, the number
    of elements lacking coordinate data is printed when it is non-zero.

    Args:
        elements: Partitioned elements, including page breaks.
        images: Page images, one per page, in page order.
        output_type: Forwarded to ``draw_boxes``.
        output_dir: Forwarded to ``draw_boxes``.
        base_name: Forwarded to ``draw_boxes``.
        label: Forwarded to ``draw_boxes``.

    Raises:
        AssertionError: If the number of images differs from the number of
            coordinate pages.
    """
    pages_coordinates = extract_element_coordinates(elements)
    assert len(images) == len(pages_coordinates)

    paired_pages = zip(images, pages_coordinates)
    for page_num, (page_image, page_coordinates) in enumerate(paired_pages, start=1):
        pixels = np.array(page_image)
        page_boxes = convert_coordinates_to_boxes(page_coordinates, pixels)

        missing = len(page_coordinates) - len(page_boxes)
        if missing > 0:
            print(f"{missing} elements in page {page_num} do not have coordinate data")

        draw_boxes(pixels, page_boxes, output_dir, base_name, page_num, output_type, label)


def run_partition_pdf(
    f_path,
    strategy,
    sort_mode,
    filetype,
    output_type="plot",
    output_root_dir="",
):
    """Partition a file, print its elements and draw them on the page images.

    Output goes to ``<output_root_dir>/<strategy>/<file stem>``, which is
    created if missing.

    Args:
        f_path: Path of the document to process.
        strategy: Partitioning strategy forwarded to ``partition_pdf``; also
            used as an output sub-directory name.
        sort_mode: Sort mode forwarded to ``partition_pdf``; also used as the
            label of the drawn output.
        filetype: ``"image"`` loads ``f_path`` as a single image; any other
            value renders it to page images with ``pdf2image``.
        output_type: Forwarded to ``draw_elements``.
        output_root_dir: Root directory for the generated output.
    """
    print(
        f">>> Starting run_partition_pdf - f_path: {f_path} - strategy: {strategy} "
        f"- sort_mode: {sort_mode} - filetype: {filetype}",
    )

    stem, _extension = os.path.splitext(os.path.basename(f_path))
    target_dir = os.path.join(output_root_dir, strategy, stem)
    os.makedirs(target_dir, exist_ok=True)

    treat_as_image = filetype == "image"
    if treat_as_image:
        page_images = [Image.open(f_path)]
    else:
        page_images = pdf2image.convert_from_path(f_path)

    partitioned = partition_pdf(
        filename=f_path,
        strategy=strategy,
        include_page_breaks=True,
        sort_mode=sort_mode,
        is_image=treat_as_image,
    )
    print("\n\n".join(map(str, partitioned)))

    draw_elements(partitioned, page_images, output_type, target_dir, stem, sort_mode)

    print("<<< Finished run_partition_pdf")


def run():
    """Run the evaluation using the command-line arguments.

    Reads, in order, the input path (relative to the current working
    directory), the strategy, the sort mode and the file type from
    ``sys.argv[1:5]``. Annotated images are written under
    ``examples/custom-layout-order/output`` inside the current working
    directory, which is created if missing.
    """
    argv = sys.argv
    relative_input, strategy, sort_mode, filetype = argv[1], argv[2], argv[3], argv[4]

    cwd = os.getcwd()
    out_root = os.path.join(cwd, "examples", "custom-layout-order", "output")
    os.makedirs(out_root, exist_ok=True)

    run_partition_pdf(
        os.path.join(cwd, relative_input),
        strategy,
        sort_mode,
        filetype,
        "image",
        out_root,
    )


if __name__ == "__main__":
    # Expected arguments: <file path> <strategy> <sort_mode> <filetype>.
    # Validate them up front so a bad invocation exits with status 1 before
    # any partitioning work starts.
    if len(sys.argv) < 5:
        print(
            "Please provide the path to the file name as the first argument, the strategy as the "
            "second argument, the sort_mode as the third argument, and the filetype as fourth "
            "argument.",
        )
        sys.exit(1)

    if sys.argv[3] not in [SORT_MODE_XY_CUT, SORT_MODE_BASIC, SORT_MODE_DONT]:
        print("Invalid sort mode! The sort mode should be `xy-cut`, `basic`, or `dont`")
        sys.exit(1)

    if sys.argv[4] not in ["pdf", "image"]:
        print("Invalid filetype! The filetype should be either `pdf` or `image`")
        sys.exit(1)

    run()