unstructured/examples/custom-layout-order/evaluate_natural_reading_order.py

176 lines
5.5 KiB
Python
Raw Normal View History

import os
import sys
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pdf2image
from PIL import Image
from unstructured.documents.elements import PageBreak
Refactor: support merging `extracted` layout with `inferred` layout (#2158) ### Summary This PR is the second part of `pdfminer` refactor to move it from `unstructured-inference` repo to `unstructured` repo, the first part is done in https://github.com/Unstructured-IO/unstructured-inference/pull/294. This PR adds logic to merge the extracted layout with the inferred layout. The updated workflow for the `hi_res` strategy: * pass the document (as data/filename) to the `inference` repo to get `inferred_layout` (DocumentLayout) * pass the `inferred_layout` returned from the `inference` repo and the document (as data/filename) to the `pdfminer_processing` module, which first opens the document (create temp file/dir as needed), and splits the document by pages * if is_image is `True`, return the passed inferred_layout(DocumentLayout) * if is_image is `False`: * get extracted_layout (TextRegions) from the passed document(data/filename) by pdfminer * merge `extracted_layout` (TextRegions) with the passed `inferred_layout` (DocumentLayout) * return the `inferred_layout `(DocumentLayout) with updated elements (all merged LayoutElements) as merged_layout (DocumentLayout) * pass merged_layout and the document (as data/filename) to the `OCR` module, which first opens the document (create temp file/dir as needed), and splits the document by pages (convert PDF pages to image pages for PDF file) ### Note This PR also fixes issue #2164 by using functionality similar to the one implemented in the `fast` strategy workflow when extracting elements by `pdfminer`. ### TODO * image extraction refactor to move it from `unstructured-inference` repo to `unstructured` repo * improving natural reading order by applying the current default `xycut` sorting to the elements extracted by `pdfminer`
2023-12-01 12:56:31 -08:00
from unstructured.partition.pdf_image.pdf import partition_pdf
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import (
bbox2points,
recursive_xy_cut,
vis_polygons_with_index,
)
def show_plot(image, desired_width=None):
    """Display ``image`` in a matplotlib figure.

    Args:
        image: Array-like whose ``shape`` unpacks into three values
            (height, width, channels).
        desired_width: Optional figure width handed to ``figsize``. When it is
            truthy, the figure height is derived from it so the figure keeps
            the image's width/height ratio; otherwise matplotlib's default
            figure size is used.
    """
    rows, cols, _ = image.shape

    subplot_kwargs = {}
    if desired_width:
        # Dividing the width by (width / height) yields the matching height.
        ratio = cols / rows
        subplot_kwargs["figsize"] = (desired_width, desired_width / ratio)

    _, axes = plt.subplots(**subplot_kwargs)
    axes.imshow(image)
    plt.show()
def extract_element_coordinates(elements):
    """Group the coordinates of ``elements`` into one list per page.

    Elements are read in order. Every ``PageBreak`` closes the page currently
    being collected (if it holds anything); every other element contributes
    its ``metadata.coordinates`` to the current page.

    Args:
        elements: Iterable of partitioned elements, possibly containing
            ``PageBreak`` markers.

    Returns:
        A list of pages, each page being the list of ``metadata.coordinates``
        values of its elements. Pages without any element are not emitted.
    """
    pages = []
    current_page = []

    for el in elements:
        if isinstance(el, PageBreak):
            # A page break is only a separator. It must never be recorded as
            # an element of the page, even when the page collected so far is
            # empty (leading or consecutive page breaks).
            if current_page:
                pages.append(current_page)
                current_page = []
            continue
        current_page.append(el.metadata.coordinates)

    # Flush the last page when the input does not end with a page break.
    if current_page:
        pages.append(current_page)

    return pages
def convert_coordinates_to_boxes(coordinates, image):
    """Scale element coordinates into integer pixel boxes for ``image``.

    Args:
        coordinates: Iterable of coordinate objects. Each one exposes
            ``points`` (index 0 is read as the (left, top) corner and index 2
            as the (right, bottom) corner) and ``system.width`` /
            ``system.height``, the extent the points are expressed in. Falsy
            entries such as ``None`` are skipped.
        image: Array whose ``shape`` starts with (height, width). A trailing
            channel axis is optional, so 2-D grayscale arrays are accepted.

    Returns:
        A list of ``[left, top, right, bottom]`` lists of ints, one per entry
        that carried coordinates, rescaled to the image's pixel size.
    """
    # The image size does not change between elements, so read it once. Only
    # the first two axes are needed, which also covers channel-less images.
    image_height, image_width = image.shape[:2]

    boxes = []
    for coordinate in coordinates:
        if not coordinate:
            continue

        points = coordinate.points
        src_left, src_top = points[0]
        src_right, src_bottom = points[2]
        src_width = coordinate.system.width
        src_height = coordinate.system.height

        # Multiply before dividing to keep the same rounding as a plain
        # proportional rescale of each edge.
        left = src_left * image_width / src_width
        right = src_right * image_width / src_width
        top = src_top * image_height / src_height
        bottom = src_bottom * image_height / src_height
        boxes.append([int(left), int(top), int(right), int(bottom)])

    return boxes
def order_boxes(boxes):
    """Return ``boxes`` rearranged into the order produced by XY-cut.

    Args:
        boxes: Sequence of boxes (e.g. ``[left, top, right, bottom]`` lists).

    Returns:
        A list holding the same boxes, reordered according to the indices
        that ``recursive_xy_cut`` emits. An empty input yields an empty list.
    """
    # Nothing to sort; indexing an empty array with an empty (float-typed)
    # index array would raise, so answer directly.
    if len(boxes) == 0:
        return []

    box_array = np.asarray(boxes)
    order = []
    # The cut works on an integer copy; the original values are what gets
    # returned, so non-integer inputs are not truncated in the result.
    recursive_xy_cut(box_array.astype(int), np.arange(len(box_array)), order)
    return box_array[np.asarray(order, dtype=int)].tolist()
def draw_boxes(image, boxes, output_dir, base_name, page_num, output_type, label):
    """Annotate ``image`` with indexed boxes, then show and/or save the result.

    Args:
        image: Page image the boxes are drawn onto.
        boxes: Boxes to draw; each is converted with ``bbox2points``.
        output_dir: Directory the annotated image is written to.
        base_name: File-name prefix of the saved image.
        page_num: Page number used in the log line and the file name.
        output_type: ``"plot"`` shows the image, ``"image"`` saves it and
            ``"all"`` does both; any other value does neither.
        label: Tag used in the log line and as the file-name suffix.
    """
    polygons = list(map(bbox2points, boxes))
    annotated = vis_polygons_with_index(image, polygons)

    wants_plot = output_type in ("plot", "all")
    wants_file = output_type in ("image", "all")

    if wants_plot:
        print(f"{label} elements - Page: {page_num}")
        show_plot(annotated, desired_width=20)

    if wants_file:
        file_name = f"{base_name}_{page_num}_{label}.jpg"
        cv2.imwrite(os.path.join(output_dir, file_name), annotated)
def draw_elements(elements, images, output_type, output_dir, base_name, label):
    """Draw the bounding boxes of ``elements`` on their page images.

    The elements are grouped per page, each page's coordinates are converted
    into pixel boxes for the matching image, and the boxes are handed to
    ``draw_boxes``. The number of pages found in ``elements`` must equal the
    number of ``images``.

    Args:
        elements: Partitioned elements, with page breaks separating pages.
        images: One image per page, in page order.
        output_type: Forwarded to ``draw_boxes``.
        output_dir: Forwarded to ``draw_boxes``.
        base_name: Forwarded to ``draw_boxes``.
        label: Forwarded to ``draw_boxes``.
    """
    pages = extract_element_coordinates(elements)

    assert len(images) == len(pages)

    # Pages are reported and saved with 1-based numbers.
    for page_num, (page_image, page_coords) in enumerate(zip(images, pages), start=1):
        pixels = np.array(page_image)
        boxes = convert_coordinates_to_boxes(page_coords, pixels)

        # Entries without coordinates produce no box; report how many.
        missing = len(page_coords) - len(boxes)
        if missing > 0:
            print(f"{missing} elements in page {page_num} do not have coordinate data")

        draw_boxes(pixels, boxes, output_dir, base_name, page_num, output_type, label)
def run_partition_pdf(
    f_path,
    strategy,
    sort_mode,
    filetype,
    output_type="plot",
    output_root_dir="",
):
    """Partition a document and visualize its elements in their sorted order.

    Args:
        f_path: Path of the PDF or image file to partition.
        strategy: Partitioning strategy passed to ``partition_pdf``; also used
            as a sub-directory of the output location.
        sort_mode: Sort mode passed to ``partition_pdf``; also used as the
            label of the generated visualizations.
        filetype: ``"image"`` loads the file as a single image; any other
            value converts it to page images with ``pdf2image``.
        output_type: Forwarded to ``draw_elements``.
        output_root_dir: Root under which ``<strategy>/<file stem>`` is
            created for the output.
    """
    print(
        f">>> Starting run_partition_pdf - f_path: {f_path} - strategy: {strategy} "
        f"- sort_mode: {sort_mode} - filetype: {filetype}",
    )

    # Output goes to <output_root_dir>/<strategy>/<file name without extension>.
    stem, _extension = os.path.splitext(os.path.basename(f_path))
    output_dir = os.path.join(output_root_dir, strategy, stem)
    os.makedirs(output_dir, exist_ok=True)

    is_image = filetype == "image"
    if is_image:
        images = [Image.open(f_path)]
    else:
        images = pdf2image.convert_from_path(f_path)

    ordered_elements = partition_pdf(
        filename=f_path,
        strategy=strategy,
        include_page_breaks=True,
        sort_mode=sort_mode,
        is_image=is_image,
    )

    print("\n\n".join(str(el) for el in ordered_elements))

    draw_elements(ordered_elements, images, output_type, output_dir, stem, sort_mode)

    print("<<< Finished run_partition_pdf")
def run():
    """Run the evaluation using the command-line arguments.

    Reads the file sub-path, strategy, sort mode and filetype from
    ``sys.argv[1]`` to ``sys.argv[4]``, resolves the file against the current
    working directory, and writes annotated images below
    ``examples/custom-layout-order/output`` in that directory.
    """
    f_sub_path, strategy, sort_mode, filetype = (
        sys.argv[1],
        sys.argv[2],
        sys.argv[3],
        sys.argv[4],
    )

    cwd = os.getcwd()
    output_root_dir = os.path.join(cwd, "examples", "custom-layout-order", "output")
    os.makedirs(output_root_dir, exist_ok=True)

    run_partition_pdf(
        os.path.join(cwd, f_sub_path),
        strategy,
        sort_mode,
        filetype,
        "image",
        output_root_dir,
    )
if __name__ == "__main__":
    # Validate the command line before doing any work. Expected arguments:
    #   1. path of the file, relative to the current working directory
    #   2. partitioning strategy
    #   3. sort mode
    #   4. filetype ("pdf" or "image")
    if len(sys.argv) < 5:
        print(
            "Please provide the path to the file name as the first argument, the strategy as the "
            "second argument, the sort_mode as the third argument, and the filetype as fourth "
            "argument.",
        )
        sys.exit(1)

    if sys.argv[3] not in [SORT_MODE_XY_CUT, SORT_MODE_BASIC, SORT_MODE_DONT]:
        print("Invalid sort mode! The sort mode should be `xy-cut`, `basic`, or `dont`")
        sys.exit(1)

    if sys.argv[4] not in ["pdf", "image"]:
        print("Invalid filetype! The filetype should be either `pdf` or `image`")
        sys.exit(1)

    run()