mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-27 10:56:47 +00:00

### Description Currently, linting only takes place over the base `unstructured` directory, but we support Python files throughout the repo. It makes sense for all of those files to abide by the same linting rules, so the entire repo is now inspected when the linters are run. Along with that, autoflake was added as a linter; it brings added benefits such as automatically removing unused imports, which would otherwise break flake8 and require manual intervention. The only really relevant changes in this PR are in the `Makefile`, `setup.cfg`, and `requirements/test.in`. The rest is the result of running the linters.
170 lines
5.3 KiB
Python
170 lines
5.3 KiB
Python
import os
|
|
import sys
|
|
|
|
import cv2
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import pdf2image
|
|
from PIL import Image
|
|
|
|
from unstructured.documents.elements import PageBreak
|
|
from unstructured.partition.pdf import partition_pdf
|
|
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
|
|
from unstructured.partition.utils.xycut import (
|
|
bbox2points,
|
|
recursive_xy_cut,
|
|
vis_polygons_with_index,
|
|
)
|
|
|
|
|
|
def show_plot(image, desired_width=None):
    """Display ``image`` with matplotlib.

    Args:
        image: Array whose ``shape`` unpacks into exactly three components
            (height, width, and a third component that is ignored here).
        desired_width: Optional figure width. When given (and truthy), the figure
            height is derived from the image's width/height ratio; otherwise
            matplotlib's default figure size is used.
    """
    rows, cols, _ = image.shape

    if not desired_width:
        # No explicit size requested: let matplotlib pick the figure size.
        _, axes = plt.subplots()
    else:
        # Keep the image's aspect ratio when sizing the figure.
        width_over_height = cols / rows
        figure_height = desired_width / width_over_height
        _, axes = plt.subplots(figsize=(desired_width, figure_height))

    axes.imshow(image)
    plt.show()
|
|
|
|
|
|
def extract_element_coordinates(elements):
    """Group element coordinates by page.

    Walks ``elements`` in order and collects each element's
    ``metadata.coordinates``; a ``PageBreak`` closes the page being collected.

    Args:
        elements: Iterable of elements in which ``PageBreak`` instances separate
            pages. Every other element must expose ``metadata.coordinates``.

    Returns:
        A list with one entry per non-empty page; each entry is the list of that
        page's coordinates in element order. Pages without any element produce
        no entry.
    """
    pages = []
    current_page = []

    for el in elements:
        if isinstance(el, PageBreak):
            # A page break is purely a separator: it must never contribute its own
            # coordinates, even when the page it closes turned out to be empty.
            if current_page:
                pages.append(current_page)
                current_page = []
            continue
        current_page.append(el.metadata.coordinates)

    # Flush the last page, which is not followed by a page break.
    if current_page:
        pages.append(current_page)

    return pages
|
|
|
|
|
|
def convert_coordinates_to_boxes(coordinates, image):
    """Scale element coordinates into integer pixel boxes for ``image``.

    Args:
        coordinates: Iterable of coordinate objects. Each one exposes ``points``
            (index 0 is read as the left/top corner and index 2 as the
            right/bottom corner) and ``system`` with ``width`` and ``height``.
            ``None`` entries (elements without coordinates) are skipped.
        image: Array whose ``shape`` starts with ``(height, width)``; both
            2-D (grayscale) and 3-D (multi-channel) arrays are accepted.

    Returns:
        A list of ``[left, top, right, bottom]`` lists of ints, expressed in the
        pixel space of ``image``.
    """
    # The image size is the same for every coordinate, so read it once.
    image_height, image_width = image.shape[:2]

    boxes = []
    for coordinate in coordinates:
        if coordinate is None:
            # Nothing to draw for an element that carries no coordinates.
            continue

        points = coordinate.points
        left, top = points[0]
        right, bottom = points[2]
        system_width = coordinate.system.width
        system_height = coordinate.system.height

        # Rescale from the coordinate system's size to the image's pixel size.
        boxes.append(
            [
                int(left * image_width / system_width),
                int(top * image_height / system_height),
                int(right * image_width / system_width),
                int(bottom * image_height / system_height),
            ]
        )

    return boxes
|
|
|
|
|
|
def order_boxes(boxes):
    """Return ``boxes`` reordered according to ``recursive_xy_cut``.

    Args:
        boxes: Sequence of boxes. The values are cast to ``int`` before being
            handed to ``recursive_xy_cut``.

    Returns:
        A list containing the input boxes in the order produced by
        ``recursive_xy_cut``; an empty list when ``boxes`` is empty.
    """
    # ``len`` (rather than truthiness) so that numpy arrays are accepted too.
    if len(boxes) == 0:
        # Nothing to order; an empty index list would also be an invalid
        # (float-typed) numpy index below.
        return []

    boxes_array = np.asarray(boxes)
    order = []
    # ``recursive_xy_cut`` fills ``order`` in place with indices into the boxes.
    recursive_xy_cut(boxes_array.astype(int), np.arange(len(boxes_array)), order)
    return boxes_array[np.array(order)].tolist()
|
|
|
|
|
|
def draw_boxes(image, boxes, output_dir, base_name, page_num, output_type, label):
    """Annotate ``image`` with indexed boxes, then plot and/or save the result.

    Args:
        image: Image array to annotate.
        boxes: Boxes, each converted to a polygon with ``bbox2points``.
        output_dir: Directory that receives the saved image.
        base_name: File-name prefix of the saved image.
        page_num: Page number used in the printed header and the file name.
        output_type: ``"plot"`` to display, ``"image"`` to save, ``"all"`` for both.
        label: Label used in the printed header and the file name.
    """
    polygons = [bbox2points(box) for box in boxes]
    annotated = vis_polygons_with_index(image, polygons)

    wants_plot = output_type in ("plot", "all")
    wants_file = output_type in ("image", "all")

    if wants_plot:
        print(f"{label} elements - Page: {page_num}")
        show_plot(annotated, desired_width=20)

    if wants_file:
        file_name = f"{base_name}_{page_num}_{label}.jpg"
        cv2.imwrite(os.path.join(output_dir, file_name), annotated)
|
|
|
|
|
|
def draw_elements(elements, images, output_type, output_dir, base_name, label):
    """Draw the bounding boxes of ``elements`` on top of the page ``images``.

    Args:
        elements: Partitioned elements, with page breaks separating pages.
        images: One image per page, in page order.
        output_type: Forwarded to ``draw_boxes`` (``"plot"``, ``"image"`` or ``"all"``).
        output_dir: Directory that receives saved images.
        base_name: File-name prefix for saved images.
        label: Label forwarded to ``draw_boxes``.

    Raises:
        ValueError: If the number of page images differs from the number of pages
            of element coordinates.
    """
    elements_coordinates = extract_element_coordinates(elements)

    # Explicit check instead of ``assert``: asserts are stripped under ``python -O``
    # and ``zip`` below would then silently drop the unmatched pages.
    if len(images) != len(elements_coordinates):
        raise ValueError(
            f"Page count mismatch: got {len(images)} page image(s) but "
            f"{len(elements_coordinates)} page(s) of element coordinates.",
        )

    # Pages are numbered from 1 in the output.
    for page_num, (img, coords_per_page) in enumerate(zip(images, elements_coordinates), start=1):
        image = np.array(img)
        boxes = convert_coordinates_to_boxes(coords_per_page, image)
        draw_boxes(image, boxes, output_dir, base_name, page_num, output_type, label)
|
|
|
|
|
|
def run_partition_pdf(
    f_path,
    strategy,
    sort_mode,
    filetype,
    output_type="plot",
    output_root_dir="",
):
    """Partition a PDF or image file and draw the resulting element boxes.

    Args:
        f_path: Path to the input file.
        strategy: Partitioning strategy forwarded to ``partition_pdf``; also used
            as a sub-directory name for the output.
        sort_mode: Sort mode forwarded to ``partition_pdf``; also used as the label
            of the drawn output.
        filetype: ``"image"`` to treat the input as a single image; any other
            value renders the input as a PDF via ``pdf2image``.
        output_type: ``"plot"``, ``"image"`` or ``"all"`` (see ``draw_boxes``).
        output_root_dir: Root directory under which
            ``<strategy>/<input base name>`` is created for the output.
    """
    print(
        f">>> Starting run_partition_pdf - f_path: {f_path} - strategy: {strategy} "
        f"- sort_mode: {sort_mode} - filetype: {filetype}",
    )
    f_base_name = os.path.splitext(os.path.basename(f_path))[0]

    output_dir = os.path.join(output_root_dir, strategy, f_base_name)
    os.makedirs(output_dir, exist_ok=True)

    is_image = filetype == "image"

    if is_image:
        # ``Image.open`` is lazy and keeps the file open. Convert inside the
        # context manager so the pixels are loaded and the handle is released.
        # Converting to RGB also gives grayscale/palette/alpha inputs the
        # 3-channel layout the drawing helpers unpack (presumably what the PDF
        # path yields as well - TODO confirm).
        with Image.open(f_path) as opened_image:
            images = [opened_image.convert("RGB")]
    else:
        images = pdf2image.convert_from_path(f_path)

    ordered_elements = partition_pdf(
        filename=f_path,
        strategy=strategy,
        include_page_breaks=True,
        sort_mode=sort_mode,
        is_image=is_image,
    )
    print("\n\n".join([str(el) for el in ordered_elements]))

    draw_elements(ordered_elements, images, output_type, output_dir, f_base_name, sort_mode)

    print("<<< Finished run_partition_pdf")
|
|
|
|
|
|
def run():
    """Run the partition demo using the four positional command-line arguments.

    Reads the file sub-path, strategy, sort mode and filetype from ``sys.argv``
    (positions 1 to 4), resolves the input path against the current working
    directory, and writes images below ``examples/custom-layout-order/output``.
    """
    argv = sys.argv
    f_sub_path, strategy, sort_mode, filetype = argv[1], argv[2], argv[3], argv[4]

    cwd = os.getcwd()
    output_root_dir = os.path.join(cwd, "examples", "custom-layout-order", "output")
    os.makedirs(output_root_dir, exist_ok=True)

    input_path = os.path.join(cwd, f_sub_path)
    run_partition_pdf(input_path, strategy, sort_mode, filetype, "image", output_root_dir)
|
|
|
|
|
|
if __name__ == "__main__":
    # Validate the command line up front. Every failure is reported on stderr and
    # exits with status 1 before any partitioning work starts.
    if len(sys.argv) < 5:
        print(
            "Please provide the path to the file name as the first argument, the strategy as the "
            "second argument, the sort_mode as the third argument, and the filetype as fourth "
            "argument.",
            file=sys.stderr,
        )
        sys.exit(1)

    if sys.argv[3] not in [SORT_MODE_XY_CUT, SORT_MODE_BASIC, SORT_MODE_DONT]:
        print(
            "Invalid sort mode! The sort mode should be `xy-cut`, `basic`, or `dont`",
            file=sys.stderr,
        )
        sys.exit(1)

    if sys.argv[4] not in ["pdf", "image"]:
        print(
            "Invalid filetype! The filetype should be either `pdf` or `image`",
            file=sys.stderr,
        )
        sys.exit(1)

    run()
|