2023-10-05 12:53:16 -07:00
|
|
|
import os
|
|
|
|
import pathlib
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import pdf2image
|
|
|
|
from unstructured_inference.inference.elements import Rectangle
|
|
|
|
from unstructured_inference.visualize import draw_bbox
|
|
|
|
|
|
|
|
from unstructured.documents.elements import PageBreak
|
2023-12-01 12:56:31 -08:00
|
|
|
from unstructured.partition.pdf_image.pdf import partition_pdf
|
2023-10-05 12:53:16 -07:00
|
|
|
|
|
|
|
# Absolute path of the directory that contains this script file.
CUR_DIR = pathlib.Path(__file__).parent.resolve()
|
|
|
|
|
|
|
|
|
|
|
|
def extract_element_coordinates(elements):
    """Group element coordinates by page.

    Walks ``elements`` in order and collects each element's
    ``metadata.coordinates`` into a per-page list. A ``PageBreak`` closes the
    page collected so far and never contributes coordinates of its own.

    Args:
        elements: Iterable of partitioned elements in which ``PageBreak``
            instances mark page boundaries.

    Returns:
        A list with one entry per non-empty page; each entry is the list of
        that page's element coordinates in document order.
    """
    elements_coordinates = []
    page_elements_coordinates = []

    for el in elements:
        if isinstance(el, PageBreak):
            # A break is purely a separator. When nothing has been collected
            # yet (leading or consecutive breaks) it is ignored rather than
            # having its own coordinates recorded as a drawable element.
            if page_elements_coordinates:
                elements_coordinates.append(page_elements_coordinates)
                page_elements_coordinates = []
            continue
        page_elements_coordinates.append(el.metadata.coordinates)

    # Flush the last page when the input does not end with a PageBreak.
    if page_elements_coordinates:
        elements_coordinates.append(page_elements_coordinates)

    return elements_coordinates
|
|
|
|
|
|
|
|
|
|
|
|
def run_partition_pdf(f_path, strategy, images, output_dir):
    """Partition a PDF and save its page images with element boxes drawn.

    Args:
        f_path: Path of the PDF handed to ``partition_pdf``.
        strategy: Partitioning strategy passed to ``partition_pdf``; also
            used as the prefix of each output file name.
        images: Page images, paired in order with the per-page coordinate
            lists extracted from the partitioned elements.
        output_dir: Directory the annotated ``.jpg`` files are saved into.

    Raises:
        ValueError: If the number of page images differs from the number of
            per-page coordinate lists.
    """
    elements = partition_pdf(
        f_path,
        strategy=strategy,
        include_page_breaks=True,
    )

    elements_coordinates = extract_element_coordinates(elements)
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and ``zip`` below would then silently drop pages.
    if len(images) != len(elements_coordinates):
        raise ValueError(
            f"Page count mismatch: {len(images)} page image(s) but "
            f"{len(elements_coordinates)} page(s) of element coordinates."
        )

    pages = zip(images, elements_coordinates)
    for page_number, (img, coords_per_page) in enumerate(pages, start=1):
        for coordinate in coords_per_page:
            if coordinate is None:
                # An element without coordinates has no box to draw.
                continue
            points = coordinate.points
            # NOTE(review): points[0] and points[2] are taken as opposite
            # corners of the box -- confirm against the point ordering used
            # by the coordinates metadata.
            x1, y1 = points[0]
            x2, y2 = points[2]
            rect = Rectangle(x1, y1, x2, y2)
            img = draw_bbox(img, rect, color="red")

        output_image_path = os.path.join(output_dir, f"{strategy}-{page_number}.jpg")
        print(f"output_image_path: {output_image_path}")
        img.save(output_image_path)
|
|
|
|
|
|
|
|
|
|
|
|
def run(f_path, strategy, output_basedir=None):
    """Render a PDF to page images and write annotated copies to disk.

    Output goes to ``<output_basedir>/<pdf file name without extension>/``.

    Args:
        f_path: Path of the PDF to process.
        strategy: Partitioning strategy forwarded to ``run_partition_pdf``.
        output_basedir: Base directory for the output. Defaults to the
            ``output`` directory next to this script, so the function no
            longer depends on a global that only exists when the module is
            executed as a script.
    """
    if output_basedir is None:
        output_basedir = os.path.join(CUR_DIR, "output")

    f_basename = os.path.splitext(os.path.basename(f_path))[0]
    output_dir_path = os.path.join(output_basedir, f_basename)
    # makedirs also creates the base directory when it is missing.
    os.makedirs(output_dir_path, exist_ok=True)

    images = pdf2image.convert_from_path(f_path)
    run_partition_pdf(f_path, strategy, images, output_dir_path)
|
|
|
|
|
|
|
|
|
2023-10-17 08:45:12 -04:00
|
|
|
if __name__ == "__main__":
    # Usage: <script> <path to pdf> <strategy>
    if len(sys.argv) <= 2:
        usage_message = (
            "Please provide the path to the file name as the first argument "
            "and the strategy as the second argument."
        )
        print(usage_message)
        raise SystemExit(1)

    # Reject anything that is not one of the supported strategy names.
    if sys.argv[2] not in ("auto", "hi_res", "ocr_only", "fast"):
        print("Invalid strategy")
        raise SystemExit(1)

    # All results are written beneath an "output" directory next to the script.
    output_basedir_path = os.path.join(CUR_DIR, "output")
    os.makedirs(output_basedir_path, exist_ok=True)

    run(f_path=sys.argv[1], strategy=sys.argv[2])
|