Christine Straub 69d0ee1aea
Refactor: support merging extracted layout with inferred layout (#2158)
### Summary
This PR is the second part of the `pdfminer` refactor, which moves it from
the `unstructured-inference` repo to the `unstructured` repo. The first part
was done in
https://github.com/Unstructured-IO/unstructured-inference/pull/294. This
PR adds logic to merge the extracted layout with the inferred layout.

The updated workflow for the `hi_res` strategy:
* pass the document (as data/filename) to the `inference` repo to get
`inferred_layout` (DocumentLayout)
* pass the `inferred_layout` returned from the `inference` repo and the
document (as data/filename) to the `pdfminer_processing` module, which
first opens the document (create temp file/dir as needed), and splits
the document by pages
  * if is_image is `True`, return the passed
    `inferred_layout` (DocumentLayout)
  * if is_image is `False`:
    * get `extracted_layout` (TextRegions) from the passed
      document (data/filename) by pdfminer
    * merge `extracted_layout` (TextRegions) with the passed
      `inferred_layout` (DocumentLayout)
    * return the `inferred_layout` (DocumentLayout) with updated elements
      (all merged LayoutElements) as `merged_layout` (DocumentLayout)
* pass merged_layout and the document (as data/filename) to the `OCR`
module, which first opens the document (create temp file/dir as needed),
and splits the document by pages (convert PDF pages to image pages for
PDF file)

### Note
This PR also fixes issue #2164 by using functionality similar to the one
implemented in the `fast` strategy workflow when extracting elements by
`pdfminer`.

### TODO
* image extraction refactor to move it from `unstructured-inference`
repo to `unstructured` repo
* improving natural reading order by applying the current default
`xycut` sorting to the elements extracted by `pdfminer`
2023-12-01 20:56:31 +00:00

81 lines
2.4 KiB
Python

import os
import pathlib
import sys
import pdf2image
from unstructured_inference.inference.elements import Rectangle
from unstructured_inference.visualize import draw_bbox
from unstructured.documents.elements import PageBreak
from unstructured.partition.pdf_image.pdf import partition_pdf
# Resolved directory that contains this script; the `output` directory used by
# the command-line entry point is created beneath it.
CUR_DIR = pathlib.Path(__file__).parent.resolve()
def extract_element_coordinates(elements):
    """Group the coordinates of ``elements`` into one list per page.

    Every ``PageBreak`` closes the page that is currently being collected.
    The break itself is never recorded as a page element, and a break that
    follows no elements yields an empty list so that later pages keep their
    position in the result.

    Args:
        elements: Iterable of elements; each non-``PageBreak`` item must
            expose ``metadata.coordinates``.

    Returns:
        A list of pages, each page being the list of ``metadata.coordinates``
        values of its elements, in input order. Elements that follow the last
        ``PageBreak`` form the final page.
    """
    elements_coordinates = []
    page_elements_coordinates = []

    for el in elements:
        if isinstance(el, PageBreak):
            # Always flush, even when the page is empty. Previously a break on
            # an empty page fell through and had its own coordinates stored as
            # if it were a regular element of the following page.
            elements_coordinates.append(page_elements_coordinates)
            page_elements_coordinates = []
            continue
        page_elements_coordinates.append(el.metadata.coordinates)

    # Elements after the last break (or all of them when there is no break).
    if page_elements_coordinates:
        elements_coordinates.append(page_elements_coordinates)

    return elements_coordinates
def run_partition_pdf(f_path, strategy, images, output_dir):
    """Partition a PDF and save each page image with element boxes drawn in red.

    Args:
        f_path: Path of the PDF, passed to ``partition_pdf``.
        strategy: Strategy passed to ``partition_pdf``; also used as the
            prefix of the output file names.
        images: Page images. They are paired positionally with the per-page
            coordinate lists, so there must be exactly one image per page.
        output_dir: Directory that receives ``<strategy>-<page number>.jpg``.
            It is not created here.

    Raises:
        AssertionError: If the number of pages found in the partitioned
            elements differs from ``len(images)``.
    """
    elements = partition_pdf(
        f_path,
        strategy=strategy,
        include_page_breaks=True,
    )
    elements_coordinates = extract_element_coordinates(elements)

    # Raised explicitly instead of using `assert` so the check still runs under
    # `python -O`; without it, zip() below would silently drop pages.
    if len(images) != len(elements_coordinates):
        raise AssertionError(
            f"page count mismatch: {len(images)} image(s) but "
            f"{len(elements_coordinates)} page(s) of element coordinates",
        )

    for page_number, (img, coords_per_page) in enumerate(
        zip(images, elements_coordinates),
        start=1,
    ):
        for coordinate in coords_per_page:
            # Elements without coordinate metadata have nothing to draw.
            points = getattr(coordinate, "points", None)
            if points is None:
                continue
            # NOTE(review): points[0] and points[2] are used as opposite
            # corners of the box -- confirm against the point ordering
            # produced by the partitioner.
            x1, y1 = points[0]
            x2, y2 = points[2]
            rect = Rectangle(x1, y1, x2, y2)
            img = draw_bbox(img, rect, color="red")

        output_image_path = os.path.join(output_dir, f"{strategy}-{page_number}.jpg")
        print(f"output_image_path: {output_image_path}")
        img.save(output_image_path)
def run(f_path, strategy, output_basedir=None):
    """Convert a PDF to page images and save them annotated with element boxes.

    Results are written to ``<output_basedir>/<PDF file name without extension>/``.

    Args:
        f_path: Path of the PDF file.
        strategy: Partitioning strategy forwarded to ``run_partition_pdf``.
        output_basedir: Base directory for the results. Defaults to the
            ``output`` directory inside ``CUR_DIR``.
    """
    if output_basedir is None:
        # Computed here rather than read from a module global that only the
        # `__main__` block assigns, so the function also works when imported.
        output_basedir = os.path.join(CUR_DIR, "output")

    f_basename = os.path.splitext(os.path.basename(f_path))[0]
    output_dir_path = os.path.join(output_basedir, f_basename)
    # makedirs also creates the base directory when it does not exist yet.
    os.makedirs(output_dir_path, exist_ok=True)

    images = pdf2image.convert_from_path(f_path)
    run_partition_pdf(f_path, strategy, images, output_dir_path)
if __name__ == "__main__":
    # Usage: <script> <pdf path> <strategy>
    cli_args = sys.argv
    if len(cli_args) < 3:
        print(
            "Please provide the path to the file name as the first argument "
            "and the strategy as the second argument.",
        )
        sys.exit(1)

    pdf_path, chosen_strategy = cli_args[1], cli_args[2]
    if chosen_strategy not in ("auto", "hi_res", "ocr_only", "fast"):
        print("Invalid strategy")
        sys.exit(1)

    # Base directory under which the per-document output folders are created.
    output_basedir_path = os.path.join(CUR_DIR, "output")
    os.makedirs(output_basedir_path, exist_ok=True)

    run(f_path=pdf_path, strategy=chosen_strategy)