Refactor: support layout analysis (#2273)

### Summary
This PR is the second part of the "layout analysis" refactor, which moves
the layout analysis code from the unstructured-inference repo to the
unstructured repo. The first part was done in
https://github.com/Unstructured-IO/unstructured-inference/pull/305. This
PR adds logic to support annotating `inferred` and `extracted` elements.

### Testing

```
PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type>
```
e.g.
```
PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf
```
This commit is contained in:
Christine Straub 2023-12-18 22:21:56 -08:00 committed by GitHub
parent 09f86f28fb
commit 096d23bc28
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 230 additions and 78 deletions

View File

@ -1,7 +1,9 @@
## 0.11.6-dev1
## 0.11.6-dev2
### Enhancements
* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements.
### Features
### Fixes

View File

@ -3,7 +3,8 @@ import pathlib
import sys
import pdf2image
from unstructured_inference.inference.elements import Rectangle
from PIL import Image
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.visualize import draw_bbox
from unstructured.documents.elements import PageBreak
@ -29,11 +30,14 @@ def extract_element_coordinates(elements):
return elements_coordinates
def run_partition_pdf(f_path, strategy, images, output_dir):
def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image):
elements = partition_pdf(
f_path,
strategy=strategy,
is_image=is_image,
include_page_breaks=True,
analysis=True,
analyzed_image_output_dir_path=output_dir,
)
elements_coordinates = extract_element_coordinates(elements)
@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir):
points = coordinate.points
x1, y1 = points[0]
x2, y2 = points[2]
rect = Rectangle(x1, y1, x2, y2)
img = draw_bbox(img, rect, color="red")
el = TextRegion.from_coords(x1, y1, x2, y2)
img = draw_bbox(img, el, color="red")
output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg")
output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg")
img.save(output_image_path)
print(f"output_image_path: {output_image_path}")
img.save(output_image_path)
def run(f_path, strategy):
def run(f_path, strategy, document_type):
f_basename = os.path.splitext(os.path.basename(f_path))[0]
output_dir_path = os.path.join(output_basedir_path, f_basename)
os.makedirs(output_dir_path, exist_ok=True)
images = pdf2image.convert_from_path(f_path)
run_partition_pdf(f_path, strategy, images, output_dir_path)
is_image = document_type == "image"
if is_image:
with Image.open(f_path) as img:
img = img.convert("RGB")
images = [img]
else:
images = pdf2image.convert_from_path(f_path)
run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image)
if __name__ == "__main__":
@ -74,7 +84,11 @@ if __name__ == "__main__":
print("Invalid strategy")
sys.exit(1)
if sys.argv[3] not in ["pdf", "image"]:
print("Invalid document type")
sys.exit(1)
output_basedir_path = os.path.join(CUR_DIR, "output")
os.makedirs(output_basedir_path, exist_ok=True)
run(f_path=sys.argv[1], strategy=sys.argv[2])
run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3])

View File

@ -1 +1 @@
__version__ = "0.11.6-dev1" # pragma: no cover
__version__ = "0.11.6-dev2" # pragma: no cover

View File

@ -74,9 +74,13 @@ from unstructured.partition.lang import (
prepare_languages_for_tesseract,
)
from unstructured.partition.pdf_image.pdf_image_utils import (
annotate_layout_elements,
check_element_types_to_extract,
save_elements,
)
from unstructured.partition.pdf_image.pdfminer_processing import (
merge_inferred_with_extracted_layout,
)
from unstructured.partition.pdf_image.pdfminer_utils import (
open_pdfminer_pages_generator,
rect_to_bbox,
@ -247,6 +251,8 @@ def _partition_pdf_or_image_local(
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
analysis: bool = False,
analyzed_image_output_dir_path: Optional[str] = None,
**kwargs,
) -> List[Element]:
"""Partition using package installed locally"""
@ -286,14 +292,27 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)
if pdf_text_extractable is True:
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_file_with_pdfminer(
inferred_document_layout,
filename,
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
if analysis:
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
filename=filename,
output_dir_path=analyzed_image_output_dir_path,
pdf_image_dpi=pdf_image_dpi,
is_image=is_image,
)
else:
merged_document_layout = inferred_document_layout
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
@ -317,14 +336,16 @@ def _partition_pdf_or_image_local(
)
if hasattr(file, "seek"):
file.seek(0)
if pdf_text_extractable is True:
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = process_data_with_pdfminer(
inferred_document_layout,
file,
)
else:
merged_document_layout = inferred_document_layout
extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
)
if model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
@ -655,7 +676,7 @@ def _process_pdfminer_pages(
urls_metadata.append(map_bbox_and_index(words, annot))
if hasattr(obj, "get_text"):
_text_snippets: List[str | Any] = [obj.get_text()] # type: ignore
_text_snippets: List = [obj.get_text()]
else:
_text = _extract_text(obj)
_text_snippets = re.split(PARAGRAPH_PATTERN, _text)

View File

@ -13,6 +13,8 @@ from unstructured.logger import logger
from unstructured.partition.common import convert_to_bytes
if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
from unstructured.documents.elements import Element
@ -159,3 +161,118 @@ def valid_text(text: str) -> bool:
if not text:
return False
return "(cid:" not in text
def annotate_layout_elements_with_image(
    inferred_page_layout: "PageLayout",
    extracted_page_layout: Optional["PageLayout"],
    output_dir_path: str,
    output_f_basename: str,
    page_number: int,
):
    """
    Write annotated images of one page's layouts to `output_dir_path`.

    The inferred layout is always annotated (in blue). The extracted layout is annotated
    (in green) only when `extracted_page_layout` is truthy. Each annotated image is produced
    by the layout's own `annotate` method and handed to `write_image` under the name
    `<output_f_basename>_<page_number>_<label>.jpg`, where the label is `inferred` or
    `extracted`. The path of every written image is printed to stdout.
    """
    # One entry per layout to draw; insertion order determines the output order.
    annotation_specs = {"inferred": {"layout": inferred_page_layout, "color": "blue"}}
    if extracted_page_layout:
        annotation_specs["extracted"] = {"layout": extracted_page_layout, "color": "green"}

    for label, spec in annotation_specs.items():
        annotated_img = spec["layout"].annotate(colors=spec["color"])
        output_f_name = f"{output_f_basename}_{page_number}_{label}.jpg"
        output_f_path = os.path.join(output_dir_path, output_f_name)
        write_image(annotated_img, output_f_path)
        print(f"output_image_path: {output_f_path}")
def annotate_layout_elements(
    inferred_document_layout: "DocumentLayout",
    extracted_layout: List[List["TextRegion"]],
    filename: str,
    output_dir_path: str,
    pdf_image_dpi: int,
    is_image: bool = False,
) -> None:
    """
    Annotates layout elements on images extracted from a PDF or an image file.

    For an image file (`is_image=True`) the file is opened, converted to RGB and treated as
    page 1. For a PDF, every page is rendered to an image with `pdf2image` at `pdf_image_dpi`
    into a temporary directory and processed in page order. For each page the inferred layout
    and, when `extracted_layout` is non-empty, the extracted layout are passed to
    `annotate_layout_elements_with_image`.

    Args:
        inferred_document_layout: Document layout whose `pages` are indexed by page position.
        extracted_layout: Extracted regions, one list per page; indexed by page position.
            An empty list disables the `extracted` annotation.
        filename: Path of the PDF or image file to render.
        output_dir_path: Directory that receives the annotated images.
        pdf_image_dpi: DPI used when rendering PDF pages to images.
        is_image: Whether `filename` is a standalone image rather than a PDF.

    Raises:
        FileNotFoundError: If any error occurs and `filename` is neither an existing file
            nor an existing directory.
        Exception: Any other error is re-raised unchanged when `filename` does exist.
    """
    from unstructured_inference.inference.layout import PageLayout

    output_f_basename = os.path.splitext(os.path.basename(filename))[0]

    def _annotate_page(img, page_index: int) -> None:
        # Annotate the inferred (and, if available, extracted) layout of a single page.
        page_number = page_index + 1

        extracted_page_layout = None
        if extracted_layout:
            extracted_page_layout = PageLayout(
                number=page_number,
                image=img,
            )
            extracted_page_layout.elements = extracted_layout[page_index]

        inferred_page_layout = inferred_document_layout.pages[page_index]
        # NOTE(review): the page keeps this reference after the caller's `with` block exits
        # (the attribute is not restored) - confirm nothing downstream relies on page.image.
        inferred_page_layout.image = img

        annotate_layout_elements_with_image(
            inferred_page_layout=inferred_page_layout,
            extracted_page_layout=extracted_page_layout,
            output_dir_path=output_dir_path,
            output_f_basename=output_f_basename,
            page_number=page_number,
        )

    try:
        if is_image:
            with Image.open(filename) as img:
                _annotate_page(img.convert("RGB"), 0)
        else:
            # The rendered page images only need to live while they are being annotated.
            with tempfile.TemporaryDirectory() as temp_dir:
                _image_paths = pdf2image.convert_from_path(
                    filename,
                    dpi=pdf_image_dpi,
                    output_folder=temp_dir,
                    paths_only=True,
                )
                image_paths = cast(List[str], _image_paths)
                for i, image_path in enumerate(image_paths):
                    with Image.open(image_path) as img:
                        _annotate_page(img, i)
    except Exception as e:
        if os.path.isdir(filename) or os.path.isfile(filename):
            raise
        raise FileNotFoundError(f'File "{filename}" not found!') from e

View File

@ -7,7 +7,7 @@ from unstructured_inference.inference.elements import (
TextRegion,
)
from unstructured_inference.inference.layoutelement import (
merge_inferred_layout_with_extracted_layout,
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
)
from unstructured_inference.inference.ordering import order_layout
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
@ -25,62 +25,20 @@ if TYPE_CHECKING:
def process_file_with_pdfminer(
inferred_document_layout: "DocumentLayout",
filename: str = "",
) -> "DocumentLayout":
dpi: int = 200,
) -> List[List[TextRegion]]:
with open_filename(filename, "rb") as fp:
fp = cast(BinaryIO, fp)
inferred_document_layout = process_data_with_pdfminer(
inferred_document_layout=inferred_document_layout,
extracted_layout = process_data_with_pdfminer(
file=fp,
dpi=dpi,
)
return inferred_document_layout
return extracted_layout
def process_data_with_pdfminer(
inferred_document_layout: "DocumentLayout",
file: Optional[Union[bytes, BinaryIO]] = None,
) -> "DocumentLayout":
"""Process document data using PDFMiner to extract layout information."""
extracted_layouts = get_regions_by_pdfminer(file)
inferred_pages = inferred_document_layout.pages
for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)):
inferred_layout = inferred_page.elements
image_metadata = inferred_page.image_metadata
w = image_metadata.get("width")
h = image_metadata.get("height")
image_size = (w, h)
threshold_kwargs = {}
# NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
# In other case the default values for the functions are used
if (
isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
and "R_50" not in inferred_page.detection_model.model_path
):
threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
merged_layout = merge_inferred_layout_with_extracted_layout(
inferred_layout=inferred_layout,
extracted_layout=extracted_layout,
page_image_size=image_size,
**threshold_kwargs,
)
elements = inferred_page.get_elements_from_layout(
layout=cast(List[TextRegion], merged_layout),
pdf_objects=extracted_layout,
)
inferred_page.elements[:] = elements
return inferred_document_layout
def get_regions_by_pdfminer(
fp: Optional[Union[bytes, BinaryIO]],
dpi: int = 200,
) -> List[List[TextRegion]]:
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
@ -89,7 +47,7 @@ def get_regions_by_pdfminer(
layouts = []
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
for page, page_layout in open_pdfminer_pages_generator(fp):
for page, page_layout in open_pdfminer_pages_generator(file):
height = page_layout.height
layout: List["TextRegion"] = []
@ -129,3 +87,43 @@ def get_regions_by_pdfminer(
layouts.append(layout)
return layouts
def merge_inferred_with_extracted_layout(
    inferred_document_layout: "DocumentLayout",
    extracted_layout: List[List[TextRegion]],
) -> "DocumentLayout":
    """Merge the extracted regions into each inferred page, in place.

    Pages of `inferred_document_layout` are paired positionally with the entries of
    `extracted_layout`; pairing stops at the shorter of the two, so pages without a matching
    entry are left untouched. For every pair, the page's elements are merged with the
    extracted regions, converted via the page's `get_elements_from_layout`, and written back
    into the page's existing `elements` list.

    Returns the same `inferred_document_layout` object that was passed in.
    """
    for page, page_regions in zip(inferred_document_layout.pages, extracted_layout):
        current_elements = page.elements
        metadata = page.image_metadata
        page_image_size = (metadata.get("width"), metadata.get("height"))

        # The merge thresholds are overridden only for Detectron ONNX models whose model
        # path does not contain "R_50"; otherwise the merge function's defaults are used.
        use_custom_thresholds = (
            isinstance(page.detection_model, UnstructuredDetectronONNXModel)
            and "R_50" not in page.detection_model.model_path
        )
        threshold_kwargs = (
            {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
            if use_custom_thresholds
            else {}
        )

        merged_regions = merge_inferred_with_extracted_page(
            inferred_layout=current_elements,
            extracted_layout=page_regions,
            page_image_size=page_image_size,
            **threshold_kwargs,
        )

        # Slice assignment keeps the identity of the page's element list.
        page.elements[:] = page.get_elements_from_layout(
            layout=cast(List[TextRegion], merged_regions),
            pdf_objects=page_regions,
        )

    return inferred_document_layout