mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 12:23:49 +00:00
Refactor: support layout analysis (#2273)
### Summary

This PR is the second part of the "layout analysis" refactor, which moves the layout analysis logic from the unstructured-inference repo to the unstructured repo. The first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/305. This PR adds logic to support annotating `inferred` and `extracted` elements.

### Testing

```
PYTHONPATH=. python examples/layout-analysis/visualization.py <file_path> <strategy> <document_type>
```

e.g.

```
PYTHONPATH=. python examples/layout-analysis/visualization.py example-docs/layout-parser-paper-fast.pdf hi_res pdf
```
This commit is contained in:
parent
09f86f28fb
commit
096d23bc28
@ -1,7 +1,9 @@
|
||||
## 0.11.6-dev1
|
||||
## 0.11.6-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Update the layout analysis script.** The previous script only supported annotating `final` elements. The updated script also supports annotating `inferred` and `extracted` elements.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
@ -3,7 +3,8 @@ import pathlib
|
||||
import sys
|
||||
|
||||
import pdf2image
|
||||
from unstructured_inference.inference.elements import Rectangle
|
||||
from PIL import Image
|
||||
from unstructured_inference.inference.elements import TextRegion
|
||||
from unstructured_inference.visualize import draw_bbox
|
||||
|
||||
from unstructured.documents.elements import PageBreak
|
||||
@ -29,11 +30,14 @@ def extract_element_coordinates(elements):
|
||||
return elements_coordinates
|
||||
|
||||
|
||||
def run_partition_pdf(f_path, strategy, images, output_dir):
|
||||
def run_partition_pdf(f_path, strategy, images, output_dir, output_f_basename, is_image):
|
||||
elements = partition_pdf(
|
||||
f_path,
|
||||
strategy=strategy,
|
||||
is_image=is_image,
|
||||
include_page_breaks=True,
|
||||
analysis=True,
|
||||
analyzed_image_output_dir_path=output_dir,
|
||||
)
|
||||
|
||||
elements_coordinates = extract_element_coordinates(elements)
|
||||
@ -44,22 +48,28 @@ def run_partition_pdf(f_path, strategy, images, output_dir):
|
||||
points = coordinate.points
|
||||
x1, y1 = points[0]
|
||||
x2, y2 = points[2]
|
||||
rect = Rectangle(x1, y1, x2, y2)
|
||||
img = draw_bbox(img, rect, color="red")
|
||||
el = TextRegion.from_coords(x1, y1, x2, y2)
|
||||
img = draw_bbox(img, el, color="red")
|
||||
|
||||
output_image_path = os.path.join(output_dir, f"{strategy}-{idx + 1}.jpg")
|
||||
output_image_path = os.path.join(output_dir, f"{output_f_basename}_{idx + 1}_final.jpg")
|
||||
img.save(output_image_path)
|
||||
print(f"output_image_path: {output_image_path}")
|
||||
|
||||
img.save(output_image_path)
|
||||
|
||||
|
||||
def run(f_path, strategy):
|
||||
def run(f_path, strategy, document_type):
|
||||
f_basename = os.path.splitext(os.path.basename(f_path))[0]
|
||||
output_dir_path = os.path.join(output_basedir_path, f_basename)
|
||||
os.makedirs(output_dir_path, exist_ok=True)
|
||||
|
||||
images = pdf2image.convert_from_path(f_path)
|
||||
run_partition_pdf(f_path, strategy, images, output_dir_path)
|
||||
is_image = document_type == "image"
|
||||
if is_image:
|
||||
with Image.open(f_path) as img:
|
||||
img = img.convert("RGB")
|
||||
images = [img]
|
||||
else:
|
||||
images = pdf2image.convert_from_path(f_path)
|
||||
|
||||
run_partition_pdf(f_path, strategy, images, output_dir_path, f_basename, is_image)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -74,7 +84,11 @@ if __name__ == "__main__":
|
||||
print("Invalid strategy")
|
||||
sys.exit(1)
|
||||
|
||||
if sys.argv[3] not in ["pdf", "image"]:
|
||||
print("Invalid document type")
|
||||
sys.exit(1)
|
||||
|
||||
output_basedir_path = os.path.join(CUR_DIR, "output")
|
||||
os.makedirs(output_basedir_path, exist_ok=True)
|
||||
|
||||
run(f_path=sys.argv[1], strategy=sys.argv[2])
|
||||
run(f_path=sys.argv[1], strategy=sys.argv[2], document_type=sys.argv[3])
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.11.6-dev1" # pragma: no cover
|
||||
__version__ = "0.11.6-dev2" # pragma: no cover
|
||||
|
@ -74,9 +74,13 @@ from unstructured.partition.lang import (
|
||||
prepare_languages_for_tesseract,
|
||||
)
|
||||
from unstructured.partition.pdf_image.pdf_image_utils import (
|
||||
annotate_layout_elements,
|
||||
check_element_types_to_extract,
|
||||
save_elements,
|
||||
)
|
||||
from unstructured.partition.pdf_image.pdfminer_processing import (
|
||||
merge_inferred_with_extracted_layout,
|
||||
)
|
||||
from unstructured.partition.pdf_image.pdfminer_utils import (
|
||||
open_pdfminer_pages_generator,
|
||||
rect_to_bbox,
|
||||
@ -247,6 +251,8 @@ def _partition_pdf_or_image_local(
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
pdf_image_dpi: Optional[int] = None,
|
||||
analysis: bool = False,
|
||||
analyzed_image_output_dir_path: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partition using package installed locally"""
|
||||
@ -286,14 +292,27 @@ def _partition_pdf_or_image_local(
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
)
|
||||
|
||||
if pdf_text_extractable is True:
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = process_file_with_pdfminer(
|
||||
inferred_document_layout,
|
||||
filename,
|
||||
extracted_layout = (
|
||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
||||
if pdf_text_extractable
|
||||
else []
|
||||
)
|
||||
|
||||
if analysis:
|
||||
annotate_layout_elements(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
filename=filename,
|
||||
output_dir_path=analyzed_image_output_dir_path,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
is_image=is_image,
|
||||
)
|
||||
else:
|
||||
merged_document_layout = inferred_document_layout
|
||||
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
if model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
@ -317,14 +336,16 @@ def _partition_pdf_or_image_local(
|
||||
)
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
if pdf_text_extractable is True:
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = process_data_with_pdfminer(
|
||||
inferred_document_layout,
|
||||
file,
|
||||
)
|
||||
else:
|
||||
merged_document_layout = inferred_document_layout
|
||||
|
||||
extracted_layout = (
|
||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
|
||||
)
|
||||
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
)
|
||||
|
||||
if model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
@ -655,7 +676,7 @@ def _process_pdfminer_pages(
|
||||
urls_metadata.append(map_bbox_and_index(words, annot))
|
||||
|
||||
if hasattr(obj, "get_text"):
|
||||
_text_snippets: List[str | Any] = [obj.get_text()] # type: ignore
|
||||
_text_snippets: List = [obj.get_text()]
|
||||
else:
|
||||
_text = _extract_text(obj)
|
||||
_text_snippets = re.split(PARAGRAPH_PATTERN, _text)
|
||||
|
@ -13,6 +13,8 @@ from unstructured.logger import logger
|
||||
from unstructured.partition.common import convert_to_bytes
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
|
||||
|
||||
@ -159,3 +161,118 @@ def valid_text(text: str) -> bool:
|
||||
if not text:
|
||||
return False
|
||||
return "(cid:" not in text
|
||||
|
||||
|
||||
def annotate_layout_elements_with_image(
    inferred_page_layout: "PageLayout",
    extracted_page_layout: Optional["PageLayout"],
    output_dir_path: str,
    output_f_basename: str,
    page_number: int,
):
    """Write annotated images of a single page for its inferred and extracted layouts.

    The inferred layout is always annotated (in blue). The extracted layout is annotated
    (in green) only when `extracted_page_layout` is truthy, so either one or two images are
    produced per page. Each image is rendered by the layout's own `annotate` method and
    written to `output_dir_path` as `<output_f_basename>_<page_number>_<label>.jpg`, where
    `<label>` is `inferred` or `extracted`. The path of every written image is printed.

    Args:
        inferred_page_layout: Page layout holding the inferred elements.
        extracted_page_layout: Page layout holding the extracted elements, or None to skip it.
        output_dir_path: Directory the annotated images are written to.
        output_f_basename: File-name stem used for the output images.
        page_number: Page number embedded in the output file names.
    """

    # (label, layout, box color) triples, in the order the images are written.
    layouts_to_draw = [("inferred", inferred_page_layout, "blue")]
    if extracted_page_layout:
        layouts_to_draw.append(("extracted", extracted_page_layout, "green"))

    for label, page_layout, color in layouts_to_draw:
        annotated_img = page_layout.annotate(colors=color)
        output_f_path = os.path.join(
            output_dir_path, f"{output_f_basename}_{page_number}_{label}.jpg"
        )
        write_image(annotated_img, output_f_path)
        print(f"output_image_path: {output_f_path}")
|
||||
|
||||
|
||||
def annotate_layout_elements(
    inferred_document_layout: "DocumentLayout",
    extracted_layout: List[List["TextRegion"]],
    filename: str,
    output_dir_path: Optional[str],
    pdf_image_dpi: int,
    is_image: bool = False,
) -> None:
    """Annotate inferred and extracted layout elements on the page images of a document.

    For an image file, the single image is opened, converted to RGB and annotated as page 1.
    For a PDF, every page is rendered to an image in a temporary directory with `pdf2image`
    at `pdf_image_dpi`, and each rendered page is annotated in turn.

    Args:
        inferred_document_layout: Document layout whose `pages` hold the inferred elements.
            Each annotated page gets its `image` attribute set to the page image.
        extracted_layout: Extracted regions, one list per page, indexed in page order. When
            empty, only the inferred layout is annotated.
        filename: Path of the PDF or image file to annotate.
        output_dir_path: Directory the annotated images are written to. When None, the
            current working directory is used.
        pdf_image_dpi: Resolution used when rendering PDF pages to images.
        is_image: True when `filename` is an image rather than a PDF.

    Raises:
        FileNotFoundError: If processing fails and `filename` is neither an existing file
            nor an existing directory.
        Exception: Any other processing error is re-raised unchanged.
    """

    if output_dir_path is None:
        # Callers may forward an unset optional directory; joining a path onto None would
        # otherwise fail with an opaque TypeError.
        output_dir_path = os.getcwd()

    output_f_basename = os.path.splitext(os.path.basename(filename))[0]

    try:
        if is_image:
            with Image.open(filename) as img:
                img = img.convert("RGB")
                _annotate_page_image(
                    img=img,
                    page_index=0,
                    inferred_document_layout=inferred_document_layout,
                    extracted_layout=extracted_layout,
                    output_dir_path=output_dir_path,
                    output_f_basename=output_f_basename,
                )
        else:
            with tempfile.TemporaryDirectory() as temp_dir:
                _image_paths = pdf2image.convert_from_path(
                    filename,
                    dpi=pdf_image_dpi,
                    output_folder=temp_dir,
                    paths_only=True,
                )
                image_paths = cast(List[str], _image_paths)
                for page_index, image_path in enumerate(image_paths):
                    with Image.open(image_path) as img:
                        _annotate_page_image(
                            img=img,
                            page_index=page_index,
                            inferred_document_layout=inferred_document_layout,
                            extracted_layout=extracted_layout,
                            output_dir_path=output_dir_path,
                            output_f_basename=output_f_basename,
                        )
    except Exception as e:
        if os.path.isdir(filename) or os.path.isfile(filename):
            # The path exists, so the failure is not a missing file: propagate it as is.
            raise
        raise FileNotFoundError(f'File "{filename}" not found!') from e


def _annotate_page_image(
    img: "Image.Image",
    page_index: int,
    inferred_document_layout: "DocumentLayout",
    extracted_layout: List[List["TextRegion"]],
    output_dir_path: str,
    output_f_basename: str,
) -> None:
    """Annotate the inferred and extracted layouts of one page (0-based index) on `img`."""

    from unstructured_inference.inference.layout import PageLayout

    page_number = page_index + 1

    extracted_page_layout = None
    if extracted_layout:
        extracted_page_layout = PageLayout(
            number=page_number,
            image=img,
        )
        extracted_page_layout.elements = extracted_layout[page_index]

    inferred_page_layout = inferred_document_layout.pages[page_index]
    # NOTE(review): the inferred page keeps this reference after the caller closes the
    # image; confirm that nothing downstream reads `image` from it afterwards.
    inferred_page_layout.image = img

    annotate_layout_elements_with_image(
        inferred_page_layout=inferred_page_layout,
        extracted_page_layout=extracted_page_layout,
        output_dir_path=output_dir_path,
        output_f_basename=output_f_basename,
        page_number=page_number,
    )
|
||||
|
@ -7,7 +7,7 @@ from unstructured_inference.inference.elements import (
|
||||
TextRegion,
|
||||
)
|
||||
from unstructured_inference.inference.layoutelement import (
|
||||
merge_inferred_layout_with_extracted_layout,
|
||||
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
|
||||
)
|
||||
from unstructured_inference.inference.ordering import order_layout
|
||||
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
|
||||
@ -25,62 +25,20 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
def process_file_with_pdfminer(
|
||||
inferred_document_layout: "DocumentLayout",
|
||||
filename: str = "",
|
||||
) -> "DocumentLayout":
|
||||
dpi: int = 200,
|
||||
) -> List[List[TextRegion]]:
|
||||
with open_filename(filename, "rb") as fp:
|
||||
fp = cast(BinaryIO, fp)
|
||||
inferred_document_layout = process_data_with_pdfminer(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout = process_data_with_pdfminer(
|
||||
file=fp,
|
||||
dpi=dpi,
|
||||
)
|
||||
return inferred_document_layout
|
||||
return extracted_layout
|
||||
|
||||
|
||||
def process_data_with_pdfminer(
|
||||
inferred_document_layout: "DocumentLayout",
|
||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||
) -> "DocumentLayout":
|
||||
"""Process document data using PDFMiner to extract layout information."""
|
||||
|
||||
extracted_layouts = get_regions_by_pdfminer(file)
|
||||
|
||||
inferred_pages = inferred_document_layout.pages
|
||||
for i, (inferred_page, extracted_layout) in enumerate(zip(inferred_pages, extracted_layouts)):
|
||||
inferred_layout = inferred_page.elements
|
||||
image_metadata = inferred_page.image_metadata
|
||||
w = image_metadata.get("width")
|
||||
h = image_metadata.get("height")
|
||||
image_size = (w, h)
|
||||
|
||||
threshold_kwargs = {}
|
||||
# NOTE(Benjamin): With this the thresholds are only changed for detextron2_mask_rcnn
|
||||
# In other case the default values for the functions are used
|
||||
if (
|
||||
isinstance(inferred_page.detection_model, UnstructuredDetectronONNXModel)
|
||||
and "R_50" not in inferred_page.detection_model.model_path
|
||||
):
|
||||
threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
|
||||
|
||||
merged_layout = merge_inferred_layout_with_extracted_layout(
|
||||
inferred_layout=inferred_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
page_image_size=image_size,
|
||||
**threshold_kwargs,
|
||||
)
|
||||
|
||||
elements = inferred_page.get_elements_from_layout(
|
||||
layout=cast(List[TextRegion], merged_layout),
|
||||
pdf_objects=extracted_layout,
|
||||
)
|
||||
|
||||
inferred_page.elements[:] = elements
|
||||
|
||||
return inferred_document_layout
|
||||
|
||||
|
||||
def get_regions_by_pdfminer(
|
||||
fp: Optional[Union[bytes, BinaryIO]],
|
||||
dpi: int = 200,
|
||||
) -> List[List[TextRegion]]:
|
||||
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
|
||||
@ -89,7 +47,7 @@ def get_regions_by_pdfminer(
|
||||
layouts = []
|
||||
# Coefficient to rescale bounding box to be compatible with images
|
||||
coef = dpi / 72
|
||||
for page, page_layout in open_pdfminer_pages_generator(fp):
|
||||
for page, page_layout in open_pdfminer_pages_generator(file):
|
||||
height = page_layout.height
|
||||
|
||||
layout: List["TextRegion"] = []
|
||||
@ -129,3 +87,43 @@ def get_regions_by_pdfminer(
|
||||
layouts.append(layout)
|
||||
|
||||
return layouts
|
||||
|
||||
|
||||
def merge_inferred_with_extracted_layout(
    inferred_document_layout: "DocumentLayout",
    extracted_layout: List[List[TextRegion]],
) -> "DocumentLayout":
    """Merge the extracted regions of each page into the inferred document layout.

    Pages of `inferred_document_layout` are paired positionally with the entries of
    `extracted_layout`; pairing stops at the shorter of the two. For every pair, the page's
    inferred elements are merged with the extracted regions, and the page's element list is
    replaced in place with the elements built from the merged layout.

    Args:
        inferred_document_layout: Document layout produced by the detection model.
        extracted_layout: Extracted regions, one list per page.

    Returns:
        The same `inferred_document_layout` object, with its pages updated in place.
    """
    for page, page_regions in zip(inferred_document_layout.pages, extracted_layout):
        inferred_elements = page.elements
        metadata = page.image_metadata
        page_image_size = (metadata.get("width"), metadata.get("height"))

        # NOTE(Benjamin): With this the thresholds are only changed for detectron2_mask_rcnn
        # In other case the default values for the functions are used
        detection_model = page.detection_model
        use_custom_thresholds = (
            isinstance(detection_model, UnstructuredDetectronONNXModel)
            and "R_50" not in detection_model.model_path
        )
        threshold_kwargs = (
            {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
            if use_custom_thresholds
            else {}
        )

        merged_regions = merge_inferred_with_extracted_page(
            inferred_layout=inferred_elements,
            extracted_layout=page_regions,
            page_image_size=page_image_size,
            **threshold_kwargs,
        )

        # Slice assignment keeps the page's existing list object and swaps its contents.
        page.elements[:] = page.get_elements_from_layout(
            layout=cast(List[TextRegion], merged_regions),
            pdf_objects=page_regions,
        )

    return inferred_document_layout
|
||||
|
Loading…
x
Reference in New Issue
Block a user