This commit is contained in:
Yao You 2025-03-21 16:55:21 -05:00
parent 0fa5174bd7
commit c2c3f545a5
2 changed files with 126 additions and 8 deletions

View File

@ -622,8 +622,7 @@ def _partition_pdf_or_image_local(
from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
from unstructured.partition.pdf_image.pdfminer_processing import (
process_data_with_pdfminer,
process_file_with_pdfminer,
process_file_with_pdfplumber,
)
if not is_image:
@ -652,8 +651,8 @@ def _partition_pdf_or_image_local(
)
extracted_layout, layouts_links = (
process_file_with_pdfminer(
filename=filename,
process_file_with_pdfplumber(
filename,
dpi=pdf_image_dpi,
password=password,
pdfminer_config=pdfminer_config,
@ -714,8 +713,8 @@ def _partition_pdf_or_image_local(
file.seek(0)
extracted_layout, layouts_links = (
process_data_with_pdfminer(
file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
process_file_with_pdfplumber(
file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
)
if pdf_text_extractable
else ([], [])

View File

@ -4,6 +4,7 @@ import os
from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
import numpy as np
import pdfplumber
from pdfminer.layout import LTChar, LTTextBox
from pdfminer.pdftypes import PDFObjRef
from pdfminer.utils import open_filename
@ -371,6 +372,57 @@ def array_merge_inferred_layout_with_extracted_layout(
return final_layout
@requires_dependencies("unstructured_inference")
def process_page_from_pdfplumber(
    page: pdfplumber.page.Page, page_number: int, coord_coef: float = 1.0
) -> tuple[LayoutElements, list]:
    """Extract word and embedded-image regions from a single pdfplumber page.

    Args:
        page: the pdfplumber page to process.
        page_number: 0-based page index, forwarded to the annotation filter.
        coord_coef: scale factor applied to the PDF-point coordinates (e.g.
            dpi / 72) so the boxes line up with rendered page images.

    Returns:
        A ``(LayoutElements, urls_metadata)`` tuple: the layout holds text
        regions (class 0) followed by image regions (class 1), with
        coordinates already multiplied by ``coord_coef``; ``urls_metadata``
        collects link annotations that overlap a text region.
    """
    from unstructured_inference.inference.layoutelement import LayoutElements

    urls_metadata = []
    element_coords, texts, element_class = [], [], []
    annotation_list = page.annots

    def _get_bbox(obj):
        # pdfplumber uses a top-left origin, so (x0, top, x1, bottom) is the
        # (x1, y1, x2, y2) box in that coordinate system.
        return (obj["x0"], obj["top"], obj["x1"], obj["bottom"])

    for text in page.extract_words(return_chars=False, y_tolerance=3 * coord_coef):
        bbox = _get_bbox(text)
        element_coords.append(bbox)
        texts.append(text["text"])
        element_class.append(0)
        # Truthiness instead of len() > 0: also safe if `annots` is ever None.
        if annotation_list:
            annotations_within_element = check_annotations_within_element(
                annotation_list,
                bbox,
                page_number,
                env_config.PDF_ANNOTATION_THRESHOLD,
            )
            for annot in annotations_within_element:
                # FIX: original passed `texts["text"]`, which indexes the
                # accumulated *list* of strings with a string key and raises
                # TypeError; the intended value is this word's own text.
                urls_metadata.append(map_bbox_and_index(text["text"], annot))

    for img in page.images:
        bbox = _get_bbox(img)
        if not _validate_bbox(bbox):
            continue
        texts.append(None)
        element_coords.append(bbox)
        element_class.append(1)

    return (
        LayoutElements(
            element_coords=coord_coef * np.array(element_coords),
            texts=np.array(texts).astype(object),
            element_class_ids=np.array(element_class),
            element_class_id_map={0: ElementType.UNCATEGORIZED_TEXT, 1: ElementType.IMAGE},
            # NOTE(review): source is tagged PDFMINER even though pdfplumber
            # produced it — presumably kept for downstream compatibility.
            sources=np.array([Source.PDFMINER] * len(element_class)),
        ),
        urls_metadata,
    )
@requires_dependencies("unstructured_inference")
def process_page_layout_from_pdfminer(
annotation_list: list,
@ -502,6 +554,72 @@ def process_data_with_pdfminer(
return layouts, layouts_links
@requires_dependencies("unstructured_inference")
def process_file_with_pdfplumber(
    file: Union[str, bytes, BinaryIO] = None,
    dpi: int = 200,
    password: Optional[str] = None,
    pdfminer_config: Optional[PDFMinerConfig] = None,
) -> tuple[List[LayoutElements], List[List]]:
    """Load per-page word/image layouts and hyperlink metadata via pdfplumber.

    Args:
        file: path, bytes, or binary file object accepted by ``pdfplumber.open``.
        dpi: target rendering resolution; PDF-point coordinates (72/inch) are
            rescaled into this pixel space so they match page images.
        password: password for encrypted PDFs.
        pdfminer_config: unused here; accepted so the signature stays
            interchangeable with the pdfminer code path.

    Returns:
        ``(layouts, layouts_links)`` — one deduplicated, sorted
        ``LayoutElements`` and one list of link dicts per page.
    """
    from unstructured_inference.inference.layoutelement import LayoutElements

    layouts = []
    layouts_links = []
    # Coefficient to rescale bounding boxes from PDF points to image pixels.
    coef = dpi / 72
    # FIX: use a context manager so the PDF handle is closed even when a page
    # raises mid-loop; the original only called .close() on the success path.
    with pdfplumber.open(file, password=password) as pdf_file:
        for page_number, page in enumerate(pdf_file.pages):
            layout, urls_metadata = process_page_from_pdfplumber(page, page_number, coef)
            links = [
                {
                    "bbox": [metadata[x] * coef for x in ["x0", "x1", "y0", "y1"]],
                    "text": metadata["text"],
                    "url": metadata["uri"],
                    "start_index": metadata["start_index"],
                }
                for metadata in urls_metadata
            ]
            # Deduplicate text (class 0) and image (class 1) regions
            # separately, each with its own overlap threshold.
            clean_layouts = []
            for threshold, element_class in zip(
                (
                    env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD,
                    env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD,
                ),
                (0, 1),
            ):
                elements_to_sort = layout.slice(layout.element_class_ids == element_class)
                clean_layouts.append(
                    remove_duplicate_elements(elements_to_sort, threshold)
                    if len(elements_to_sort)
                    else elements_to_sort
                )

            layout = LayoutElements.concatenate(clean_layouts)

            # NOTE(christine): always do the basic sort first for deterministic order
            # across python versions.
            layout = sort_text_regions(layout, SORT_MODE_BASIC)

            # apply the current default sorting to the layout elements extracted by
            # pdfplumber
            layout = sort_text_regions(layout)

            layouts.append(layout)
            layouts_links.append(links)
    return layouts, layouts_links
def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
"""Creates a text region of the specified class with scaled coordinates."""
return region_class.from_coords(
@ -893,9 +1011,10 @@ def check_annotations_within_element(
annotations_within_element = []
for annotation in annotation_list:
if annotation["page_number"] == page_number:
annotation_bbox_size = calculate_bbox_area(annotation["bbox"])
bbox = annotation.get("bbox", [annotation.get(x) for x in ["x0", "y0", "x1", "y1"]])
annotation_bbox_size = calculate_bbox_area(bbox)
if annotation_bbox_size and (
calculate_intersection_area(element_bbox, annotation["bbox"]) / annotation_bbox_size
calculate_intersection_area(element_bbox, bbox) / annotation_bbox_size
> annotation_threshold
):
annotations_within_element.append(annotation)