From c2c3f545a514f6cdb517e96a1efc37ff86042cb0 Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 21 Mar 2025 16:55:21 -0500 Subject: [PATCH] WIP --- unstructured/partition/pdf.py | 11 +- .../pdf_image/pdfminer_processing.py | 123 +++++++++++++++++- 2 files changed, 126 insertions(+), 8 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index e0e64854d..b35f10379 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -622,8 +622,7 @@ def _partition_pdf_or_image_local( from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr from unstructured.partition.pdf_image.pdfminer_processing import ( - process_data_with_pdfminer, - process_file_with_pdfminer, + process_file_with_pdfplumber, ) if not is_image: @@ -652,8 +651,8 @@ def _partition_pdf_or_image_local( ) extracted_layout, layouts_links = ( - process_file_with_pdfminer( - filename=filename, + process_file_with_pdfplumber( + filename, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config, @@ -714,8 +713,8 @@ def _partition_pdf_or_image_local( file.seek(0) extracted_layout, layouts_links = ( - process_data_with_pdfminer( - file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config + process_file_with_pdfplumber( + file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config ) if pdf_text_extractable else ([], []) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index fc7428c57..68ee89813 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -4,6 +4,7 @@ import os from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast import numpy as np +import pdfplumber from pdfminer.layout import LTChar, LTTextBox from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename @@ -371,6 +372,57 @@ def array_merge_inferred_layout_with_extracted_layout( return final_layout +@requires_dependencies("unstructured_inference") +def process_page_from_pdfplumber( + page: pdfplumber.page.Page, page_number: int, coord_coef: float = 1.0 +) -> tuple[LayoutElements, list]: + from unstructured_inference.inference.layoutelement import LayoutElements + + urls_metadata = [] + element_coords, texts, element_class = [], [], [] + annotation_list = page.annots + + def _get_bbox(obj): + return (obj["x0"], obj["top"], obj["x1"], obj["bottom"]) + + for text in page.extract_words(return_chars=False, y_tolerance=3 * coord_coef): + bbox = _get_bbox(text) + element_coords.append(bbox) + texts.append(text["text"]) + element_class.append(0) + + if len(annotation_list) > 0: + annotations_within_element = check_annotations_within_element( + annotation_list, + bbox, + page_number, + env_config.PDF_ANNOTATION_THRESHOLD, + ) + for annot in annotations_within_element: + urls_metadata.append(map_bbox_and_index(texts["text"], annot)) + + for img in page.images: + bbox = _get_bbox(img) + + if not _validate_bbox(bbox): + continue + + texts.append(None) + element_coords.append(bbox) + element_class.append(1) + + return ( + LayoutElements( + element_coords=coord_coef * np.array(element_coords), + texts=np.array(texts).astype(object), + element_class_ids=np.array(element_class), + element_class_id_map={0: ElementType.UNCATEGORIZED_TEXT, 1: ElementType.IMAGE}, + sources=np.array([Source.PDFMINER] * len(element_class)), + ), + urls_metadata, + ) + + @requires_dependencies("unstructured_inference") def process_page_layout_from_pdfminer( annotation_list: list, @@ -502,6 +554,72 @@ def process_data_with_pdfminer( return layouts, layouts_links +@requires_dependencies("unstructured_inference") +def process_file_with_pdfplumber( + file: Union[str, bytes, BinaryIO] = None, + dpi: int = 200, + password: Optional[str] = None, + pdfminer_config: Optional[PDFMinerConfig] = None, +) -> tuple[List[LayoutElements], List[List]]: + """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the + pdf pages using pdf2image""" + + from unstructured_inference.inference.layoutelement import LayoutElements + + layouts = [] + layouts_links = [] + # Coefficient to rescale bounding box to be compatible with images + coef = dpi / 72 + pdf_file = pdfplumber.open(file, password=password) + for page_number, page in enumerate(pdf_file.pages): + width, height = page.width, page.height + + coordinate_system = PixelSpace( + width=width, + height=height, + ) + + layout, urls_metadata = process_page_from_pdfplumber(page, page_number, coef) + + links = [ + { + "bbox": [metadata[x] * coef for x in ["x0", "x1", "y0", "y1"]], + "text": metadata["text"], + "url": metadata["uri"], + "start_index": metadata["start_index"], + } + for metadata in urls_metadata + ] + + clean_layouts = [] + for threshold, element_class in zip( + ( + env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD, + env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD, + ), + (0, 1), + ): + elements_to_sort = layout.slice(layout.element_class_ids == element_class) + clean_layouts.append( + remove_duplicate_elements(elements_to_sort, threshold) + if len(elements_to_sort) + else elements_to_sort + ) + + layout = LayoutElements.concatenate(clean_layouts) + # NOTE(christine): always do the basic sort first for deterministic order across + # python versions. + layout = sort_text_regions(layout, SORT_MODE_BASIC) + + # apply the current default sorting to the layout elements extracted by pdfminer + layout = sort_text_regions(layout) + + layouts.append(layout) + layouts_links.append(links) + pdf_file.close() + return layouts, layouts_links + + def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class): """Creates a text region of the specified class with scaled coordinates.""" return region_class.from_coords( @@ -893,9 +1011,10 @@ def check_annotations_within_element( annotations_within_element = [] for annotation in annotation_list: if annotation["page_number"] == page_number: - annotation_bbox_size = calculate_bbox_area(annotation["bbox"]) + bbox = annotation.get("bbox", [annotation.get(x) for x in ["x0", "y0", "x1", "y1"]]) + annotation_bbox_size = calculate_bbox_area(bbox) if annotation_bbox_size and ( - calculate_intersection_area(element_bbox, annotation["bbox"]) / annotation_bbox_size + calculate_intersection_area(element_bbox, bbox) / annotation_bbox_size > annotation_threshold ): annotations_within_element.append(annotation)