This commit is contained in:
Yao You 2025-03-21 16:55:21 -05:00
parent 0fa5174bd7
commit c2c3f545a5
2 changed files with 126 additions and 8 deletions

View File

@ -622,8 +622,7 @@ def _partition_pdf_or_image_local(
from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
from unstructured.partition.pdf_image.pdfminer_processing import (
process_data_with_pdfminer,
process_file_with_pdfminer,
process_file_with_pdfplumber,
)
if not is_image:
@ -652,8 +651,8 @@ def _partition_pdf_or_image_local(
)
extracted_layout, layouts_links = (
process_file_with_pdfminer(
filename=filename,
process_file_with_pdfplumber(
filename,
dpi=pdf_image_dpi,
password=password,
pdfminer_config=pdfminer_config,
@ -714,8 +713,8 @@ def _partition_pdf_or_image_local(
file.seek(0)
extracted_layout, layouts_links = (
process_data_with_pdfminer(
file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
process_file_with_pdfplumber(
file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
)
if pdf_text_extractable
else ([], [])

View File

@ -4,6 +4,7 @@ import os
from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
import numpy as np
import pdfplumber
from pdfminer.layout import LTChar, LTTextBox
from pdfminer.pdftypes import PDFObjRef
from pdfminer.utils import open_filename
@ -371,6 +372,57 @@ def array_merge_inferred_layout_with_extracted_layout(
return final_layout
@requires_dependencies("unstructured_inference")
def process_page_from_pdfplumber(
    page: pdfplumber.page.Page, page_number: int, coord_coef: float = 1.0
) -> tuple[LayoutElements, list]:
    """Extract word and embedded-image regions from a single pdfplumber page.

    Args:
        page: the pdfplumber page to process.
        page_number: 0-based page index, forwarded to the annotation filter.
        coord_coef: scale factor applied to the PDF-point coordinates (e.g.
            dpi / 72) so the boxes line up with rendered page images.

    Returns:
        A ``(LayoutElements, urls_metadata)`` tuple: the layout holds text
        regions (class 0) followed by image regions (class 1), with
        coordinates already multiplied by ``coord_coef``; ``urls_metadata``
        collects link annotations that overlap a text region.
    """
    from unstructured_inference.inference.layoutelement import LayoutElements

    urls_metadata = []
    element_coords, texts, element_class = [], [], []
    annotation_list = page.annots

    def _get_bbox(obj):
        # pdfplumber uses a top-left origin, so (x0, top, x1, bottom) is the
        # (x1, y1, x2, y2) box in that coordinate system.
        return (obj["x0"], obj["top"], obj["x1"], obj["bottom"])

    for text in page.extract_words(return_chars=False, y_tolerance=3 * coord_coef):
        bbox = _get_bbox(text)
        element_coords.append(bbox)
        texts.append(text["text"])
        element_class.append(0)
        # Truthiness instead of len() > 0: also safe if `annots` is ever None.
        if annotation_list:
            annotations_within_element = check_annotations_within_element(
                annotation_list,
                bbox,
                page_number,
                env_config.PDF_ANNOTATION_THRESHOLD,
            )
            for annot in annotations_within_element:
                # FIX: original passed `texts["text"]`, which indexes the
                # accumulated *list* of strings with a string key and raises
                # TypeError; the intended value is this word's own text.
                urls_metadata.append(map_bbox_and_index(text["text"], annot))

    for img in page.images:
        bbox = _get_bbox(img)
        if not _validate_bbox(bbox):
            continue
        texts.append(None)
        element_coords.append(bbox)
        element_class.append(1)

    return (
        LayoutElements(
            element_coords=coord_coef * np.array(element_coords),
            texts=np.array(texts).astype(object),
            element_class_ids=np.array(element_class),
            element_class_id_map={0: ElementType.UNCATEGORIZED_TEXT, 1: ElementType.IMAGE},
            # NOTE(review): source is tagged PDFMINER even though pdfplumber
            # produced it — presumably kept for downstream compatibility.
            sources=np.array([Source.PDFMINER] * len(element_class)),
        ),
        urls_metadata,
    )
@requires_dependencies("unstructured_inference")
def process_page_layout_from_pdfminer(
annotation_list: list,
@ -502,6 +554,72 @@ def process_data_with_pdfminer(
return layouts, layouts_links
@requires_dependencies("unstructured_inference")
def process_file_with_pdfplumber(
    file: Union[str, bytes, BinaryIO] = None,
    dpi: int = 200,
    password: Optional[str] = None,
    pdfminer_config: Optional[PDFMinerConfig] = None,
) -> tuple[List[LayoutElements], List[List]]:
    """Load per-page word/image layouts and hyperlink metadata via pdfplumber.

    Args:
        file: path, bytes, or binary file object accepted by ``pdfplumber.open``.
        dpi: target rendering resolution; PDF-point coordinates (72/inch) are
            rescaled into this pixel space so they match page images.
        password: password for encrypted PDFs.
        pdfminer_config: unused here; accepted so the signature stays
            interchangeable with the pdfminer code path.

    Returns:
        ``(layouts, layouts_links)`` — one deduplicated, sorted
        ``LayoutElements`` and one list of link dicts per page.
    """
    from unstructured_inference.inference.layoutelement import LayoutElements

    layouts = []
    layouts_links = []
    # Coefficient to rescale bounding boxes from PDF points to image pixels.
    coef = dpi / 72
    # FIX: use a context manager so the PDF handle is closed even when a page
    # raises mid-loop; the original only called .close() on the success path.
    with pdfplumber.open(file, password=password) as pdf_file:
        for page_number, page in enumerate(pdf_file.pages):
            layout, urls_metadata = process_page_from_pdfplumber(page, page_number, coef)
            links = [
                {
                    "bbox": [metadata[x] * coef for x in ["x0", "x1", "y0", "y1"]],
                    "text": metadata["text"],
                    "url": metadata["uri"],
                    "start_index": metadata["start_index"],
                }
                for metadata in urls_metadata
            ]
            # Deduplicate text (class 0) and image (class 1) regions
            # separately, each with its own overlap threshold.
            clean_layouts = []
            for threshold, element_class in zip(
                (
                    env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD,
                    env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD,
                ),
                (0, 1),
            ):
                elements_to_sort = layout.slice(layout.element_class_ids == element_class)
                clean_layouts.append(
                    remove_duplicate_elements(elements_to_sort, threshold)
                    if len(elements_to_sort)
                    else elements_to_sort
                )

            layout = LayoutElements.concatenate(clean_layouts)

            # NOTE(christine): always do the basic sort first for deterministic order
            # across python versions.
            layout = sort_text_regions(layout, SORT_MODE_BASIC)

            # apply the current default sorting to the layout elements extracted by
            # pdfplumber
            layout = sort_text_regions(layout)

            layouts.append(layout)
            layouts_links.append(links)
    return layouts, layouts_links
def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
"""Creates a text region of the specified class with scaled coordinates."""
return region_class.from_coords(
@ -893,9 +1011,10 @@ def check_annotations_within_element(
annotations_within_element = []
for annotation in annotation_list:
if annotation["page_number"] == page_number:
annotation_bbox_size = calculate_bbox_area(annotation["bbox"])
bbox = annotation.get("bbox", [annotation.get(x) for x in ["x0", "y0", "x1", "y1"]])
annotation_bbox_size = calculate_bbox_area(bbox)
if annotation_bbox_size and (
calculate_intersection_area(element_bbox, annotation["bbox"]) / annotation_bbox_size
calculate_intersection_area(element_bbox, bbox) / annotation_bbox_size
> annotation_threshold
):
annotations_within_element.append(annotation)