From be88eef06fa8e14c7d0f596af5f1ba1a3bc67829 Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Thu, 19 Sep 2024 07:05:05 -0700
Subject: [PATCH] perf: optimize pdfminer image cleanup process for improved
 performance (#3630)

This PR enhances the `pdfminer` image cleanup process by repositioning
the duplicate image removal step. It optimizes the removal of duplicate
pdfminer images by performing the cleanup before merging elements,
rather than after. This change reduces execution time and improves the
overall processing speed of PDF documents.

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
---
 CHANGELOG.md                                  |  2 +
 .../pdf_image/test_pdfminer_processing.py     |  6 +--
 .../pdf_image/pdfminer_processing.py          | 51 +++++++++----------
 unstructured/partition/utils/config.py        |  2 +-
 4 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ab5534c0..3cb32ab39 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Enhancements
 
+* **Improve `pdfminer` image cleanup process**. Optimized the removal of duplicated pdfminer images by performing the cleanup before merging elements, rather than after. This improvement reduces execution time and enhances overall processing speed of PDF documents.
+
 ### Features
 
 ### Fixes
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
index e01587516..157295418 100644
--- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -11,7 +11,7 @@ from unstructured.partition.pdf_image.pdfminer_processing import (
     boxes_self_iou,
     clean_pdfminer_duplicate_image_elements,
     clean_pdfminer_inner_elements,
-    remove_duplicate_embedded_text,
+    remove_duplicate_elements,
 )
 from unstructured.partition.utils.constants import Source
 
@@ -212,14 +212,14 @@ def test_boxes_self_iou(coords, threshold, expected):
     np.testing.assert_array_equal(boxes_self_iou(bboxes, threshold), expected)
 
 
-def test_remove_duplicate_embedded_text():
+def test_remove_duplicate_elements():
     sample_elements = [
         EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
         EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
         EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
     ]
 
-    result = remove_duplicate_embedded_text(sample_elements)
+    result = remove_duplicate_elements(sample_elements)
 
     # Check that duplicates were removed and only 2 unique elements remain
     assert len(result) == 2
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 549d43180..ca910db07 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -56,37 +56,42 @@ def process_data_with_pdfminer(
     for page, page_layout in open_pdfminer_pages_generator(file):
         height = page_layout.height
 
-        layout: list["TextRegion"] = []
+        text_layout = []
+        image_layout = []
         for obj in page_layout:
             if hasattr(obj, "get_text"):
                 inner_text_objects = extract_text_objects(obj)
                 for inner_obj in inner_text_objects:
                     _text = inner_obj.get_text()
-                    new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(inner_obj.bbox, height)
                     text_region = _create_text_region(
-                        new_x1,
-                        new_y1,
-                        new_x2,
-                        new_y2,
+                        *rect_to_bbox(inner_obj.bbox, height),
                         coef,
                         _text,
                         Source.PDFMINER,
                         EmbeddedTextRegion,
                     )
                     if text_region.bbox is not None and text_region.bbox.area > 0:
-                        layout.append(text_region)
+                        text_layout.append(text_region)
             else:
                 inner_image_objects = extract_image_objects(obj)
                 for img_obj in inner_image_objects:
-                    new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height)
                     text_region = _create_text_region(
-                        new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion
+                        *rect_to_bbox(img_obj.bbox, height),
+                        coef,
+                        None,
+                        Source.PDFMINER,
+                        ImageTextRegion,
                     )
                     if text_region.bbox is not None and text_region.bbox.area > 0:
-                        layout.append(text_region)
-
-        layout = remove_duplicate_embedded_text(layout)
+                        image_layout.append(text_region)
 
+        clean_text_layout = remove_duplicate_elements(
+            text_layout, env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD
+        )
+        clean_image_layout = remove_duplicate_elements(
+            image_layout, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD
+        )
+        layout = [*clean_text_layout, *clean_image_layout]
         # NOTE(christine): always do the basic sort first for deterministic order across
         # python versions.
         layout = sort_text_regions(layout, SORT_MODE_BASIC)
@@ -301,31 +306,21 @@ def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "Docu
 
 
 @requires_dependencies("unstructured_inference")
-def remove_duplicate_embedded_text(elements: list["TextRegion"]) -> list["TextRegion"]:
+def remove_duplicate_elements(
+    elements: list["TextRegion"],
+    threshold: float = 0.5,
+) -> list["TextRegion"]:
     """Removes duplicate text elements extracted by PDFMiner from a document layout."""
-    from unstructured_inference.inference.elements import EmbeddedTextRegion
 
     bboxes = []
-    texts = []
-    bbox_to_iou_mapping = {}
-    current_idx = 0
     for i, element in enumerate(elements):
-        if not isinstance(element, EmbeddedTextRegion):
-            continue
         bboxes.append(element.bbox)
-        texts.append(element.text)
-        bbox_to_iou_mapping[i] = current_idx
-        current_idx += 1
 
-    iou = boxes_self_iou(bboxes, env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD)
+    iou = boxes_self_iou(bboxes, threshold)
 
     filtered_elements = []
     for i, element in enumerate(elements):
-        if not isinstance(element, EmbeddedTextRegion):
-            filtered_elements.append(element)
-            continue
-        this_idx = bbox_to_iou_mapping[i]
-        if iou[this_idx, this_idx + 1 :].any():
+        if iou[i, i + 1 :].any():
             continue
         filtered_elements.append(element)
 
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 965f060b5..7023ff9d3 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -153,7 +153,7 @@ class ENVConfig:
     @property
     def EMBEDDED_TEXT_SAME_REGION_THRESHOLD(self) -> float:
         """threshold to consider the bounding boxes of two embedded images as the same region"""
-        return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.9)
+        return self._get_float("EMBEDDED_TEXT_SAME_REGION_THRESHOLD", 0.9)
 
     @property
     def PDF_ANNOTATION_THRESHOLD(self) -> float: