Feat/remove reference of PageLayout.elements (#3943)

This PR removes usage of `PageLayout.elements` from the partition functions, except when `analysis=True`. The partition logic now uses `PageLayout.elements_array` everywhere to save memory and CPU cost. Since the analysis function is intended for investigation and not for general document processing, that part of the code is left for a future refactor. `PageLayout.elements` stores layout elements' data in a list, while `elements_array` stores it in `numpy` arrays, which have much lower memory requirements. When measured with `memory_profiler`, the difference in memory usage is usually around 10x.
2026-01-06 12:21:30 +00:00 · 2025-03-12 10:21:21 -05:00 · 2025-03-12 10:21:21 -05:00 · 2dceac34b5
commit 2dceac34b5
parent 8759b0aac9
9 changed files with 41 additions and 37 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,10 +1,13 @@
-## 0.16.26-dev3
+## 0.17.0

 ### Enhancements

 - **Add support for images in html partitioner** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload`=True then the `image_base64` will be included for images that specify the base64 data (rather than url) as the source.
+
 - **Use kwargs instead of env to specify `ocr_agent` and `table_ocr_agent`** for `hi_res` strategy.

+- **Stop using `PageLayout.elements` to save memory and CPU cost**. The partition logic now uses only `PageLayout.elements_array`, except when `analysis=True`, where the drawing logic still uses `elements`.
+
 ### Features

 ### Fixes
@ -28,6 +31,7 @@
  in unstructured and `register_partitioner` to enable registering your own partitioner for any file type.

 - **`extract_image_block_types` now also works for CamelCase element type names**. Previously, `NarrativeText` and similar CamelCase element types could not be extracted using the mentioned parameter in `partition`. Now figures for those elements can be extracted like `Image` and `Table` elements.
+
 - **use block matrix to reduce peak memory usage for pdf/image partition**.

 ### Features
--- a/requirements/deps/constraints.txt
+++ b/requirements/deps/constraints.txt
@ -20,3 +20,5 @@ botocore<1.34.132
 importlib-metadata>=8.5.0
 # (austin): Versions below this have a different interface for passing parameters
 unstructured-client>=0.23.0,<0.26.0
+# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file
+protobuf>=6.30.0
--- a/requirements/extra-pdf-image.in
+++ b/requirements/extra-pdf-image.in
@ -11,5 +11,5 @@ google-cloud-vision
 effdet
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference>=0.8.7
+unstructured-inference>=0.8.9
 unstructured.pytesseract>=0.3.12
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -1479,8 +1479,7 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    # can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes.
    layout_elem_absent_coordinates = MockSinglePageDocumentLayout()
    for page in layout_elem_absent_coordinates.pages:
-        for el in page.elements:
-            el.bbox = None
+        page.elements_array.element_coords[:, :] = None
    elements = pdf.document_to_element_list(layout_elem_absent_coordinates)
    assert elements[0].metadata.coordinates is None

--- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@ -12,6 +12,7 @@ from unstructured_inference.inference.elements import (
    TextRegions,
 )
 from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
+from unstructured_inference.inference.layoutelement import LayoutElements

 from test_unstructured.unit_utils import example_doc_path
 from unstructured.partition.auto import partition
@ -108,7 +109,7 @@ def test_valid_bbox(bbox, is_valid):
 def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    # create a sample document with pdfminer elements inside tables
    page = PageLayout(number=1, image=Image.new("1", (1, 1)))
-    page.elements = elements
+    page.elements_array = LayoutElements.from_list(elements)
    document_with_table = DocumentLayout(pages=[page])
    document = document_with_table

@ -116,7 +117,7 @@ def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_doc
    cleaned_doc = clean_pdfminer_inner_elements(document)

    # check that the pdfminer elements were stored in the extra_info dictionary
-    assert len(cleaned_doc.pages[0].elements) == expected_document_length
+    assert len(cleaned_doc.pages[0].elements_array) == expected_document_length


 elements_with_duplicate_images = [
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.26-dev3"  # pragma: no cover
+__version__ = "0.17.0"  # pragma: no cover
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -766,10 +766,6 @@ def _partition_pdf_or_image_local(
    # vectorization of the data structure ends here
    final_document_layout = clean_pdfminer_inner_elements(final_document_layout)

-    for page in final_document_layout.pages:
-        for el in page.elements:
-            el.text = el.text or ""
-
    elements = document_to_element_list(
        final_document_layout,
        sortable=True,
@ -1199,11 +1195,24 @@ def document_to_element_list(
            else None
        )

-        for layout_element in page.elements:
+        head_line_type_class_ids = [
+            idx
+            for idx, class_type in page.elements_array.element_class_id_map.items()
+            if class_type in ("Headline", "Subheadline")
+        ]
+        if head_line_type_class_ids:
+            has_headline = any(
+                np.any(page.elements_array.element_class_ids == idx)
+                for idx in head_line_type_class_ids
+            )
+        else:
+            has_headline = False
+
+        for layout_element in page.elements_array.iter_elements():
            if (
                image_width
                and image_height
-                and getattr(layout_element.bbox, "x1") not in (None, np.nan)
+                and not np.isnan(getattr(layout_element.bbox, "x1", np.nan))
            ):
                coordinate_system = PixelSpace(width=image_width, height=image_height)
            else:
@ -1234,8 +1243,8 @@ def document_to_element_list(
                element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
                element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)

-                if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
-                    getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
+                if (isinstance(element, Title) and element.metadata.category_depth is None) and (
+                    has_headline
                ):
                    element.metadata.category_depth = 0

--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@ -281,7 +281,6 @@ def supplement_page_layout_with_ocr(
            ocr_agent=_table_ocr_agent,
            extracted_regions=extracted_regions,
        )
-    page_layout.elements = page_layout.elements_array.as_list()

    return page_layout

--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@ -657,8 +657,6 @@ def merge_inferred_with_extracted_layout(
            merged_layout.texts[i] = remove_control_characters(text)

        inferred_page.elements_array = merged_layout
-        # NOTE: once we drop reference to elements we can remove this step below
-        inferred_page.elements[:] = merged_layout.as_list()

    return inferred_document_layout

@ -670,34 +668,26 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
    """

    for page in document.pages:
-        non_pdfminer_element_boxes = [e.bbox for e in page.elements if e.source != Source.PDFMINER]
-        element_boxes = []
-        element_to_subregion_map = {}
-        subregion_indice = 0
-        for i, element in enumerate(page.elements):
-            if element.source != Source.PDFMINER:
-                continue
-            element_boxes.append(element.bbox)
-            element_to_subregion_map[i] = subregion_indice
-            subregion_indice += 1
+        pdfminer_mask = page.elements_array.sources == Source.PDFMINER
+        non_pdfminer_element_boxes = page.elements_array.slice(~pdfminer_mask).element_coords
+        pdfminer_element_boxes = page.elements_array.slice(pdfminer_mask).element_coords
+
+        if len(pdfminer_element_boxes) == 0 or len(non_pdfminer_element_boxes) == 0:
+            continue

        is_element_subregion_of_other_elements = (
            bboxes1_is_almost_subregion_of_bboxes2(
-                element_boxes,
+                pdfminer_element_boxes,
                non_pdfminer_element_boxes,
                env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
            ).sum(axis=1)
            == 1
        )

-        page.elements = [
-            e
-            for i, e in enumerate(page.elements)
-            if (
-                (i not in element_to_subregion_map)
-                or not is_element_subregion_of_other_elements[element_to_subregion_map[i]]
-            )
-        ]
+        pdfminer_to_keep = np.where(pdfminer_mask)[0][~is_element_subregion_of_other_elements]
+        page.elements_array = page.elements_array.slice(
+            np.sort(np.concatenate((np.where(~pdfminer_mask)[0], pdfminer_to_keep)))
+        )

    return document