Feat/remove reference of PageLayout.elements (#3943)

This PR removes usage of `PageLayout.elements` from partition function,
except for when `analysis=True`. This PR updates the partition logic so
that `PageLayout.elements_array` is used everywhere to save memory and
cpu cost.
Since the analysis function is intended for investigation and not for
general document processing purposes, this part of the code is left for
a future refactor.

`PageLayout.elements` uses a list to store layout elements' data while
`elements_array` uses `numpy` array to store the data, which has much
lower memory requirements. Measured with `memory_profiler`, the
difference in memory usage is usually around 10x.
This commit is contained in:
Yao You 2025-03-12 10:21:21 -05:00 committed by GitHub
parent 8759b0aac9
commit 2dceac34b5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 41 additions and 37 deletions

View File

@ -1,10 +1,13 @@
## 0.16.26-dev3
## 0.17.0
### Enhancements
- **Add support for images in html partitioner** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload`=True then the `image_base64` will be included for images that specify the base64 data (rather than url) as the source.
- **Use kwargs instead of env to specify `ocr_agent` and `table_ocr_agent`** for `hi_res` strategy.
- **Stop using `PageLayout.elements` to save memory and CPU cost**. The partition logic now uses only `PageLayout.elements_array` throughout, except when `analysis=True`, where the drawing logic still uses `elements`.
### Features
### Fixes
@ -28,6 +31,7 @@
in unstructured and `register_partitioner` to enable registering your own partitioner for any file type.
- **`extract_image_block_types` now also works for CamelCase element type names**. Previously `NarrativeText` and similar CamelCase element types couldn't be extracted using the mentioned parameter in `partition`. Now figures for those elements can be extracted like `Image` and `Table` elements
- **use block matrix to reduce peak memory usage for pdf/image partition**.
### Features

View File

@ -20,3 +20,5 @@ botocore<1.34.132
importlib-metadata>=8.5.0
# (austin): Versions below this have a different interface for passing parameters
unstructured-client>=0.23.0,<0.26.0
# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file
protobuf>=6.30.0

View File

@ -11,5 +11,5 @@ google-cloud-vision
effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference>=0.8.7
unstructured-inference>=0.8.9
unstructured.pytesseract>=0.3.12

View File

@ -1479,8 +1479,7 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
# can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes.
layout_elem_absent_coordinates = MockSinglePageDocumentLayout()
for page in layout_elem_absent_coordinates.pages:
for el in page.elements:
el.bbox = None
page.elements_array.element_coords[:, :] = None
elements = pdf.document_to_element_list(layout_elem_absent_coordinates)
assert elements[0].metadata.coordinates is None

View File

@ -12,6 +12,7 @@ from unstructured_inference.inference.elements import (
TextRegions,
)
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElements
from test_unstructured.unit_utils import example_doc_path
from unstructured.partition.auto import partition
@ -108,7 +109,7 @@ def test_valid_bbox(bbox, is_valid):
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
# create a sample document with pdfminer elements inside tables
page = PageLayout(number=1, image=Image.new("1", (1, 1)))
page.elements = elements
page.elements_array = LayoutElements.from_list(elements)
document_with_table = DocumentLayout(pages=[page])
document = document_with_table
@ -116,7 +117,7 @@ def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_doc
cleaned_doc = clean_pdfminer_inner_elements(document)
# check that the pdfminer elements were stored in the extra_info dictionary
assert len(cleaned_doc.pages[0].elements) == expected_document_length
assert len(cleaned_doc.pages[0].elements_array) == expected_document_length
elements_with_duplicate_images = [

View File

@ -1 +1 @@
__version__ = "0.16.26-dev3" # pragma: no cover
__version__ = "0.17.0" # pragma: no cover

View File

@ -766,10 +766,6 @@ def _partition_pdf_or_image_local(
# vectorization of the data structure ends here
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
for page in final_document_layout.pages:
for el in page.elements:
el.text = el.text or ""
elements = document_to_element_list(
final_document_layout,
sortable=True,
@ -1199,11 +1195,24 @@ def document_to_element_list(
else None
)
for layout_element in page.elements:
head_line_type_class_ids = [
idx
for idx, class_type in page.elements_array.element_class_id_map.items()
if class_type in ("Headline", "Subheadline")
]
if head_line_type_class_ids:
has_headline = any(
np.any(page.elements_array.element_class_ids == idx)
for idx in head_line_type_class_ids
)
else:
has_headline = False
for layout_element in page.elements_array.iter_elements():
if (
image_width
and image_height
and getattr(layout_element.bbox, "x1") not in (None, np.nan)
and not np.isnan(getattr(layout_element.bbox, "x1", np.nan))
):
coordinate_system = PixelSpace(width=image_width, height=image_height)
else:
@ -1234,8 +1243,8 @@ def document_to_element_list(
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
if (isinstance(element, Title) and element.metadata.category_depth is None) and (
has_headline
):
element.metadata.category_depth = 0

View File

@ -281,7 +281,6 @@ def supplement_page_layout_with_ocr(
ocr_agent=_table_ocr_agent,
extracted_regions=extracted_regions,
)
page_layout.elements = page_layout.elements_array.as_list()
return page_layout

View File

@ -657,8 +657,6 @@ def merge_inferred_with_extracted_layout(
merged_layout.texts[i] = remove_control_characters(text)
inferred_page.elements_array = merged_layout
# NOTE: once we drop reference to elements we can remove this step below
inferred_page.elements[:] = merged_layout.as_list()
return inferred_document_layout
@ -670,34 +668,26 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
"""
for page in document.pages:
non_pdfminer_element_boxes = [e.bbox for e in page.elements if e.source != Source.PDFMINER]
element_boxes = []
element_to_subregion_map = {}
subregion_indice = 0
for i, element in enumerate(page.elements):
if element.source != Source.PDFMINER:
continue
element_boxes.append(element.bbox)
element_to_subregion_map[i] = subregion_indice
subregion_indice += 1
pdfminer_mask = page.elements_array.sources == Source.PDFMINER
non_pdfminer_element_boxes = page.elements_array.slice(~pdfminer_mask).element_coords
pdfminer_element_boxes = page.elements_array.slice(pdfminer_mask).element_coords
if len(pdfminer_element_boxes) == 0 or len(non_pdfminer_element_boxes) == 0:
continue
is_element_subregion_of_other_elements = (
bboxes1_is_almost_subregion_of_bboxes2(
element_boxes,
pdfminer_element_boxes,
non_pdfminer_element_boxes,
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
).sum(axis=1)
== 1
)
page.elements = [
e
for i, e in enumerate(page.elements)
if (
(i not in element_to_subregion_map)
or not is_element_subregion_of_other_elements[element_to_subregion_map[i]]
)
]
pdfminer_to_keep = np.where(pdfminer_mask)[0][~is_element_subregion_of_other_elements]
page.elements_array = page.elements_array.slice(
np.sort(np.concatenate((np.where(~pdfminer_mask)[0], pdfminer_to_keep)))
)
return document