mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 12:21:30 +00:00
Feat/remove reference of PageLayout.elements (#3943)
This PR removes usage of `PageLayout.elements` from the partition functions, except when `analysis=True`. The partition logic now uses `PageLayout.elements_array` everywhere to save memory and CPU cost. Since the analysis function is intended for investigation and not for general document processing, that part of the code is left for a future refactor. `PageLayout.elements` stores layout elements' data in a list, while `elements_array` stores it in `numpy` arrays, which have much lower memory requirements. Measured with `memory_profiler`, the difference in memory usage is usually around 10x.
This commit is contained in:
parent
8759b0aac9
commit
2dceac34b5
@ -1,10 +1,13 @@
|
||||
## 0.16.26-dev3
|
||||
## 0.17.0
|
||||
|
||||
### Enhancements
|
||||
|
||||
- **Add support for images in html partitioner** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload`=True then the `image_base64` will be included for images that specify the base64 data (rather than url) as the source.
|
||||
|
||||
- **Use kwargs instead of env to specify `ocr_agent` and `table_ocr_agent`** for `hi_res` strategy.
|
||||
|
||||
- **Stop using `PageLayout.elements` to save memory and CPU cost**. The partition now uses only `PageLayout.elements_array` throughout, except when `analysis=True`, where the drawing logic still uses `elements`.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
@ -28,6 +31,7 @@
|
||||
in unstructured and `register_partitioner` to enable registering your own partitioner for any file type.
|
||||
|
||||
- **`extract_image_block_types` now also works for CamelCase element type names**. Previously, `NarrativeText` and similar CamelCase element types could not be extracted using this parameter in `partition`. Now figures for those elements can be extracted like `Image` and `Table` elements.
|
||||
|
||||
- **use block matrix to reduce peak memory usage for pdf/image partition**.
|
||||
|
||||
### Features
|
||||
|
||||
@ -20,3 +20,5 @@ botocore<1.34.132
|
||||
importlib-metadata>=8.5.0
|
||||
# (austin): Versions below this have a different interface for passing parameters
|
||||
unstructured-client>=0.23.0,<0.26.0
|
||||
# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file
|
||||
protobuf>=6.30.0
|
||||
|
||||
@ -11,5 +11,5 @@ google-cloud-vision
|
||||
effdet
|
||||
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
|
||||
# when unstructured library is.
|
||||
unstructured-inference>=0.8.7
|
||||
unstructured-inference>=0.8.9
|
||||
unstructured.pytesseract>=0.3.12
|
||||
|
||||
@ -1479,8 +1479,7 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
|
||||
# can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes.
|
||||
layout_elem_absent_coordinates = MockSinglePageDocumentLayout()
|
||||
for page in layout_elem_absent_coordinates.pages:
|
||||
for el in page.elements:
|
||||
el.bbox = None
|
||||
page.elements_array.element_coords[:, :] = None
|
||||
elements = pdf.document_to_element_list(layout_elem_absent_coordinates)
|
||||
assert elements[0].metadata.coordinates is None
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ from unstructured_inference.inference.elements import (
|
||||
TextRegions,
|
||||
)
|
||||
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
|
||||
from unstructured_inference.inference.layoutelement import LayoutElements
|
||||
|
||||
from test_unstructured.unit_utils import example_doc_path
|
||||
from unstructured.partition.auto import partition
|
||||
@ -108,7 +109,7 @@ def test_valid_bbox(bbox, is_valid):
|
||||
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
|
||||
# create a sample document with pdfminer elements inside tables
|
||||
page = PageLayout(number=1, image=Image.new("1", (1, 1)))
|
||||
page.elements = elements
|
||||
page.elements_array = LayoutElements.from_list(elements)
|
||||
document_with_table = DocumentLayout(pages=[page])
|
||||
document = document_with_table
|
||||
|
||||
@ -116,7 +117,7 @@ def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_doc
|
||||
cleaned_doc = clean_pdfminer_inner_elements(document)
|
||||
|
||||
# check that the pdfminer elements were stored in the extra_info dictionary
|
||||
assert len(cleaned_doc.pages[0].elements) == expected_document_length
|
||||
assert len(cleaned_doc.pages[0].elements_array) == expected_document_length
|
||||
|
||||
|
||||
elements_with_duplicate_images = [
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.26-dev3" # pragma: no cover
|
||||
__version__ = "0.17.0" # pragma: no cover
|
||||
|
||||
@ -766,10 +766,6 @@ def _partition_pdf_or_image_local(
|
||||
# vectorization of the data structure ends here
|
||||
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
|
||||
|
||||
for page in final_document_layout.pages:
|
||||
for el in page.elements:
|
||||
el.text = el.text or ""
|
||||
|
||||
elements = document_to_element_list(
|
||||
final_document_layout,
|
||||
sortable=True,
|
||||
@ -1199,11 +1195,24 @@ def document_to_element_list(
|
||||
else None
|
||||
)
|
||||
|
||||
for layout_element in page.elements:
|
||||
head_line_type_class_ids = [
|
||||
idx
|
||||
for idx, class_type in page.elements_array.element_class_id_map.items()
|
||||
if class_type in ("Headline", "Subheadline")
|
||||
]
|
||||
if head_line_type_class_ids:
|
||||
has_headline = any(
|
||||
np.any(page.elements_array.element_class_ids == idx)
|
||||
for idx in head_line_type_class_ids
|
||||
)
|
||||
else:
|
||||
has_headline = False
|
||||
|
||||
for layout_element in page.elements_array.iter_elements():
|
||||
if (
|
||||
image_width
|
||||
and image_height
|
||||
and getattr(layout_element.bbox, "x1") not in (None, np.nan)
|
||||
and not np.isnan(getattr(layout_element.bbox, "x1", np.nan))
|
||||
):
|
||||
coordinate_system = PixelSpace(width=image_width, height=image_height)
|
||||
else:
|
||||
@ -1234,8 +1243,8 @@ def document_to_element_list(
|
||||
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
|
||||
element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
|
||||
|
||||
if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
|
||||
getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
|
||||
if (isinstance(element, Title) and element.metadata.category_depth is None) and (
|
||||
has_headline
|
||||
):
|
||||
element.metadata.category_depth = 0
|
||||
|
||||
|
||||
@ -281,7 +281,6 @@ def supplement_page_layout_with_ocr(
|
||||
ocr_agent=_table_ocr_agent,
|
||||
extracted_regions=extracted_regions,
|
||||
)
|
||||
page_layout.elements = page_layout.elements_array.as_list()
|
||||
|
||||
return page_layout
|
||||
|
||||
|
||||
@ -657,8 +657,6 @@ def merge_inferred_with_extracted_layout(
|
||||
merged_layout.texts[i] = remove_control_characters(text)
|
||||
|
||||
inferred_page.elements_array = merged_layout
|
||||
# NOTE: once we drop reference to elements we can remove this step below
|
||||
inferred_page.elements[:] = merged_layout.as_list()
|
||||
|
||||
return inferred_document_layout
|
||||
|
||||
@ -670,34 +668,26 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
|
||||
"""
|
||||
|
||||
for page in document.pages:
|
||||
non_pdfminer_element_boxes = [e.bbox for e in page.elements if e.source != Source.PDFMINER]
|
||||
element_boxes = []
|
||||
element_to_subregion_map = {}
|
||||
subregion_indice = 0
|
||||
for i, element in enumerate(page.elements):
|
||||
if element.source != Source.PDFMINER:
|
||||
continue
|
||||
element_boxes.append(element.bbox)
|
||||
element_to_subregion_map[i] = subregion_indice
|
||||
subregion_indice += 1
|
||||
pdfminer_mask = page.elements_array.sources == Source.PDFMINER
|
||||
non_pdfminer_element_boxes = page.elements_array.slice(~pdfminer_mask).element_coords
|
||||
pdfminer_element_boxes = page.elements_array.slice(pdfminer_mask).element_coords
|
||||
|
||||
if len(pdfminer_element_boxes) == 0 or len(non_pdfminer_element_boxes) == 0:
|
||||
continue
|
||||
|
||||
is_element_subregion_of_other_elements = (
|
||||
bboxes1_is_almost_subregion_of_bboxes2(
|
||||
element_boxes,
|
||||
pdfminer_element_boxes,
|
||||
non_pdfminer_element_boxes,
|
||||
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
|
||||
).sum(axis=1)
|
||||
== 1
|
||||
)
|
||||
|
||||
page.elements = [
|
||||
e
|
||||
for i, e in enumerate(page.elements)
|
||||
if (
|
||||
(i not in element_to_subregion_map)
|
||||
or not is_element_subregion_of_other_elements[element_to_subregion_map[i]]
|
||||
)
|
||||
]
|
||||
pdfminer_to_keep = np.where(pdfminer_mask)[0][~is_element_subregion_of_other_elements]
|
||||
page.elements_array = page.elements_array.slice(
|
||||
np.sort(np.concatenate((np.where(~pdfminer_mask)[0], pdfminer_to_keep)))
|
||||
)
|
||||
|
||||
return document
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user