diff --git a/CHANGELOG.md b/CHANGELOG.md index 1908bb1af..0b03b9557 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,13 @@ -## 0.16.26-dev3 +## 0.17.0 ### Enhancements - **Add support for images in html partitioner** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload`=True then the `image_base64` will be included for images that specify the base64 data (rather than url) as the source. + - **Use kwargs instead of env to specify `ocr_agent` and `table_ocr_agent`** for `hi_res` strategy. +- **stop using `PageLayout.elements` to save memory and cpu cost**. Now only use `PageLayout.elements_array` throughout the partition, except when `analysis=True` where the drawing logic still uses `elements`. + ### Features ### Fixes @@ -28,6 +31,7 @@ in unstructured and `register_partitioner` to enable registering your own partitioner for any file type. - **`extract_image_block_types` now also works for CamelCase element type names**. Previously `NarrativeText` and similar CamelCase element types can't be extracted using the mentioned parameter in `partition`. Now figures for those elements can be extracted like `Image` and `Table` elements + - **use block matrix to reduce peak memory usage for pdf/image partition**. 
### Features diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 543a2d8ac..be1d0c40f 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -20,3 +20,5 @@ botocore<1.34.132 importlib-metadata>=8.5.0 # (austin): Versions below this have a different interface for passing parameters unstructured-client>=0.23.0,<0.26.0 +# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file +protobuf>=6.30.0 diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 816388cbe..332ca01b6 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference>=0.8.7 +unstructured-inference>=0.8.9 unstructured.pytesseract>=0.3.12 diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index cea5d9557..6d1145eb8 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1479,8 +1479,7 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): # can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes. 
layout_elem_absent_coordinates = MockSinglePageDocumentLayout() for page in layout_elem_absent_coordinates.pages: - for el in page.elements: - el.bbox = None + page.elements_array.element_coords[:, :] = None elements = pdf.document_to_element_list(layout_elem_absent_coordinates) assert elements[0].metadata.coordinates is None diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 5e4114fce..309ea1336 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -12,6 +12,7 @@ from unstructured_inference.inference.elements import ( TextRegions, ) from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout +from unstructured_inference.inference.layoutelement import LayoutElements from test_unstructured.unit_utils import example_doc_path from unstructured.partition.auto import partition @@ -108,7 +109,7 @@ def test_valid_bbox(bbox, is_valid): def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length): # create a sample document with pdfminer elements inside tables page = PageLayout(number=1, image=Image.new("1", (1, 1))) - page.elements = elements + page.elements_array = LayoutElements.from_list(elements) document_with_table = DocumentLayout(pages=[page]) document = document_with_table @@ -116,7 +117,7 @@ def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_doc cleaned_doc = clean_pdfminer_inner_elements(document) # check that the pdfminer elements were stored in the extra_info dictionary - assert len(cleaned_doc.pages[0].elements) == expected_document_length + assert len(cleaned_doc.pages[0].elements_array) == expected_document_length elements_with_duplicate_images = [ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c3d4d620d..138620c64 100644 --- 
a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.26-dev3" # pragma: no cover +__version__ = "0.17.0" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 11242aa8e..e0e64854d 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -766,10 +766,6 @@ def _partition_pdf_or_image_local( # vectorization of the data structure ends here final_document_layout = clean_pdfminer_inner_elements(final_document_layout) - for page in final_document_layout.pages: - for el in page.elements: - el.text = el.text or "" - elements = document_to_element_list( final_document_layout, sortable=True, @@ -1199,11 +1195,24 @@ def document_to_element_list( else None ) - for layout_element in page.elements: + head_line_type_class_ids = [ + idx + for idx, class_type in page.elements_array.element_class_id_map.items() + if class_type in ("Headline", "Subheadline") + ] + if head_line_type_class_ids: + has_headline = any( + np.any(page.elements_array.element_class_ids == idx) + for idx in head_line_type_class_ids + ) + else: + has_headline = False + + for layout_element in page.elements_array.iter_elements(): if ( image_width and image_height - and getattr(layout_element.bbox, "x1") not in (None, np.nan) + and not np.isnan(getattr(layout_element.bbox, "x1", np.nan)) ): coordinate_system = PixelSpace(width=image_width, height=image_height) else: @@ -1234,8 +1243,8 @@ def document_to_element_list( element.metadata.text_as_html = getattr(layout_element, "text_as_html", None) element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None) - if (isinstance(element, Title) and element.metadata.category_depth is None) and any( - getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements + if (isinstance(element, Title) and element.metadata.category_depth is None) and ( + has_headline ): element.metadata.category_depth = 0 diff --git 
a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 0a627ab7e..359d7e06d 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -281,7 +281,6 @@ def supplement_page_layout_with_ocr( ocr_agent=_table_ocr_agent, extracted_regions=extracted_regions, ) - page_layout.elements = page_layout.elements_array.as_list() return page_layout diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index dcbe103fd..fc7428c57 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -657,8 +657,6 @@ def merge_inferred_with_extracted_layout( merged_layout.texts[i] = remove_control_characters(text) inferred_page.elements_array = merged_layout - # NOTE: once we drop reference to elements we can remove this step below - inferred_page.elements[:] = merged_layout.as_list() return inferred_document_layout @@ -670,34 +668,26 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout """ for page in document.pages: - non_pdfminer_element_boxes = [e.bbox for e in page.elements if e.source != Source.PDFMINER] - element_boxes = [] - element_to_subregion_map = {} - subregion_indice = 0 - for i, element in enumerate(page.elements): - if element.source != Source.PDFMINER: - continue - element_boxes.append(element.bbox) - element_to_subregion_map[i] = subregion_indice - subregion_indice += 1 + pdfminer_mask = page.elements_array.sources == Source.PDFMINER + non_pdfminer_element_boxes = page.elements_array.slice(~pdfminer_mask).element_coords + pdfminer_element_boxes = page.elements_array.slice(pdfminer_mask).element_coords + + if len(pdfminer_element_boxes) == 0 or len(non_pdfminer_element_boxes) == 0: + continue is_element_subregion_of_other_elements = ( bboxes1_is_almost_subregion_of_bboxes2( - element_boxes, + pdfminer_element_boxes, 
non_pdfminer_element_boxes, env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD, ).sum(axis=1) == 1 ) - page.elements = [ - e - for i, e in enumerate(page.elements) - if ( - (i not in element_to_subregion_map) - or not is_element_subregion_of_other_elements[element_to_subregion_map[i]] - ) - ] + pdfminer_to_keep = np.where(pdfminer_mask)[0][~is_element_subregion_of_other_elements] + page.elements_array = page.elements_array.slice( + np.sort(np.concatenate((np.where(~pdfminer_mask)[0], pdfminer_to_keep))) + ) return document