diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b818f167..965de73a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Features +* **Functionality to catch and classify overlapping/nested elements** Adds a method that identifies overlapping bounding-box cases among the elements detected in a document. It returns two values: a boolean indicating whether any overlapping elements are present, and a list reporting each case with relevant metadata. The output includes information about the `overlapping_elements`, `overlapping_case`, `overlap_percentage`, `largest_ngram_percentage`, `overlap_percentage_total`, `max_area`, `min_area`, and `total_area`. +* **Add Local connector source metadata** Python's os module is used to pull stats from the local file when processing via the local connector and populates fields such as last modified time and created time. * **Add Local connector source metadata.** python's os module used to pull stats from local file when processing via the local connector and populates fields such as last modified time, created time. 
# Tests added to test_unstructured/test_utils.py (they follow test_only_raises_if_empty)
# for utils.catch_overlapping_and_nested_bboxes.
import pytest

from unstructured import utils
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, NarrativeText, Title

# Two 3x3 px boxes that share a single 1x1 px corner region.
_TITLE_BOX = ((4, 5), (4, 8), (7, 8), (7, 5))
_TEXT_BOX = ((2, 3), (2, 6), (5, 6), (5, 3))


def _on_page_one(element_cls, text, corners):
    """Build an element on page 1 of a 20x20 px page with the given corner points."""
    return element_cls(
        text=text,
        coordinates=corners,
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )


def _title_and_narrative(title_corners, narrative_corners, narrative_text="Some lovely text"):
    """Return a fresh [Title, NarrativeText] pair; only the narrative text varies."""
    return [
        _on_page_one(Title, "Some lovely title", title_corners),
        _on_page_one(NarrativeText, narrative_text, narrative_corners),
    ]


def _indexed_labels():
    """Labels as reported by the partial-overlap path (they keep the 'N. ' prefix)."""
    return ["0. Title(ix=0)", "1. NarrativeText(ix=1)"]


def _report(
    overlapping_elements,
    overlapping_case,
    overlap_percentage,
    largest_ngram_percentage,
    overlap_percentage_total="5.88%",
    areas=("9pxˆ2", "9pxˆ2", "18pxˆ2"),
):
    """Assemble one expected entry of the overlapping-cases report."""
    max_area, min_area, total_area = areas
    return {
        "overlapping_elements": overlapping_elements,
        "overlapping_case": overlapping_case,
        "overlap_percentage": overlap_percentage,
        "metadata": {
            "largest_ngram_percentage": largest_ngram_percentage,
            "overlap_percentage_total": overlap_percentage_total,
            "max_area": max_area,
            "min_area": min_area,
            "total_area": total_area,
        },
    }


@pytest.mark.parametrize(
    ("elements", "nested_error_tolerance_px", "sm_overlap_threshold", "expectation"),
    [
        # A 5 px tolerance is wide enough to treat the two boxes as nested.
        (
            _title_and_narrative(_TITLE_BOX, _TEXT_BOX),
            5,
            10.0,
            (
                True,
                [
                    _report(
                        ["Title(ix=0)", "NarrativeText(ix=1)"],
                        "nested NarrativeText in Title",
                        "100%",
                        None,
                    ),
                ],
            ),
        ),
        # Same boxes with a 1 px tolerance: partial overlap, half of the bigrams shared.
        (
            _title_and_narrative(_TITLE_BOX, _TEXT_BOX),
            1,
            10.0,
            (
                True,
                [
                    _report(
                        _indexed_labels(),
                        "partial overlap sharing 50.0% of the text from1. "
                        "NarrativeText(2-gram)",
                        "11.11%",
                        50.0,
                    ),
                ],
            ),
        ),
        # Identical text in both elements.
        (
            _title_and_narrative(_TITLE_BOX, _TEXT_BOX, narrative_text="Some lovely title"),
            1,
            10.0,
            (
                True,
                [
                    _report(
                        _indexed_labels(),
                        "partial overlap with duplicate text",
                        "11.11%",
                        None,
                    ),
                ],
            ),
        ),
        # No word in common.
        (
            _title_and_narrative(
                _TITLE_BOX,
                _TEXT_BOX,
                narrative_text="Something totally different here",
            ),
            1,
            10.0,
            (
                True,
                [
                    _report(
                        _indexed_labels(),
                        "partial overlap without sharing text",
                        "11.11%",
                        0,
                    ),
                ],
            ),
        ),
        # Overlap below the small-overlap threshold.
        (
            _title_and_narrative(
                ((5, 6), (5, 10), (8, 10), (8, 6)),
                ((1, 3), (2, 7), (6, 7), (5, 3)),
            ),
            1,
            10.0,
            (
                True,
                [
                    _report(
                        _indexed_labels(),
                        "Small partial overlap",
                        "8.33%",
                        None,
                        overlap_percentage_total="3.23%",
                        areas=("20pxˆ2", "12pxˆ2", "32pxˆ2"),
                    ),
                ],
            ),
        ),
        # Boxes that do not overlap vertically: nothing is reported.
        (
            _title_and_narrative(
                ((4, 6), (4, 7), (7, 7), (7, 6)),
                ((6, 8), (6, 9), (9, 9), (9, 8)),
            ),
            1,
            10.0,
            (False, []),
        ),
    ],
)
def test_catch_overlapping_and_nested_bboxes(
    elements,
    expectation,
    nested_error_tolerance_px,
    sm_overlap_threshold,
):
    expected_flag, expected_cases = expectation
    overlapping_flag, overlapping_cases = utils.catch_overlapping_and_nested_bboxes(
        elements,
        nested_error_tolerance_px,
        sm_overlap_threshold,
    )
    assert overlapping_flag == expected_flag
    assert overlapping_cases == expected_cases
# Additions to unstructured/utils.py (they follow scarf_analytics() in that module):
# helpers that detect and classify overlapping / nested element bounding boxes.
from itertools import combinations
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union

if TYPE_CHECKING:
    # Needed for annotations only; the functions below read elements through
    # `.metadata`, `.category` and `.text` and never call anything on `Text` itself.
    from unstructured.documents.elements import Text


def ngrams(s: str, n: int) -> List:
    """Return every run of `n` consecutive items of `s`, in order, each as a tuple.

    `s` only has to support `len()` and indexing; the callers in this module pass a
    list of words (`text.split()`), so each n-gram is a tuple of words.
    """
    return [tuple(s[i + j] for j in range(n)) for i in range(len(s) - n + 1)]


def calculate_shared_ngram_percentage(
    first_string: str,
    second_string: str,
    n: int,
) -> Tuple[float, Set[Tuple[str, ...]]]:
    """Calculate the percentage of word n-grams of `first_string` also found in `second_string`.

    The percentage is the number of distinct shared n-grams divided by the total number
    of n-grams in `first_string`.

    Returns:
        `(percentage, common_ngrams)`. Always a 2-tuple: `(0, set())` is returned when
        `n` is not positive or when `first_string` has no n-grams of that size, so
        callers can unpack the result unconditionally.
    """
    if n <= 0:
        return 0, set()

    first_string_ngrams = ngrams(first_string.split(), n)
    if not first_string_ngrams:
        return 0, set()

    common_ngrams = set(first_string_ngrams) & set(ngrams(second_string.split(), n))
    percentage = (len(common_ngrams) / len(first_string_ngrams)) * 100
    return percentage, common_ngrams


def calculate_largest_ngram_percentage(
    first_string: str,
    second_string: str,
) -> Tuple[float, Set[Tuple[str, ...]], str]:
    """Find the largest n for which the two strings share at least one word n-gram.

    The search starts at `n = (word count of the shorter string) - 1` and decreases `n`
    until `calculate_shared_ngram_percentage` reports a non-zero percentage, measured
    against the shorter string (on a tie, against `second_string`).

    Returns:
        `(percentage rounded to 2 decimals, shared n-grams, n as a string)`. When nothing
        is shared the result is `(0, set(), "1")`.
    """
    first_word_count = len(first_string.split())
    second_word_count = len(second_string.split())
    if first_word_count < second_word_count:
        n = first_word_count - 1
    else:
        n = second_word_count - 1
        first_string, second_string = second_string, first_string
    # A string without any word would give n == -1, which is not a valid n-gram size.
    n = max(n, 0)

    shared_ngrams: Set[Tuple[str, ...]] = set()
    ngram_percentage = 0
    while not ngram_percentage:
        ngram_percentage, shared_ngrams = calculate_shared_ngram_percentage(
            first_string,
            second_string,
            n,
        )
        if n == 0:
            break
        n -= 1
    # `n` was decremented once after the last evaluated size, hence the `+ 1`.
    return round(ngram_percentage, 2), shared_ngrams, str(n + 1)


def is_parent_box(
    parent_target: Union[List, Tuple],
    child_target: Union[List, Tuple],
    add: float = 0.0,
) -> bool:
    """True if the child_target bounding box is nested in the parent_target.

    Box format: [x_bottom_left, y_bottom_left, x_top_right, y_top_right]. `child_target`
    may also be a single `[x, y]` point. The parameter `add` is the pixel error tolerance:
    the parent region is grown by `add` on every side before the comparison. Neither
    argument is modified. Returns False when `parent_target` does not have 4 values or
    `child_target` has neither 4 nor 2.
    """
    if len(parent_target) != 4:
        return False

    x_min, y_min, x_max, y_max = parent_target
    if add:
        x_min, y_min, x_max, y_max = x_min - add, y_min - add, x_max + add, y_max + add

    if len(child_target) == 4:
        return (
            child_target[0] >= x_min
            and child_target[1] >= y_min
            and child_target[2] <= x_max
            and child_target[3] <= y_max
        )
    if len(child_target) == 2:
        return x_min <= child_target[0] <= x_max and y_min <= child_target[1] <= y_max
    return False


def calculate_overlap_percentage(
    box1: Union[List, Tuple],
    box2: Union[List, Tuple],
    intersection_ratio_method: str = "total",
) -> Tuple[float, float, float, float]:
    """Calculate how much two boxes overlap, as a percentage of a reference region.

    Box format: a sequence of (x, y) corner points; only index 0 and index 2 are read,
    and they are treated as the min-x/min-y and max-x/max-y corners respectively.

    The reference region is the biggest element-region
    (intersection_ratio_method="parent"), the smallest element-region
    (intersection_ratio_method="partial"), or the union of both regions
    (intersection_ratio_method="total", also used for any other value).

    Returns:
        `(overlap_percentage, max_area, min_area, total_area)` where `total_area` is the
        plain sum of both areas. The percentage is rounded to 2 decimals and is 0 when the
        reference region has no area. The result is always a 4-tuple, so callers can
        unpack it even for degenerate (zero-area) boxes.
    """
    x1, y1 = box1[0]
    x2, y2 = box1[2]
    x3, y3 = box2[0]
    x4, y4 = box2[2]
    area_box1 = (x2 - x1) * (y2 - y1)
    area_box2 = (x4 - x3) * (y4 - y3)
    intersection_area = max(0, min(x2, x4) - max(x1, x3)) * max(0, min(y2, y4) - max(y1, y3))
    max_area = max(area_box1, area_box2)
    min_area = min(area_box1, area_box2)
    total_area = area_box1 + area_box2

    if intersection_ratio_method == "parent":
        reference_area = max_area
    elif intersection_ratio_method == "partial":
        reference_area = min_area
    else:
        reference_area = total_area - intersection_area

    if reference_area == 0:
        return 0, max_area, min_area, total_area

    overlap_percentage = (intersection_area / reference_area) * 100
    return round(overlap_percentage, 2), max_area, min_area, total_area


def identify_overlapping_case(
    box_pair: Union[List[Union[List, Tuple]], Tuple[Union[List, Tuple]]],
    label_pair: Union[List[str], Tuple[str]],
    text_pair: Union[List[str], Tuple[str]],
    ix_pair: Union[List[str], Tuple[str]],
    sm_overlap_threshold: float = 10.0,
):
    """Classifies the overlapping case for an element_pair input.

    There are 5 categories of overlapping:
    'Small partial overlap' (overlap relative to the smaller box is below
    `sm_overlap_threshold`), 'partial overlap with empty content in <label>',
    'partial overlap with duplicate text' (one text contains the other),
    'partial overlap without sharing text', and
    'partial overlap sharing <p>% of the text from<label>(<n>-gram)'.

    Returns:
        `(overlapping_elements, overlapping_case, overlap_percentage,
        largest_ngram_percentage, max_area, min_area, total_area)`.
        `largest_ngram_percentage` stays None unless the n-gram comparison was needed.
    """
    box1, box2 = box_pair
    type1, type2 = label_pair
    text1, text2 = text_pair
    ix_element1, ix_element2 = ix_pair

    overlap_percentage, max_area, min_area, total_area = calculate_overlap_percentage(
        box1,
        box2,
        intersection_ratio_method="partial",
    )

    first_element = f"{type1}(ix={ix_element1})"
    second_element = f"{type2}(ix={ix_element2})"
    overlapping_elements = [first_element, second_element]
    largest_ngram_percentage = None

    if overlap_percentage < sm_overlap_threshold:
        overlapping_case = "Small partial overlap"
    elif not text1:
        overlapping_case = f"partial overlap with empty content in {type1}"
    elif not text2:
        # The element with the empty content is reported first.
        overlapping_elements = [second_element, first_element]
        overlapping_case = f"partial overlap with empty content in {type2}"
    elif text1 in text2 or text2 in text1:
        overlapping_case = "partial overlap with duplicate text"
    else:
        largest_ngram_percentage, _, largest_n = calculate_largest_ngram_percentage(text1, text2)
        if not largest_ngram_percentage:
            overlapping_case = "partial overlap without sharing text"
        else:
            # The percentage is relative to the shorter text (ties go to the second one).
            ref_type = type1 if len(text1.split()) < len(text2.split()) else type2
            # NOTE(review): there is no space after "from"; the wording is kept as-is
            # because existing tests pin this exact string.
            overlapping_case = (
                f"partial overlap sharing {largest_ngram_percentage}% "
                f"of the text from{ref_type}({largest_n}-gram)"
            )

    return (
        overlapping_elements,
        overlapping_case,
        overlap_percentage,
        largest_ngram_percentage,
        max_area,
        min_area,
        total_area,
    )


def _split_indexed_label(label: str) -> Tuple[str, str]:
    """Split a label such as '12. Title' into its index ('12') and category ('Title')."""
    index, separator, category = label.partition(".")
    if separator and index.strip().isdigit():
        return index.strip(), category.strip()
    # Labels without the '<index>.' prefix keep the previous behaviour: every digit is
    # taken as the index and the first three characters are dropped.
    return "".join(ch for ch in label if ch.isnumeric()), label[3:].strip()


def identify_overlapping_or_nesting_case(
    box_pair: Union[List[Union[List, Tuple]], Tuple[Union[List, Tuple]]],
    label_pair: Union[List[str], Tuple[str]],
    text_pair: Union[List[str], Tuple[str]],
    nested_error_tolerance_px: int = 5,
    sm_overlap_threshold: float = 10.0,
):
    """Identify if there are nested or overlapping elements. If overlapping is present,
    it identifies the case calling the method identify_overlapping_case.

    Args:
        box_pair: two boxes, each a sequence of (x, y) corner points (index 0 and 2 are read).
        label_pair: two labels of the form '<index>. <category>'.
        text_pair: the text of both elements.
        nested_error_tolerance_px: pixels by which a box may stick out of the other one
            and still count as nested.
        sm_overlap_threshold: forwarded to identify_overlapping_case.

    Returns:
        `(overlapping_elements, overlapping_case, overlap_percentage,
        overlap_percentage_total, largest_ngram_percentage, max_area, min_area,
        total_area)`; every value is None when the boxes do not overlap on both axes.
    """
    box1, box2 = box_pair
    label1, label2 = label_pair
    ix_element1, type1 = _split_indexed_label(label1)
    ix_element2, type2 = _split_indexed_label(label2)

    x_bottom_left_1, y_bottom_left_1 = box1[0]
    x_top_right_1, y_top_right_1 = box1[2]
    x_bottom_left_2, y_bottom_left_2 = box2[0]
    x_top_right_2, y_top_right_2 = box2[2]
    box1_corners = [x_bottom_left_1, y_bottom_left_1, x_top_right_1, y_top_right_1]
    box2_corners = [x_bottom_left_2, y_bottom_left_2, x_top_right_2, y_top_right_2]

    horizontal_overlap = x_bottom_left_1 < x_top_right_2 and x_top_right_1 > x_bottom_left_2
    vertical_overlap = y_bottom_left_1 < y_top_right_2 and y_top_right_1 > y_bottom_left_2
    if not (horizontal_overlap and vertical_overlap):
        return (None,) * 8

    largest_ngram_percentage = None
    overlap_percentage_total, _, _, _ = calculate_overlap_percentage(
        box1,
        box2,
        intersection_ratio_method="total",
    )
    overlap_percentage, max_area, min_area, total_area = calculate_overlap_percentage(
        box1,
        box2,
        intersection_ratio_method="parent",
    )

    if is_parent_box(box1_corners, box2_corners, add=nested_error_tolerance_px):
        overlapping_elements = [f"{type1}(ix={ix_element1})", f"{type2}(ix={ix_element2})"]
        overlapping_case = f"nested {type2} in {type1}"
        overlap_percentage = 100
    elif is_parent_box(box2_corners, box1_corners, add=nested_error_tolerance_px):
        overlapping_elements = [f"{type2}(ix={ix_element2})", f"{type1}(ix={ix_element1})"]
        overlapping_case = f"nested {type1} in {type2}"
        overlap_percentage = 100
    else:
        # NOTE(review): the raw labels (still carrying the '<index>. ' prefix) are passed
        # on, so this path reports e.g. '0. Title(ix=0)' while the nested paths report
        # 'Title(ix=0)'. Existing tests pin both formats, so it is left unchanged here.
        (
            overlapping_elements,
            overlapping_case,
            overlap_percentage,
            largest_ngram_percentage,
            max_area,
            min_area,
            total_area,
        ) = identify_overlapping_case(
            box_pair,
            label_pair,
            text_pair,
            (ix_element1, ix_element2),
            sm_overlap_threshold=sm_overlap_threshold,
        )

    return (
        overlapping_elements,
        overlapping_case,
        overlap_percentage,
        overlap_percentage_total,
        largest_ngram_percentage,
        max_area,
        min_area,
        total_area,
    )


def catch_overlapping_and_nested_bboxes(
    elements: "List[Text]",
    nested_error_tolerance_px: int = 5,
    sm_overlap_threshold: float = 10.0,
) -> Tuple[bool, List[Dict]]:
    """Catch overlapping and nested bounding boxes cases across a list of elements.

    Elements are compared pairwise within each page (`element.metadata.page_number`).
    Elements without coordinates are ignored, and the page numbers do not need to be
    ordered or contiguous.

    Args:
        elements: elements exposing `.text`, `.category`, `.metadata.page_number` and
            `.metadata.coordinates`.
        nested_error_tolerance_px: pixel tolerance used when deciding that a box is nested.
        sm_overlap_threshold: percentage under which an overlap is reported as
            'Small partial overlap'.

    Returns:
        `(document_with_overlapping_flag, overlapping_cases)`: whether at least one case
        was found, and one report dict per overlapping or nested pair.
    """
    # page number -> [(corner points, "<index>. <category>", text), ...]
    pages: Dict[Optional[int], List[Tuple]] = {}
    for ix, element in enumerate(elements):
        coordinates = getattr(element.metadata, "coordinates", None)
        points = coordinates.to_dict()["points"] if coordinates is not None else None
        if not points:
            continue
        pages.setdefault(element.metadata.page_number, []).append(
            (points, f"{ix}. {element.category}", element.text),
        )

    overlapping_cases = []
    # Pages are visited in ascending order; elements without a page number come last.
    for page_number in sorted(pages, key=lambda page: (page is None, page or 0)):
        for first, second in combinations(pages[page_number], 2):
            (
                overlapping_elements,
                overlapping_case,
                overlap_percentage,
                overlap_percentage_total,
                largest_ngram_percentage,
                max_area,
                min_area,
                total_area,
            ) = identify_overlapping_or_nesting_case(
                (first[0], second[0]),
                (first[1], second[1]),
                (first[2], second[2]),
                nested_error_tolerance_px,
                sm_overlap_threshold,
            )
            if not overlapping_case:
                continue
            overlapping_cases.append(
                {
                    "overlapping_elements": overlapping_elements,
                    "overlapping_case": overlapping_case,
                    "overlap_percentage": f"{overlap_percentage}%",
                    "metadata": {
                        "largest_ngram_percentage": largest_ngram_percentage,
                        "overlap_percentage_total": f"{overlap_percentage_total}%",
                        "max_area": f"{round(max_area, 2)}pxˆ2",
                        "min_area": f"{round(min_area, 2)}pxˆ2",
                        "total_area": f"{round(total_area, 2)}pxˆ2",
                    },
                },
            )

    return bool(overlapping_cases), overlapping_cases