diff --git a/CHANGELOG.md b/CHANGELOG.md index 69bf0a9bb..39576060b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Features +* **Adds element type percent match function** In order to evaluate the extracted element types, we add a function that calculates the matched percentage between two frequency dictionaries. + ### Fixes ## 0.10.23 @@ -35,8 +37,8 @@ * **Emit hyperlink metadata for DOCX file-type.** DOCX partitioner now adds `metadata.links`, `metadata.link_texts` and `metadata.link_urls` for elements that contain a hyperlink that points to an external resource. So-called "jump" links pointing to document internal locations (such as those found in a table-of-contents "jumping" to a chapter or section) are excluded. ### Features -* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function. +* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function. * **Adds permissions(RBAC) data ingestion functionality for the Sharepoint connector.** Problem: Role based access control is an important component in many data storage systems. Users may need to pass permissions (RBAC) data to downstream systems when ingesting data. Feature: Added permissions data ingestion functionality to the Sharepoint connector. 
### Fixes diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index b1ba2094f..01d80b10d 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -1,6 +1,9 @@ import pytest -from unstructured.metrics.element_type import get_element_type_frequency +from unstructured.metrics.element_type import ( + calculate_element_type_percent_match, + get_element_type_frequency, +) from unstructured.partition.auto import partition from unstructured.staging.base import elements_to_json @@ -35,3 +38,72 @@ def test_get_element_type_frequency(filename, frequency): elements = partition(filename=f"example-docs/{filename}") elements_freq = get_element_type_frequency(elements_to_json(elements)) assert elements_freq == frequency + + +@pytest.mark.parametrize( + ("filename", "expected_frequency", "percent_matched"), + [ + ( + "fake-email.txt", + { + ("UncategorizedText", None): 14, + ("ListItem", None): 2, + ("NarrativeText", None): 2, + }, + (0.56, 0.56, 0.56), + ), + ( + "sample-presentation.pptx", + { + ("Title", 0): 3, + ("Title", 1): 1, + ("NarrativeText", None): 1, + ("NarrativeText", 0): 3, + ("ListItem", 0): 6, + ("ListItem", 1): 6, + ("ListItem", 2): 3, + ("Table", None): 1, + }, + (0.96, 0.96, 0.96), + ), + ( + "handbook-1p.docx", + { + ("Header", None): 1, + ("Title", 0): 1, + ("Title", 1): 1, + ("Title", 2): 1, + ("ListItem", 3): 3, + ("NarrativeText", 4): 7, + ("Footer", None): 1, + }, + (0.43, 0.07, 0.65), + ), + ( + "handbook-1p.docx", + { + ("Header", None): 1, + ("Title", 0): 6, + ("NarrativeText", 0): 7, + ("PageBreak", None): 1, + ("Footer", None): 1, + }, + (0.94, 0.88, 0.98), + ), + ], +) +def test_calculate_element_type_percent_match(filename, expected_frequency, percent_matched): + elements = partition(filename=f"example-docs/{filename}") + elements_frequency = get_element_type_frequency(elements_to_json(elements)) + assert ( + 
def calculate_element_type_percent_match(
    output: Dict,
    source: Dict,
    category_depth_weight: float = 0.5,
) -> float:
    """
    Calculate the percent match between two frequency dictionaries. Intended to be used
    with the `get_element_type_frequency` function.

    Both dictionaries map an `(element_type, category_depth)` tuple to a count. Matching
    happens in two passes:

    1. Exact match: for every key present in both dictionaries (same type and same
       depth), the smaller of the two counts is credited in full.
    2. Weighted match: whatever is left over on each side is summed per element type
       (depth ignored); the smaller of the two per-type leftovers is credited, scaled
       by `category_depth_weight`.

    The credited total is normalized by the total number of elements in `source`.

    Args:
        output: frequency dictionary of the elements being evaluated.
        source: frequency dictionary of the reference elements.
        category_depth_weight: credit given to an element whose type matches but whose
            depth does not (0.0 ignores such matches, 1.0 counts them as exact).

    Returns:
        The match ratio clamped to the range [0.0, 1.0]. Returns 0.0 when either
        dictionary is empty or when the counts in `source` sum to zero.
    """
    if not output or not source:
        return 0.0

    # Every source element is counted exactly once in the denominator, whether it ends
    # up matched exactly, matched partially, or not matched at all.
    total_source_element_count = sum(source.values())
    if total_source_element_count == 0:
        # Non-empty source whose counts are all zero: nothing to match against, and
        # dividing would raise ZeroDivisionError.
        return 0.0

    # Work on copies so the caller's dictionaries are never mutated.
    output_leftover = dict(output)
    source_leftover = dict(source)
    total_match_element_count = 0

    # Pass 1: exact (type, depth) matches.
    for key, output_count in output.items():
        if key not in source:
            continue
        match_count = min(output_count, source[key])
        total_match_element_count += match_count
        # Remove what was already matched so it cannot be credited again in pass 2.
        output_leftover[key] -= match_count
        source_leftover[key] -= match_count

    # Pass 2: match the leftovers on element type only, at a reduced weight.
    unmatched_depth_output = _convert_to_frequency_without_depth(output_leftover)
    unmatched_depth_source = _convert_to_frequency_without_depth(source_leftover)
    for element_type, source_count in unmatched_depth_source.items():
        if element_type in unmatched_depth_output:
            match_count = min(unmatched_depth_output[element_type], source_count)
            total_match_element_count += match_count * category_depth_weight

    return min(max(total_match_element_count / total_source_element_count, 0.0), 1.0)


def _convert_to_frequency_without_depth(d: Dict) -> Dict:
    """
    Takes in element frequency with depth of format (type, depth): value
    and converts to dictionary without depth of format type: value.
    Counts of keys that share the same type are summed.
    """
    res: Dict = {}
    for key, count in d.items():
        element_type = key[0]
        res[element_type] = res.get(element_type, 0) + count
    return res