feat: calculate element type percent match (#1723)

**Executive Summary** Adds a function to calculate the percent match between two element type frequency outputs from the `get_element_type_frequency` function. **Technical Detail** - The function takes two `Dict` inputs, both of which should be outputs of `get_element_type_frequency` - Implementors can define the weight `category_depth_weight` they want to give to elements that match in `type` but differ in `category_depth` - The function first loops through the output item list to find and count exact matches, and collects the remaining values for both output and source in new collections (of `dict` type). It then loops through the source items that were not an exact match to find `type` matches, which are weighted by the `category_depth_weight` factor defined earlier (default 0.5) **Output** output ``` { ("Title", 0): 2, ("Title", 1): 1, ("NarrativeText", None): 3, ("UncategorizedText", None): 1, } ``` source ``` { ("Title", 0): 1, ("Title", 1): 2, ("NarrativeText", None): 5, } ``` With this output and source, and a weight of 0.5, the % match will yield 5.5 / 8 -- 5 exact matches, and 1 partial match with 0.5 weight. --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2025-12-04 03:00:24 +00:00 · 2023-10-16 13:57:28 -04:00 · 2023-10-16 13:57:28 -04:00 · ba4c649cf0
commit ba4c649cf0
parent 9c7ee8921a
3 changed files with 144 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,8 @@

 ### Features

+* **Adds element type percent match function** In order to evaluate the element types extracted, we add a function that calculates the matched percentage between two frequency dictionaries.
+
 ### Fixes

 ## 0.10.23
@ -35,8 +37,8 @@
 * **Emit hyperlink metadata for DOCX file-type.** DOCX partitioner now adds `metadata.links`, `metadata.link_texts` and `metadata.link_urls` for elements that contain a hyperlink that points to an external resource. So-called "jump" links pointing to document internal locations (such as those found in a table-of-contents "jumping" to a chapter or section) are excluded.

 ### Features
-* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.

+* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.
 * **Adds permissions(RBAC) data ingestion functionality for the Sharepoint connector.** Problem: Role based access control is an important component in many data storage systems. Users may need to pass permissions (RBAC) data to downstream systems when ingesting data. Feature: Added permissions data ingestion functionality to the Sharepoint connector.

 ### Fixes
--- a/test_unstructured/metrics/test_element_type.py
+++ b/test_unstructured/metrics/test_element_type.py
@ -1,6 +1,9 @@
 import pytest

-from unstructured.metrics.element_type import get_element_type_frequency
+from unstructured.metrics.element_type import (
+    calculate_element_type_percent_match,
+    get_element_type_frequency,
+)
 from unstructured.partition.auto import partition
 from unstructured.staging.base import elements_to_json

@ -35,3 +38,72 @@ def test_get_element_type_frequency(filename, frequency):
    elements = partition(filename=f"example-docs/{filename}")
    elements_freq = get_element_type_frequency(elements_to_json(elements))
    assert elements_freq == frequency
+
+
+@pytest.mark.parametrize(
+    ("filename", "expected_frequency", "percent_matched"),
+    [
+        (
+            "fake-email.txt",
+            {
+                ("UncategorizedText", None): 14,
+                ("ListItem", None): 2,
+                ("NarrativeText", None): 2,
+            },
+            (0.56, 0.56, 0.56),
+        ),
+        (
+            "sample-presentation.pptx",
+            {
+                ("Title", 0): 3,
+                ("Title", 1): 1,
+                ("NarrativeText", None): 1,
+                ("NarrativeText", 0): 3,
+                ("ListItem", 0): 6,
+                ("ListItem", 1): 6,
+                ("ListItem", 2): 3,
+                ("Table", None): 1,
+            },
+            (0.96, 0.96, 0.96),
+        ),
+        (
+            "handbook-1p.docx",
+            {
+                ("Header", None): 1,
+                ("Title", 0): 1,
+                ("Title", 1): 1,
+                ("Title", 2): 1,
+                ("ListItem", 3): 3,
+                ("NarrativeText", 4): 7,
+                ("Footer", None): 1,
+            },
+            (0.43, 0.07, 0.65),
+        ),
+        (
+            "handbook-1p.docx",
+            {
+                ("Header", None): 1,
+                ("Title", 0): 6,
+                ("NarrativeText", 0): 7,
+                ("PageBreak", None): 1,
+                ("Footer", None): 1,
+            },
+            (0.94, 0.88, 0.98),
+        ),
+    ],
+)
+def test_calculate_element_type_percent_match(filename, expected_frequency, percent_matched):
+    # `percent_matched` holds the expected score, rounded to two decimals, for three
+    # weights in order: the function's default weight, 0.0 and 0.8.
+    expected_default, expected_weight_zero, expected_weight_high = percent_matched
+
+    partitioned = partition(filename=f"example-docs/{filename}")
+    actual_frequency = get_element_type_frequency(elements_to_json(partitioned))
+
+    def rounded_match(*weight):
+        # with no `weight` given, the function's own default weight is exercised
+        score = calculate_element_type_percent_match(
+            actual_frequency,
+            expected_frequency,
+            *weight,
+        )
+        return round(score, 2)
+
+    assert rounded_match() == expected_default
+    assert rounded_match(0.0) == expected_weight_zero
+    assert rounded_match(0.8) == expected_weight_high
--- a/unstructured/metrics/element_type.py
+++ b/unstructured/metrics/element_type.py
@ -20,3 +20,71 @@ def get_element_type_frequency(
        else:
            frequency[key] += 1
    return frequency
+
+
+def calculate_element_type_percent_match(
+    output: Dict,
+    source: Dict,
+    category_depth_weight: float = 0.5,
+) -> float:
+    """
+    Calculate the percent match between two frequency dictionaries. Intended to be used
+    with the output of the `get_element_type_frequency` function.
+
+    Matching happens in two passes. Exact matches (same type and same depth) count in
+    full. Among the leftovers, matches on type only (different depth) count with a factor
+    of `category_depth_weight`. The weighted match count is then normalized by the total
+    number of elements in `source` and clamped to the range [0.0, 1.0].
+
+    Args:
+        output: frequency dictionary, keyed by `(type, depth)`, to be evaluated.
+        source: frequency dictionary, keyed by `(type, depth)`, to compare against.
+        category_depth_weight: weight applied to matches on type but not on depth.
+
+    Returns:
+        The match ratio as a float between 0.0 and 1.0. Returns 0.0 when either
+        dictionary is empty or when `source` holds no elements at all.
+    """
+    if not output or not source:
+        return 0.0
+
+    # every source element is counted exactly once (as an exact match or as a leftover),
+    # so the normalization total is simply the sum of the source counts
+    total_source_element_count = sum(source.values())
+    if total_source_element_count == 0:
+        # a source with only zero counts would otherwise raise ZeroDivisionError below
+        return 0.0
+
+    # work on copies so the caller's dictionaries are left untouched
+    output_copy = output.copy()
+    source_copy = source.copy()
+    total_match_element_count = 0
+
+    # first pass: exact (type, depth) matches, removed from both copies once counted
+    for key in output_copy:
+        if key in source_copy:
+            match_count = min(output_copy[key], source_copy[key])
+            total_match_element_count += match_count
+            output_copy[key] -= match_count
+            source_copy[key] -= match_count
+
+    # collapse the unmatched leftovers to per-type counts, ignoring depth
+    unmatched_depth_output = _convert_to_frequency_without_depth(output_copy)
+    unmatched_depth_source = _convert_to_frequency_without_depth(source_copy)
+
+    # second pass: leftovers that agree on type only are counted with the depth weight
+    for element_type, source_count in unmatched_depth_source.items():
+        if element_type in unmatched_depth_output:
+            match_count = min(unmatched_depth_output[element_type], source_count)
+            total_match_element_count += match_count * category_depth_weight
+
+    return min(max(total_match_element_count / total_source_element_count, 0.0), 1.0)
+
+
+def _convert_to_frequency_without_depth(d: Dict) -> Dict:
+    """
+    Collapse a frequency dictionary keyed by (type, depth) into one keyed by type only,
+    summing the values of all entries that share the same type.
+    """
+    totals_by_type = {}
+    for type_and_depth, count in d.items():
+        element_type = type_and_depth[0]
+        # the first value seen for a type is stored as-is; later ones are added to it
+        totals_by_type[element_type] = (
+            totals_by_type[element_type] + count if element_type in totals_by_type else count
+        )
+    return totals_by_type