feat: calculate element type percent match (#1723)

**Executive Summary**
Adds a function to calculate the percent match between two element type
frequency outputs produced by the `get_element_type_frequency` function.

**Technical Detail**
- The function takes two `Dict` inputs, both of which should be outputs of
`get_element_type_frequency`
- Implementors can define the weight `category_depth_weight` to give to
elements that match on `type` but differ in `category_depth`
- The function first loops through the output items to find exact matches
(same `type` and `category_depth`), counts them, and collects the remaining
unmatched counts for both output and source in new dictionaries keyed by
`type`. It then loops through the source items that were not exactly
matched to find `type`-only matches, which are weighted by the
`category_depth_weight` factor defined earlier (default 0.5).

**Output**
output
```
{
  ("Title", 0): 2,
  ("Title", 1): 1,
  ("NarrativeText", None): 3,
  ("UncategorizedText", None): 1,
}
```

source
```
{
  ("Title", 0): 1,
  ("Title", 1): 2,
  ("NarrativeText", None): 5,
}
```

With this output and source and a weight of 0.5, the percent match is
5.5 / 8: 5 exact matches plus 1 partial match weighted at 0.5, divided by
the 8 elements in the source.

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
This commit is contained in:
Klaijan 2023-10-16 13:57:28 -04:00 committed by GitHub
parent 9c7ee8921a
commit ba4c649cf0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 144 additions and 2 deletions

View File

@ -6,6 +6,8 @@
### Features
* **Adds element type percent match function** In order to evaluate the extracted element types, we add a function that calculates the matched percentage between two frequency dictionaries.
### Fixes
## 0.10.23
@ -35,8 +37,8 @@
* **Emit hyperlink metadata for DOCX file-type.** DOCX partitioner now adds `metadata.links`, `metadata.link_texts` and `metadata.link_urls` for elements that contain a hyperlink that points to an external resource. So-called "jump" links pointing to document internal locations (such as those found in a table-of-contents "jumping" to a chapter or section) are excluded.
### Features
* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.
* **Add `elements_to_text` as a staging helper function** In order to get a single clean text output from unstructured for metric calculations, automate the process of extracting text from elements using this function.
* **Adds permissions(RBAC) data ingestion functionality for the Sharepoint connector.** Problem: Role based access control is an important component in many data storage systems. Users may need to pass permissions (RBAC) data to downstream systems when ingesting data. Feature: Added permissions data ingestion functionality to the Sharepoint connector.
### Fixes

View File

@ -1,6 +1,9 @@
import pytest
from unstructured.metrics.element_type import get_element_type_frequency
from unstructured.metrics.element_type import (
calculate_element_type_percent_match,
get_element_type_frequency,
)
from unstructured.partition.auto import partition
from unstructured.staging.base import elements_to_json
@ -35,3 +38,72 @@ def test_get_element_type_frequency(filename, frequency):
elements = partition(filename=f"example-docs/{filename}")
elements_freq = get_element_type_frequency(elements_to_json(elements))
assert elements_freq == frequency
@pytest.mark.parametrize(
    ("filename", "expected_frequency", "percent_matched"),
    [
        (
            "fake-email.txt",
            {
                ("UncategorizedText", None): 14,
                ("ListItem", None): 2,
                ("NarrativeText", None): 2,
            },
            (0.56, 0.56, 0.56),
        ),
        (
            "sample-presentation.pptx",
            {
                ("Title", 0): 3,
                ("Title", 1): 1,
                ("NarrativeText", None): 1,
                ("NarrativeText", 0): 3,
                ("ListItem", 0): 6,
                ("ListItem", 1): 6,
                ("ListItem", 2): 3,
                ("Table", None): 1,
            },
            (0.96, 0.96, 0.96),
        ),
        (
            "handbook-1p.docx",
            {
                ("Header", None): 1,
                ("Title", 0): 1,
                ("Title", 1): 1,
                ("Title", 2): 1,
                ("ListItem", 3): 3,
                ("NarrativeText", 4): 7,
                ("Footer", None): 1,
            },
            (0.43, 0.07, 0.65),
        ),
        (
            "handbook-1p.docx",
            {
                ("Header", None): 1,
                ("Title", 0): 6,
                ("NarrativeText", 0): 7,
                ("PageBreak", None): 1,
                ("Footer", None): 1,
            },
            (0.94, 0.88, 0.98),
        ),
    ],
)
def test_calculate_element_type_percent_match(filename, expected_frequency, percent_matched):
    """
    Partition an example document and compare its element type frequency against
    `expected_frequency`.

    `percent_matched` holds the three expected scores, each rounded to two decimals:
    one for the default depth weight, one for a weight of 0.0, and one for a weight
    of 0.8.
    """
    partitioned = partition(filename=f"example-docs/{filename}")
    actual_frequency = get_element_type_frequency(elements_to_json(partitioned))
    expected_default, expected_weight_zero, expected_weight_high = percent_matched

    # No explicit weight: exercises the function's default `category_depth_weight`.
    default_score = calculate_element_type_percent_match(actual_frequency, expected_frequency)
    assert round(default_score, 2) == expected_default

    # Weight 0.0: type-only matches (different depth) earn no credit.
    weight_zero_score = calculate_element_type_percent_match(
        actual_frequency,
        expected_frequency,
        0.0,
    )
    assert round(weight_zero_score, 2) == expected_weight_zero

    # Weight 0.8: type-only matches earn most of the credit of an exact match.
    weight_high_score = calculate_element_type_percent_match(
        actual_frequency,
        expected_frequency,
        0.8,
    )
    assert round(weight_high_score, 2) == expected_weight_high

View File

@ -20,3 +20,71 @@ def get_element_type_frequency(
else:
frequency[key] += 1
return frequency
def calculate_element_type_percent_match(
    output: Dict,
    source: Dict,
    category_depth_weight: float = 0.5,
) -> float:
    """
    Calculate the percent match between two element type frequency dictionaries.

    Intended for use with the output of `get_element_type_frequency`, i.e. dictionaries
    of the form `(type, depth): count`. Exact matches (same type and depth) count in
    full. Whatever is left over on each side is then compared by type alone, and those
    partial matches (same type, different depth) are scaled by `category_depth_weight`.
    The sum is normalized by the total number of elements in `source`.

    Args:
        output: Frequency dictionary being evaluated.
        source: Frequency dictionary used as the reference; its total count is the
            denominator of the score.
        category_depth_weight: Credit given to an element whose type matches but whose
            depth does not. Defaults to 0.5.

    Returns:
        The match ratio clamped to the range [0.0, 1.0]. Returns 0.0 when either
        dictionary is empty or when `source` contains no elements (all counts zero).
    """
    if not output or not source:
        return 0.0

    # Exact matches plus source leftovers always add up to the source total, so the
    # denominator can be computed up front. Guard it: a non-empty source whose counts
    # are all zero would otherwise raise ZeroDivisionError.
    total_source_element_count = sum(source.values())
    if total_source_element_count == 0:
        return 0.0

    # Work on copies so the caller's dictionaries are never modified.
    output_leftover = dict(output)
    source_leftover = dict(source)
    total_match_element_count = 0

    # Pass 1: exact (type, depth) matches. Only values are updated while iterating,
    # never keys, so the dictionaries keep their size during the loop.
    for key in output_leftover:
        if key in source_leftover:
            match_count = min(output_leftover[key], source_leftover[key])
            total_match_element_count += match_count
            # remove what has already been matched from both sides
            output_leftover[key] -= match_count
            source_leftover[key] -= match_count

    # Pass 2: collapse the leftovers by type and give weighted credit for elements
    # whose type matches even though their depth did not.
    unmatched_depth_output = _convert_to_frequency_without_depth(output_leftover)
    unmatched_depth_source = _convert_to_frequency_without_depth(source_leftover)
    for element_type, source_count in unmatched_depth_source.items():
        if element_type in unmatched_depth_output:
            match_count = min(unmatched_depth_output[element_type], source_count)
            total_match_element_count += match_count * category_depth_weight

    return min(max(total_match_element_count / total_source_element_count, 0.0), 1.0)


def _convert_to_frequency_without_depth(d: Dict) -> Dict:
    """
    Takes in element frequency with depth of format (type, depth): value
    and converts to dictionary without depth of format type: value,
    summing the values of all entries that share the same type.
    """
    res: Dict = {}
    for key, count in d.items():
        element_type = key[0]
        res[element_type] = res.get(element_type, 0) + count
    return res