mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-18 18:54:09 +00:00
[Merge request] bug fix on table structure metric (#3089)
**Summary** This fix improves the logic for choosing matched_idx when calculating the table structure metric, so that the accuracy is calculated more accurately. **Additional Context** - This fix initially passed the CI run in Draft PR #3025. - Therefore, this time we would like to merge it into the main branch. - This commit includes the latest changes merged from main after the Draft PR.
This commit is contained in:
parent
657a949a00
commit
d82a34519e
@ -6,6 +6,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
**table metric bug fix** get_element_level_alignment() now finds all the matched indices in the predicted table data instead of returning only the first match when there are multiple matches for the same gt string.
|
||||||
|
|
||||||
## 0.14.5
|
## 0.14.5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -160,6 +160,7 @@ class BaseMetricsCalculator(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _process_document(self, doc: Path) -> list:
|
def _process_document(self, doc: Path) -> list:
|
||||||
"""Should return all metadata and metrics for a single document."""
|
"""Should return all metadata and metrics for a single document."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@ -50,11 +50,9 @@ class TableAlignment:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
|
def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
|
||||||
df = pd.DataFrame(table_data).pivot(
|
df = pd.DataFrame(table_data, columns=["row_index", "col_index", "content"])
|
||||||
index="row_index",
|
df = df.set_index("row_index")
|
||||||
columns="col_index",
|
df["col_index"] = df["col_index"].astype(str)
|
||||||
values="content",
|
|
||||||
)
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -100,7 +98,7 @@ class TableAlignment:
|
|||||||
|
|
||||||
# Get row and col index accuracy
|
# Get row and col index accuracy
|
||||||
ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td]
|
ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td]
|
||||||
|
used_indices = set()
|
||||||
indices_tuple_pairs = []
|
indices_tuple_pairs = []
|
||||||
for td_ele in td:
|
for td_ele in td:
|
||||||
content = td_ele["content"].lower()
|
content = td_ele["content"].lower()
|
||||||
@ -113,8 +111,31 @@ class TableAlignment:
|
|||||||
cutoff=cutoff,
|
cutoff=cutoff,
|
||||||
n=1,
|
n=1,
|
||||||
)
|
)
|
||||||
matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1
|
# BUG FIX: previously matched_idx was always the first matching index, even when
|
||||||
|
# the matched string appears more than once in
|
||||||
|
# ground_truth_td_contents_list. Now each match takes the first index not yet used;
|
||||||
|
# once every index for that string is used, used_indices is reset and the search restarts.
|
||||||
|
matching_indices = []
|
||||||
|
if matches != []:
|
||||||
|
b_indices = [
|
||||||
|
i
|
||||||
|
for i, b_string in enumerate(ground_truth_td_contents_list)
|
||||||
|
if b_string == matches[0] and i not in used_indices
|
||||||
|
]
|
||||||
|
if not b_indices:
|
||||||
|
# If all indices are used, reset used_indices and use the first index
|
||||||
|
used_indices.clear()
|
||||||
|
b_indices = [
|
||||||
|
i
|
||||||
|
for i, b_string in enumerate(ground_truth_td_contents_list)
|
||||||
|
if b_string == matches[0] and i not in used_indices
|
||||||
|
]
|
||||||
|
matching_index = b_indices[0]
|
||||||
|
matching_indices.append(matching_index)
|
||||||
|
used_indices.add(matching_index)
|
||||||
|
else:
|
||||||
|
matching_indices = [-1]
|
||||||
|
matched_idx = matching_indices[0]
|
||||||
if matched_idx >= 0:
|
if matched_idx >= 0:
|
||||||
gt_row_index = ground_truth_td[matched_idx]["row_index"]
|
gt_row_index = ground_truth_td[matched_idx]["row_index"]
|
||||||
gt_col_index = ground_truth_td[matched_idx]["col_index"]
|
gt_col_index = ground_truth_td[matched_idx]["col_index"]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user