[Merge request] bug fix on table structure metric (#3089)

**Summary** This fix provides better logic for computing matched_idx when calculating the table structure metric, giving a more accurate calculation of the acc. **Additional Context** - This fix initially passed the CI run in Draft PR #3025. - Therefore, this time we would like to merge it into the main branch. - This commit has merged the latest changes from main made after the Draft PR.
2025-12-13 08:01:37 +00:00 · 2024-06-10 11:14:32 -04:00 · 2024-06-10 11:14:32 -04:00 · d82a34519e
commit d82a34519e
parent 657a949a00
3 changed files with 32 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,8 @@

 ### Fixes

+**table metric bug fix** `get_element_level_alignment()` now finds all the matched indices in the predicted table data instead of returning only the first match when there are multiple matches for the same gt string.
+
 ## 0.14.5

 ### Enhancements
--- a/unstructured/metrics/evaluate.py
+++ b/unstructured/metrics/evaluate.py
@ -160,6 +160,7 @@ class BaseMetricsCalculator(ABC):
    @abstractmethod
    def _process_document(self, doc: Path) -> list:
        """Should return all metadata and metrics for a single document."""
+        pass


@dataclass
--- a/unstructured/metrics/table/table_alignment.py
+++ b/unstructured/metrics/table/table_alignment.py
@ -50,11 +50,9 @@ class TableAlignment:

    @staticmethod
    def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
-        df = pd.DataFrame(table_data).pivot(
-            index="row_index",
-            columns="col_index",
-            values="content",
-        )
+        df = pd.DataFrame(table_data, columns=["row_index", "col_index", "content"])
+        df = df.set_index("row_index")
+        df["col_index"] = df["col_index"].astype(str)
        return df

    @staticmethod
@ -100,7 +98,7 @@ class TableAlignment:

            # Get row and col index accuracy
            ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td]
-
+            used_indices = set()
            indices_tuple_pairs = []
            for td_ele in td:
                content = td_ele["content"].lower()
@ -113,8 +111,31 @@ class TableAlignment:
                    cutoff=cutoff,
                    n=1,
                )
-                matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1
-
+                # BUG FIX: the previous matched_idx will only output the first matched index if
+                # the match has duplicates in the
+                # ground_truth_td_contents_list, the current fix will output its correspondence idx
+                # once matching is exhausted, it will go back search again the same fashion
+                matching_indices = []
+                if matches != []:
+                    b_indices = [
+                        i
+                        for i, b_string in enumerate(ground_truth_td_contents_list)
+                        if b_string == matches[0] and i not in used_indices
+                    ]
+                    if not b_indices:
+                        # If all indices are used, reset used_indices and use the first index
+                        used_indices.clear()
+                        b_indices = [
+                            i
+                            for i, b_string in enumerate(ground_truth_td_contents_list)
+                            if b_string == matches[0] and i not in used_indices
+                        ]
+                    matching_index = b_indices[0]
+                    matching_indices.append(matching_index)
+                    used_indices.add(matching_index)
+                else:
+                    matching_indices = [-1]
+                matched_idx = matching_indices[0]
                if matched_idx >= 0:
                    gt_row_index = ground_truth_td[matched_idx]["row_index"]
                    gt_col_index = ground_truth_td[matched_idx]["col_index"]