Fix text normalization during table cell comparison

2025-12-28 15:46:29 +00:00 · 2025-03-19 18:29:42 +00:00 · 2025-03-19 18:29:42 +00:00 · a4605e4efc
commit a4605e4efc
parent 17979118ba
2 changed files with 35 additions and 8 deletions
--- a/olmocr/bench/test_tests.py
+++ b/olmocr/bench/test_tests.py
@ -1,6 +1,5 @@
 import unittest

-
 from olmocr.bench.tests import (
    BaselineTest,
    BasePDFTest,
@ -546,6 +545,22 @@ Some text before the table.
        result, explanation = test.run(valid_table_eof)
        self.assertTrue(result, f"Valid table at EOF without newline not detected: {explanation}")

+    def test_normalizing(self):
+        table = """| Question - – Satisfaction on scale of 10 | Response | Resident Sample | Business Sample |
+|----------------------------------------|----------|----------------|-----------------|
+| Planning for and managing residential, commercial and industrial development | Rating of 8, 9 or 10 | 13% | 11% |
+| | Average rating | 6.4 | 5.7 |
+| | Don’t know responses | 11% | 6% |
+| Environmental protection, support for green projects (e.g. green grants, building retrofits programs, zero waste) | Rating of 8, 9 or 10 | 35% | 34% |
+| | Average rating | 8.0 | 7.5 |
+| | Don’t know responses | 8% | 6% |
+| Providing and maintaining parks and green spaces | Rating of 8, 9 or 10 | 42% | 41% |
+| | Average rating | 7.7 | 7.3 |
+| | Don’t know responses | 1% | 1% |"""
+        test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="6%", top_heading="Business\nSample")
+        result, explanation = test.run(table)
+        self.assertTrue(result, explanation)
+

 class TestBaselineTest(unittest.TestCase):
    """Test the BaselineTest class"""
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@ -114,6 +114,7 @@ class TextPresenceTest(BasePDFTest):
        super().__post_init__()
        if self.type not in {TestType.PRESENT.value, TestType.ABSENT.value}:
            raise ValidationError(f"Invalid type for TextPresenceTest: {self.type}")
+        self.text = normalize_text(self.text)
        if not self.text.strip():
            raise ValidationError("Text field cannot be empty")

@ -169,6 +170,8 @@ class TextOrderTest(BasePDFTest):
        super().__post_init__()
        if self.type != TestType.ORDER.value:
            raise ValidationError(f"Invalid type for TextOrderTest: {self.type}")
+        self.before = normalize_text(self.before)
+        self.after = normalize_text(self.after)
        if not self.before.strip():
            raise ValidationError("Before field cannot be empty")
        if not self.after.strip():
@ -216,6 +219,15 @@ class TableTest(BasePDFTest):
        if self.type != TestType.TABLE.value:
            raise ValidationError(f"Invalid type for TableTest: {self.type}")

+        # Normalize the search text too
+        self.cell = normalize_text(self.cell)
+        self.up = normalize_text(self.up)
+        self.down = normalize_text(self.down)
+        self.left = normalize_text(self.left)
+        self.right = normalize_text(self.right)
+        self.top_heading = normalize_text(self.top_heading)
+        self.left_heading = normalize_text(self.left_heading)
+
    def parse_markdown_tables(self, md_content: str) -> List[np.ndarray]:
        """
        Extract and parse all markdown tables from the provided content.
@ -390,7 +402,7 @@ class TableTest(BasePDFTest):
            matches = []
            for i in range(table_array.shape[0]):
                for j in range(table_array.shape[1]):
-                    cell_content = table_array[i, j]
+                    cell_content = normalize_text(table_array[i, j])
                    similarity = fuzz.ratio(self.cell, cell_content) / 100.0

                    if similarity >= threshold:
@ -407,7 +419,7 @@ class TableTest(BasePDFTest):

                # Check up relationship
                if self.up and row_idx > 0:
-                    up_cell = table_array[row_idx - 1, col_idx]
+                    up_cell = normalize_text(table_array[row_idx - 1, col_idx])
                    up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
                    if up_similarity < threshold:
                        all_relationships_satisfied = False
@ -415,7 +427,7 @@ class TableTest(BasePDFTest):

                # Check down relationship
                if self.down and row_idx < table_array.shape[0] - 1:
-                    down_cell = table_array[row_idx + 1, col_idx]
+                    down_cell = normalize_text(table_array[row_idx + 1, col_idx])
                    down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
                    if down_similarity < threshold:
                        all_relationships_satisfied = False
@ -423,7 +435,7 @@ class TableTest(BasePDFTest):

                # Check left relationship
                if self.left and col_idx > 0:
-                    left_cell = table_array[row_idx, col_idx - 1]
+                    left_cell = normalize_text(table_array[row_idx, col_idx - 1])
                    left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
                    if left_similarity < threshold:
                        all_relationships_satisfied = False
@ -433,7 +445,7 @@ class TableTest(BasePDFTest):

                # Check right relationship
                if self.right and col_idx < table_array.shape[1] - 1:
-                    right_cell = table_array[row_idx, col_idx + 1]
+                    right_cell = normalize_text(table_array[row_idx, col_idx + 1])
                    right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
                    if right_similarity < threshold:
                        all_relationships_satisfied = False
@ -447,7 +459,7 @@ class TableTest(BasePDFTest):
                    top_heading_cell = ""
                    for i in range(row_idx):
                        if table_array[i, col_idx].strip():
-                            top_heading_cell = table_array[i, col_idx]
+                            top_heading_cell = normalize_text(table_array[i, col_idx])
                            break

                    if not top_heading_cell:
@ -467,7 +479,7 @@ class TableTest(BasePDFTest):
                    left_heading_cell = ""
                    for j in range(col_idx):
                        if table_array[row_idx, j].strip():
-                            left_heading_cell = table_array[row_idx, j]
+                            left_heading_cell = normalize_text(table_array[row_idx, j])
                            break

                    if not left_heading_cell: