Fix text normalization during table cell comparison

This commit is contained in:
Jake Poznanski 2025-03-19 18:29:42 +00:00
parent 17979118ba
commit a4605e4efc
2 changed files with 35 additions and 8 deletions

View File

@ -1,6 +1,5 @@
import unittest
from olmocr.bench.tests import (
BaselineTest,
BasePDFTest,
@ -546,6 +545,22 @@ Some text before the table.
result, explanation = test.run(valid_table_eof)
self.assertTrue(result, f"Valid table at EOF without newline not detected: {explanation}")
def test_normalizing(self):
    """Check that a cell is matched when the expected heading needs normalizing.

    The expected top heading is written with a newline between "Business"
    and "Sample", while the header cell in the markdown table uses a plain
    space. The test passes only if ``TableTest.run`` still reports a match
    for the "6%" cell under that heading.
    """
    markdown_table = """| Question - Satisfaction on scale of 10 | Response | Resident Sample | Business Sample |
|----------------------------------------|----------|----------------|-----------------|
| Planning for and managing residential, commercial and industrial development | Rating of 8, 9 or 10 | 13% | 11% |
| | Average rating | 6.4 | 5.7 |
| | Dont know responses | 11% | 6% |
| Environmental protection, support for green projects (e.g. green grants, building retrofits programs, zero waste) | Rating of 8, 9 or 10 | 35% | 34% |
| | Average rating | 8.0 | 7.5 |
| | Dont know responses | 8% | 6% |
| Providing and maintaining parks and green spaces | Rating of 8, 9 or 10 | 42% | 41% |
| | Average rating | 7.7 | 7.3 |
| | Dont know responses | 1% | 1% |"""

    # The heading deliberately differs from the table's header cell only in
    # its whitespace (newline vs. space).
    table_test = TableTest(
        pdf="test.pdf",
        page=1,
        id="test_id",
        type=TestType.TABLE.value,
        cell="6%",
        top_heading="Business\nSample",
    )

    passed, reason = table_test.run(markdown_table)

    # Surface the explanation from run() as the failure message.
    self.assertTrue(passed, reason)
class TestBaselineTest(unittest.TestCase):
"""Test the BaselineTest class"""

View File

@ -114,6 +114,7 @@ class TextPresenceTest(BasePDFTest):
super().__post_init__()
if self.type not in {TestType.PRESENT.value, TestType.ABSENT.value}:
raise ValidationError(f"Invalid type for TextPresenceTest: {self.type}")
self.text = normalize_text(self.text)
if not self.text.strip():
raise ValidationError("Text field cannot be empty")
@ -169,6 +170,8 @@ class TextOrderTest(BasePDFTest):
super().__post_init__()
if self.type != TestType.ORDER.value:
raise ValidationError(f"Invalid type for TextOrderTest: {self.type}")
self.before = normalize_text(self.before)
self.after = normalize_text(self.after)
if not self.before.strip():
raise ValidationError("Before field cannot be empty")
if not self.after.strip():
@ -216,6 +219,15 @@ class TableTest(BasePDFTest):
if self.type != TestType.TABLE.value:
raise ValidationError(f"Invalid type for TableTest: {self.type}")
# Normalize the search text too
self.cell = normalize_text(self.cell)
self.up = normalize_text(self.up)
self.down = normalize_text(self.down)
self.left = normalize_text(self.left)
self.right = normalize_text(self.right)
self.top_heading = normalize_text(self.top_heading)
self.left_heading = normalize_text(self.left_heading)
def parse_markdown_tables(self, md_content: str) -> List[np.ndarray]:
"""
Extract and parse all markdown tables from the provided content.
@ -390,7 +402,7 @@ class TableTest(BasePDFTest):
matches = []
for i in range(table_array.shape[0]):
for j in range(table_array.shape[1]):
cell_content = table_array[i, j]
cell_content = normalize_text(table_array[i, j])
similarity = fuzz.ratio(self.cell, cell_content) / 100.0
if similarity >= threshold:
@ -407,7 +419,7 @@ class TableTest(BasePDFTest):
# Check up relationship
if self.up and row_idx > 0:
up_cell = table_array[row_idx - 1, col_idx]
up_cell = normalize_text(table_array[row_idx - 1, col_idx])
up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
if up_similarity < threshold:
all_relationships_satisfied = False
@ -415,7 +427,7 @@ class TableTest(BasePDFTest):
# Check down relationship
if self.down and row_idx < table_array.shape[0] - 1:
down_cell = table_array[row_idx + 1, col_idx]
down_cell = normalize_text(table_array[row_idx + 1, col_idx])
down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
if down_similarity < threshold:
all_relationships_satisfied = False
@ -423,7 +435,7 @@ class TableTest(BasePDFTest):
# Check left relationship
if self.left and col_idx > 0:
left_cell = table_array[row_idx, col_idx - 1]
left_cell = normalize_text(table_array[row_idx, col_idx - 1])
left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
if left_similarity < threshold:
all_relationships_satisfied = False
@ -433,7 +445,7 @@ class TableTest(BasePDFTest):
# Check right relationship
if self.right and col_idx < table_array.shape[1] - 1:
right_cell = table_array[row_idx, col_idx + 1]
right_cell = normalize_text(table_array[row_idx, col_idx + 1])
right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
if right_similarity < threshold:
all_relationships_satisfied = False
@ -447,7 +459,7 @@ class TableTest(BasePDFTest):
top_heading_cell = ""
for i in range(row_idx):
if table_array[i, col_idx].strip():
top_heading_cell = table_array[i, col_idx]
top_heading_cell = normalize_text(table_array[i, col_idx])
break
if not top_heading_cell:
@ -467,7 +479,7 @@ class TableTest(BasePDFTest):
left_heading_cell = ""
for j in range(col_idx):
if table_array[row_idx, j].strip():
left_heading_cell = table_array[row_idx, j]
left_heading_cell = normalize_text(table_array[row_idx, j])
break
if not left_heading_cell: