From 3005ebd67d1cee1d1af2f87bff6cb14a9d114b70 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Wed, 19 Mar 2025 18:46:07 +0000
Subject: [PATCH] Normalization

---
 olmocr/bench/test_tests.py | 11 +++++++++++
 olmocr/bench/tests.py      |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/olmocr/bench/test_tests.py b/olmocr/bench/test_tests.py
index 2266e7a..89993ae 100644
--- a/olmocr/bench/test_tests.py
+++ b/olmocr/bench/test_tests.py
@@ -561,6 +561,17 @@ Some text before the table.
         result, explanation = test.run(table)
         self.assertTrue(result, explanation)
 
+    def test_mathematical_minus(self):
+        table = """| Response | Chinese experimenter | White experimenter |
+|----------|----------------------|--------------------|
+| | Divided attention | Full attention | Divided attention | Full attention |
+| Nonverbal| −.34 (.22) | .54* (.17) | .12 (.27) | −.20 (.24) |
+| Verbal | −.25 (.23) | .36 (.20) | .12 (.27) | −.34 (.22) |
+"""
+        test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="-.34 (.22)")
+        result, explanation = test.run(table)
+        self.assertTrue(result, explanation)
+
 
 class TestBaselineTest(unittest.TestCase):
     """Test the BaselineTest class"""
diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py
index f568bef..7aa5892 100644
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@@ -45,7 +45,7 @@ def normalize_text(md_content: str) -> str:
     md_content = re.sub(r"\s+", " ", md_content)
 
     # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
-    replacements = {"‘": "'", "’": "'", "‚": "'", "“": '"', "”": '"', "„": '"', "_": "_", "–": "-", "—": "-", "‑": "-", "‒": "-", "\u00b5": "\u03bc"}
+    replacements = {"‘": "'", "’": "'", "‚": "'", "“": '"', "”": '"', "„": '"', "_": "_", "–": "-", "—": "-", "‑": "-", "‒": "-", "−": "-", "\u00b5": "\u03bc"}
 
     # Apply all replacements from the dictionary
     for fancy_char, ascii_char in replacements.items():