From 3005ebd67d1cee1d1af2f87bff6cb14a9d114b70 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Wed, 19 Mar 2025 18:46:07 +0000
Subject: [PATCH] Normalize mathematical minus sign to ASCII hyphen in normalize_text

---
 olmocr/bench/test_tests.py | 11 +++++++++++
 olmocr/bench/tests.py      |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/olmocr/bench/test_tests.py b/olmocr/bench/test_tests.py
index 2266e7a..89993ae 100644
--- a/olmocr/bench/test_tests.py
+++ b/olmocr/bench/test_tests.py
@@ -561,6 +561,17 @@ Some text before the table.
         result, explanation = test.run(table)
         self.assertTrue(result, explanation)
 
+    def test_mathematical_minus(self):  # table cells use the typographic minus sign; the expected cell below uses ASCII "-"
+        table = """| Response | Chinese experimenter | White experimenter |
+|----------|----------------------|--------------------|
+|          | Divided attention    | Full attention     | Divided attention | Full attention |
+| Nonverbal| −.34 (.22)           | .54* (.17)         | .12 (.27)         | −.20 (.24)     |
+| Verbal   | −.25 (.23)           | .36 (.20)          | .12 (.27)         | −.34 (.22)     |
+"""
+        test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="-.34 (.22)")
+        result, explanation = test.run(table)
+        self.assertTrue(result, explanation)
+
 
 class TestBaselineTest(unittest.TestCase):
     """Test the BaselineTest class"""
diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py
index f568bef..7aa5892 100644
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@@ -45,7 +45,7 @@ def normalize_text(md_content: str) -> str:
     md_content = re.sub(r"\s+", " ", md_content)
 
     # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
-    replacements = {"‘": "'", "’": "'", "‚": "'", "“": '"', "”": '"', "„": '"', "＿": "_", "–": "-", "—": "-", "‑": "-", "‒": "-", "\u00b5": "\u03bc"}
+    replacements = {"‘": "'", "’": "'", "‚": "'", "“": '"', "”": '"', "„": '"', "＿": "_", "–": "-", "—": "-", "‑": "-", "‒": "-", "−": "-", "\u00b5": "\u03bc"}
 
     # Apply all replacements from the dictionary
     for fancy_char, ascii_char in replacements.items():