mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 16:22:29 +00:00
Normalization
This commit is contained in:
parent
8ec1ebe5ed
commit
3005ebd67d
@ -561,6 +561,17 @@ Some text before the table.
|
||||
result, explanation = test.run(table)
|
||||
self.assertTrue(result, explanation)
|
||||
|
||||
def test_mathematical_minus(self):
|
||||
table = """| Response | Chinese experimenter | White experimenter |
|
||||
|----------|----------------------|--------------------|
|
||||
| | Divided attention | Full attention | Divided attention | Full attention |
|
||||
| Nonverbal| −.34 (.22) | .54* (.17) | .12 (.27) | −.20 (.24) |
|
||||
| Verbal | −.25 (.23) | .36 (.20) | .12 (.27) | −.34 (.22) |
|
||||
"""
|
||||
test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="-.34 (.22)")
|
||||
result, explanation = test.run(table)
|
||||
self.assertTrue(result, explanation)
|
||||
|
||||
|
||||
class TestBaselineTest(unittest.TestCase):
|
||||
"""Test the BaselineTest class"""
|
||||
|
@ -45,7 +45,7 @@ def normalize_text(md_content: str) -> str:
|
||||
md_content = re.sub(r"\s+", " ", md_content)
|
||||
|
||||
# Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
|
||||
replacements = {"‘": "'", "’": "'", "‚": "'", "“": '"', "”": '"', "„": '"', "_": "_", "–": "-", "—": "-", "‑": "-", "‒": "-", "\u00b5": "\u03bc"}
|
||||
replacements = {"‘": "'", "’": "'", "‚": "'", "“": '"', "”": '"', "„": '"', "_": "_", "–": "-", "—": "-", "‑": "-", "‒": "-", "−": "-", "\u00b5": "\u03bc"}
|
||||
|
||||
# Apply all replacements from the dictionary
|
||||
for fancy_char, ascii_char in replacements.items():
|
||||
|
Loading…
x
Reference in New Issue
Block a user