diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index 432f77a..5c2a73c 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -352,6 +352,10 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb pdf_filename = f"{pdf_id}_page{page_num}.pdf" soup = BeautifulSoup(html_content, "html.parser") + # Remove any divs or spans with class "line-number" + for element in soup.find_all(["div", "span"], class_="line-number"): + element.extract() + # Rewrite any page-header and page-footer divs to be normalized to headers # Convert div.page-footer to footer in one line for div in soup.find_all("div", class_="page-header"): diff --git a/olmocr/bench/synth/test_mine.py b/olmocr/bench/synth/test_mine.py index 1b30d77..925a095 100644 --- a/olmocr/bench/synth/test_mine.py +++ b/olmocr/bench/synth/test_mine.py @@ -851,8 +851,31 @@ class TestMineTests(unittest.TestCase): tests = generate_tests_from_html(html_content, "0", 1) + superscript_map = { + "0": "⁰", + "1": "¹", + "2": "²", + "3": "³", + "4": "⁴", + "5": "⁵", + "6": "⁶", + "7": "⁷", + "8": "⁸", + "9": "⁹", + "+": "⁺", + "-": "⁻", + "=": "⁼", + "(": "⁽", + ")": "⁾", + "n": "ⁿ", + "i": "ⁱ", + } + for test in tests: - print(test) + for sup in superscript_map.values(): + self.assertTrue(sup not in test.get("text", "")) + self.assertTrue(sup not in test.get("before", "")) + self.assertTrue(sup not in test.get("after", "")) def test_katex_autorender(self): """Test that KaTeX math expressions are properly auto-rendered when using the render_pdf_with_playwright function.""" @@ -906,3 +929,97 @@ class TestMineTests(unittest.TestCase): # but at minimum we can verify the file was created successfully print(output_pdf_path) + + def test_line_numbers(self): + html_content = """ + + +
+ + +