diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index 432f77a..5c2a73c 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -352,6 +352,10 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb pdf_filename = f"{pdf_id}_page{page_num}.pdf" soup = BeautifulSoup(html_content, "html.parser") + # Remove any divs or spans with class "line-number" + for element in soup.find_all(["div", "span"], class_="line-number"): + element.extract() + # Rewrite any page-header and page-footer divs to be normalized to headers # Convert div.page-footer to footer in one line for div in soup.find_all("div", class_="page-header"): diff --git a/olmocr/bench/synth/test_mine.py b/olmocr/bench/synth/test_mine.py index 1b30d77..925a095 100644 --- a/olmocr/bench/synth/test_mine.py +++ b/olmocr/bench/synth/test_mine.py @@ -851,8 +851,31 @@ class TestMineTests(unittest.TestCase): tests = generate_tests_from_html(html_content, "0", 1) + superscript_map = { + "0": "⁰", + "1": "¹", + "2": "²", + "3": "³", + "4": "⁴", + "5": "⁵", + "6": "⁶", + "7": "⁷", + "8": "⁸", + "9": "⁹", + "+": "⁺", + "-": "⁻", + "=": "⁼", + "(": "⁽", + ")": "⁾", + "n": "ⁿ", + "i": "ⁱ", + } + for test in tests: - print(test) + for sup in superscript_map.values(): + self.assertTrue(sup not in test.get("text", "")) + self.assertTrue(sup not in test.get("before", "")) + self.assertTrue(sup not in test.get("after", "")) def test_katex_autorender(self): """Test that KaTeX math expressions are properly auto-rendered when using the render_pdf_with_playwright function.""" @@ -906,3 +929,97 @@ class TestMineTests(unittest.TestCase): # but at minimum we can verify the file was created successfully print(output_pdf_path) + + def test_line_numbers(self): + html_content = """ + + + + + + House Amendment Bill No. CS/CS/SB 7030 + + +
+
HOUSE AMENDMENT
+
Bill No. CS/CS/SB 7030, 1st Eng. (2019)
+
+ +
Amendment No.
+ +
+
+
Senate
+
House
+
+
.
+
+ +
+ +
+ +
+
+ 1 +
Representative Jenne offered the following:
+
+
+ 2 +
+
+
+ 3 +
Amendment
+
+
+ 4 +
Remove lines 274-280 and insert:
+
+
+ 5 +
c.3. Pass an initial a psychological evaluation, and
+
+
+ 6 +
subsequent yearly psychological evaluations before each school
+
+
+ 7 +
year, administered by a psychologist licensed under chapter 490
+
+
+ 8 +
and designated by the Department of Law Enforcement and submit
+
+
+ 9 +
the results of such evaluations the evaluation to the sheriff's
+
+
+ 10 +
office. The Department of Law Enforcement is authorized to
+
+
+ 11 +
provide the sheriff's office with mental health and substance
+
+
+ 12 +
abuse data for compliance with this paragraph.
+
+
+ + + +""" + + tests = generate_tests_from_html(html_content, "0", 1) + + for test in tests: + if test["type"] == "order": + self.assertTrue(len([c for c in test["before"] if not c.isdigit()]) > 0)