mirror of
https://github.com/allenai/olmocr.git
synced 2025-07-30 12:32:16 +00:00
Test mining not including line numbers
This commit is contained in:
parent
2614fc9050
commit
e856e9de1d
@ -352,6 +352,10 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
||||
pdf_filename = f"{pdf_id}_page{page_num}.pdf"
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
|
||||
# Remove any divs or spans with class "line-number"
|
||||
for element in soup.find_all(["div", "span"], class_="line-number"):
|
||||
element.extract()
|
||||
|
||||
# Rewrite any page-header and page-footer divs to be normalized to headers
|
||||
# Convert div.page-footer to footer in one line
|
||||
for div in soup.find_all("div", class_="page-header"):
|
||||
|
@ -851,8 +851,31 @@ class TestMineTests(unittest.TestCase):
|
||||
|
||||
tests = generate_tests_from_html(html_content, "0", 1)
|
||||
|
||||
superscript_map = {
|
||||
"0": "⁰",
|
||||
"1": "¹",
|
||||
"2": "²",
|
||||
"3": "³",
|
||||
"4": "⁴",
|
||||
"5": "⁵",
|
||||
"6": "⁶",
|
||||
"7": "⁷",
|
||||
"8": "⁸",
|
||||
"9": "⁹",
|
||||
"+": "⁺",
|
||||
"-": "⁻",
|
||||
"=": "⁼",
|
||||
"(": "⁽",
|
||||
")": "⁾",
|
||||
"n": "ⁿ",
|
||||
"i": "ⁱ",
|
||||
}
|
||||
|
||||
for test in tests:
|
||||
print(test)
|
||||
for sup in superscript_map.values():
|
||||
self.assertTrue(sup not in test.get("text", ""))
|
||||
self.assertTrue(sup not in test.get("before", ""))
|
||||
self.assertTrue(sup not in test.get("after", ""))
|
||||
|
||||
def test_katex_autorender(self):
|
||||
"""Test that KaTeX math expressions are properly auto-rendered when using the render_pdf_with_playwright function."""
|
||||
@ -906,3 +929,97 @@ class TestMineTests(unittest.TestCase):
|
||||
# but at minimum we can verify the file was created successfully
|
||||
|
||||
print(output_pdf_path)
|
||||
|
||||
def test_line_numbers(self):
    """Check that mined "order" tests never use a bare line number as their "before" text.

    The fixture is a legislative amendment page in which every content line
    is preceded by a ``<span class="line-number">`` element. Each generated
    test of type "order" must have at least one non-digit character in its
    "before" field.
    """
    html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>House Amendment Bill No. CS/CS/SB 7030</title>
</head>
<body>
<header class="document-header">
<div class="bill-title">HOUSE AMENDMENT</div>
<div>Bill No. CS/CS/SB 7030, 1st Eng. (2019)</div>
</header>

<div class="amendment-label">Amendment No.</div>

<div class="chamber-action">
<div class="chamber-columns">
<div class="senate-column">Senate</div>
<div class="house-column">House</div>
</div>
<div style="text-align: center;">.</div>
</div>

<div class="horizontal-line"></div>

<div class="horizontal-line"></div>

<div class="amendment-content">
<div>
<span class="line-number">1</span>
<div class="line-content">Representative Jenne offered the following:</div>
</div>
<div>
<span class="line-number">2</span>
<div class="line-content"></div>
</div>
<div>
<span class="line-number">3</span>
<div class="line-content"><strong>Amendment</strong></div>
</div>
<div>
<span class="line-number">4</span>
<div class="line-content">Remove lines 274-280 and insert:</div>
</div>
<div>
<span class="line-number">5</span>
<div class="line-content">c.3. Pass <span class="underline">an initial</span> a psychological evaluation, and</div>
</div>
<div>
<span class="line-number">6</span>
<div class="line-content"><span class="underline">subsequent yearly psychological evaluations before each school</span></div>
</div>
<div>
<span class="line-number">7</span>
<div class="line-content"><span class="underline">year, administered by a psychologist licensed under chapter 490</span></div>
</div>
<div>
<span class="line-number">8</span>
<div class="line-content">and designated by the Department of Law Enforcement and submit</div>
</div>
<div>
<span class="line-number">9</span>
<div class="line-content">the results of <span class="underline">such evaluations</span> <span class="strikethrough">the evaluation</span> to the sheriff's</div>
</div>
<div>
<span class="line-number">10</span>
<div class="line-content">office. The Department of Law Enforcement is authorized to</div>
</div>
<div>
<span class="line-number">11</span>
<div class="line-content">provide the sheriff's office with mental health and substance</div>
</div>
<div>
<span class="line-number">12</span>
<div class="line-content">abuse data for compliance with this paragraph.</div>
</div>
</div>

<footer class="document-footer">
<div>588513</div>
<div>Approved For Filing: 4/23/2019 6:09:18 PM</div>
<div>Page 1 of 1</div>
</footer>
</body>
</html>"""

    generated = generate_tests_from_html(html_content, "0", 1)

    for case in generated:
        # Only ordering tests are inspected here; other test types are ignored.
        if case["type"] != "order":
            continue
        # A "before" string made up solely of digits would be a bare line number.
        self.assertTrue(any(not ch.isdigit() for ch in case["before"]))
|
||||
|
Loading…
x
Reference in New Issue
Block a user