Test that mining does not include line numbers

This commit is contained in:
Jake Poznanski 2025-04-02 23:07:32 +00:00
parent 2614fc9050
commit e856e9de1d
2 changed files with 122 additions and 1 deletions

View File

@ -352,6 +352,10 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
pdf_filename = f"{pdf_id}_page{page_num}.pdf"
soup = BeautifulSoup(html_content, "html.parser")
# Remove any divs or spans with class "line-number"
for element in soup.find_all(["div", "span"], class_="line-number"):
element.extract()
# Rewrite any page-header and page-footer divs to be normalized to headers
# Convert div.page-footer to footer in one line
for div in soup.find_all("div", class_="page-header"):

View File

@ -851,8 +851,31 @@ class TestMineTests(unittest.TestCase):
tests = generate_tests_from_html(html_content, "0", 1)
superscript_map = {
"0": "",
"1": "¹",
"2": "²",
"3": "³",
"4": "",
"5": "",
"6": "",
"7": "",
"8": "",
"9": "",
"+": "",
"-": "",
"=": "",
"(": "",
")": "",
"n": "",
"i": "",
}
for test in tests:
print(test)
for sup in superscript_map.values():
self.assertTrue(sup not in test.get("text", ""))
self.assertTrue(sup not in test.get("before", ""))
self.assertTrue(sup not in test.get("after", ""))
def test_katex_autorender(self):
"""Test that KaTeX math expressions are properly auto-rendered when using the render_pdf_with_playwright function."""
@ -906,3 +929,97 @@ class TestMineTests(unittest.TestCase):
# but at minimum we can verify the file was created successfully
print(output_pdf_path)
def test_line_numbers(self):
    """Check that mined "order" tests are not anchored on bare line numbers.

    The fixture is a legislative amendment page in which every line of
    content is preceded by a ``<span class="line-number">`` holding the
    line's number. After running ``generate_tests_from_html`` on it, every
    test of type ``"order"`` must have a ``"before"`` value containing at
    least one non-digit character.
    """
    bill_html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>House Amendment Bill No. CS/CS/SB 7030</title>
</head>
<body>
<header class="document-header">
<div class="bill-title">HOUSE AMENDMENT</div>
<div>Bill No. CS/CS/SB 7030, 1st Eng. (2019)</div>
</header>
<div class="amendment-label">Amendment No.</div>
<div class="chamber-action">
<div class="chamber-columns">
<div class="senate-column">Senate</div>
<div class="house-column">House</div>
</div>
<div style="text-align: center;">.</div>
</div>
<div class="horizontal-line"></div>
<div class="horizontal-line"></div>
<div class="amendment-content">
<div>
<span class="line-number">1</span>
<div class="line-content">Representative Jenne offered the following:</div>
</div>
<div>
<span class="line-number">2</span>
<div class="line-content"></div>
</div>
<div>
<span class="line-number">3</span>
<div class="line-content"><strong>Amendment</strong></div>
</div>
<div>
<span class="line-number">4</span>
<div class="line-content">Remove lines 274-280 and insert:</div>
</div>
<div>
<span class="line-number">5</span>
<div class="line-content">c.3. Pass <span class="underline">an initial</span> a psychological evaluation, and</div>
</div>
<div>
<span class="line-number">6</span>
<div class="line-content"><span class="underline">subsequent yearly psychological evaluations before each school</span></div>
</div>
<div>
<span class="line-number">7</span>
<div class="line-content"><span class="underline">year, administered by a psychologist licensed under chapter 490</span></div>
</div>
<div>
<span class="line-number">8</span>
<div class="line-content">and designated by the Department of Law Enforcement and submit</div>
</div>
<div>
<span class="line-number">9</span>
<div class="line-content">the results of <span class="underline">such evaluations</span> <span class="strikethrough">the evaluation</span> to the sheriff's</div>
</div>
<div>
<span class="line-number">10</span>
<div class="line-content">office. The Department of Law Enforcement is authorized to</div>
</div>
<div>
<span class="line-number">11</span>
<div class="line-content">provide the sheriff's office with mental health and substance</div>
</div>
<div>
<span class="line-number">12</span>
<div class="line-content">abuse data for compliance with this paragraph.</div>
</div>
</div>
<footer class="document-footer">
<div>588513</div>
<div>Approved For Filing: 4/23/2019 6:09:18 PM</div>
<div>Page 1 of 1</div>
</footer>
</body>
</html>"""

    mined_cases = generate_tests_from_html(bill_html, "0", 1)

    for case in mined_cases:
        # Only ordering tests are inspected; other test types are ignored.
        if case["type"] != "order":
            continue

        # A "before" made up solely of digits would be a bare line number.
        has_non_digit = any(not ch.isdigit() for ch in case["before"])
        self.assertTrue(has_non_digit)