mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-13 01:02:26 +00:00
Better filtering of tests
This commit is contained in:
parent
6d3a7d634e
commit
360b1be07c
@ -453,8 +453,11 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
||||
if right_text and "\n" not in right_text:
|
||||
test_data["right"] = right_text
|
||||
|
||||
# Check for top heading using header information
|
||||
if col_idx in table_data.col_headers:
|
||||
# Check if current cell is a heading cell
|
||||
is_header_cell = row_idx in table_data.header_rows or col_idx in table_data.header_cols
|
||||
|
||||
# Check for top heading using header information (skip if current cell is a heading)
|
||||
if not is_header_cell and col_idx in table_data.col_headers:
|
||||
# Get the headers for this column
|
||||
col_headers = table_data.col_headers[col_idx]
|
||||
if col_headers:
|
||||
@ -463,8 +466,8 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
||||
if top_heading and "\n" not in top_heading:
|
||||
test_data["top_heading"] = top_heading
|
||||
|
||||
# Check for left heading using header information
|
||||
if row_idx in table_data.row_headers:
|
||||
# Check for left heading using header information (skip if current cell is a heading)
|
||||
if not is_header_cell and row_idx in table_data.row_headers:
|
||||
# Get the headers for this row
|
||||
row_headers = table_data.row_headers[row_idx]
|
||||
if row_headers:
|
||||
@ -575,10 +578,56 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
||||
# If they do, filter them out
|
||||
tests = [t for t in tests if t["type"] != "absent" or t["text"] not in full_text]
|
||||
|
||||
# Remove any tests where the text has no alpha numeric characters
|
||||
tests = [t for t in tests if "text" not in t or len([c for c in t["text"] if c.isalnum()])]
|
||||
|
||||
return tests
|
||||
# Remove any tests where text-based fields have no alphanumeric characters or contain LaTeX
|
||||
text_fields = ["text", "cell", "before", "after", "up", "down", "left", "right", "top_heading", "left_heading"]
|
||||
|
||||
def contains_alphanumeric(value):
    """Return True when *value* is a string with at least one alphanumeric character.

    Non-string values (``None``, numbers, containers, ...) are reported as
    False, so a filter built on this predicate rejects them as well.
    """
    if not isinstance(value, str):
        return False
    return any(c.isalnum() for c in value)
|
||||
|
||||
def contains_latex(value):
    r"""Return True when *value* is a string containing a LaTeX math delimiter.

    Only the backslash forms ``\(``, ``\)``, ``\[`` and ``\]`` are recognised;
    plain parentheses or brackets do not count. Non-string values yield False.
    """
    if not isinstance(value, str):
        return False
    # Plain substring matching: each raw string is the literal two-character
    # sequence (backslash + bracket), not a regular expression.
    latex_delimiters = (r"\(", r"\)", r"\[", r"\]")
    return any(delimiter in value for delimiter in latex_delimiters)
|
||||
|
||||
filtered_tests = []
|
||||
for test in tests:
|
||||
# Check all text fields in the test for alphanumeric content and LaTeX
|
||||
all_valid = True
|
||||
for field in text_fields:
|
||||
if field in test:
|
||||
# Skip test if field has no alphanumeric characters
|
||||
if not contains_alphanumeric(test[field]):
|
||||
all_valid = False
|
||||
break
|
||||
# Skip test if field contains LaTeX delimiters
|
||||
if contains_latex(test[field]):
|
||||
all_valid = False
|
||||
break
|
||||
if all_valid:
|
||||
filtered_tests.append(test)
|
||||
|
||||
tests = filtered_tests
|
||||
|
||||
# Remove duplicate tests (identical on everything but the id field)
|
||||
unique_tests = []
|
||||
test_signatures = set()
|
||||
|
||||
for test in tests:
|
||||
# Create a signature for the test by using all fields except 'id'
|
||||
test_dict = test.copy()
|
||||
test_id = test_dict.pop('id')
|
||||
|
||||
# Convert dict to a sorted tuple of items for hashability
|
||||
test_signature = tuple(sorted((k, str(v)) for k, v in test_dict.items()))
|
||||
|
||||
# Only add the test if we haven't seen an identical one
|
||||
if test_signature not in test_signatures:
|
||||
test_signatures.add(test_signature)
|
||||
unique_tests.append(test)
|
||||
|
||||
return unique_tests
|
||||
|
||||
|
||||
def process_pdf(pdf_info, args, client):
|
||||
|
@ -757,5 +757,99 @@ class TestMineTests(unittest.TestCase):
|
||||
|
||||
tests = generate_tests_from_html(html_content, "0", 1)
|
||||
|
||||
self.assertTrue(len(tests) > 10)
|
||||
|
||||
def test_sup(self):
|
||||
html_content = """
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>A ROSE BY ANY OTHER NAME</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Georgia, serif;
|
||||
line-height: 1.5;
|
||||
margin: 0 auto;
|
||||
max-width: 666px;
|
||||
padding: 20px;
|
||||
}
|
||||
header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
.page-number-left {
|
||||
text-align: left;
|
||||
}
|
||||
.title {
|
||||
text-align: center;
|
||||
font-weight: bold;
|
||||
flex-grow: 1;
|
||||
}
|
||||
.page-number-right {
|
||||
text-align: right;
|
||||
}
|
||||
.section-heading {
|
||||
text-align: center;
|
||||
margin: 20px 0;
|
||||
}
|
||||
p {
|
||||
text-indent: 2em;
|
||||
margin: 0 0 10px 0;
|
||||
}
|
||||
.footnotes {
|
||||
margin-top: 30px;
|
||||
border-top: 1px solid #ccc;
|
||||
padding-top: 10px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.footnote {
|
||||
text-indent: -1.5em;
|
||||
padding-left: 1.5em;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
.italic {
|
||||
font-style: italic;
|
||||
}
|
||||
sup {
|
||||
font-size: 0.7em;
|
||||
vertical-align: super;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<div class="page-number-left">2016]</div>
|
||||
<div class="title">A ROSE BY ANY OTHER NAME</div>
|
||||
<div class="page-number-right">1083</div>
|
||||
</header>
|
||||
|
||||
<main>
|
||||
<p>cases were decided within a year of each other (2000 and 2001, respectively). <span class="italic">Save the Manatee Club</span> largely consists of a truncated version of the <span class="italic">Consolidated-Tomoka</span> analysis, with minor adjustments to conform the opinion to the 1999 amendments. <span class="italic">Day Cruise</span>, on the other hand, closely analyzes the 1999 version of section 120.52(8). However, it is <span class="italic">Save the Manatee Club</span> that has come to dominate Florida court opinions on rulemaking challenges and not the more detailed <span class="italic">Day Cruise</span> analysis.<sup>78</sup> The following Sections will discuss the facts of the two cases, examine the differences between their analyses of section 120.52(8), and finally conclude with an opinion on which analysis is better to apply in section 120.52(8) rulemaking challenges.</p>
|
||||
|
||||
<h2 class="section-heading">A. Southwest Florida Water Management District v. Save the Manatee Club, Inc.</h2>
|
||||
|
||||
<p>After the legislature amended the APA, the First DCA analyzed the statutory language of section 120.52(8) again in <span class="italic">Southwest Florida Water Management District v. Save the Manatee Club, Inc.</span><sup>79</sup> <span class="italic">Save the Manatee Club</span> concerned the Southwest Florida Water Management District's (the "District's") authority to create exemptions to environmental resource permitting requirements.<sup>80</sup> South Shores Partners, Ltd. ("South Shores") applied "for a permit to develop a 720-acre tract of land in Southwest Hillsborough County."<sup>81</sup> As part of the development project, South Shores wanted "to build a connecting waterway between the [existing] canal system [on the property] and the [Tampa] Bay."<sup>82</sup> The Save the Manatee Club believed that the resulting increase in power boat traffic in this new waterway would "endanger the manatee and its habitat."<sup>83</sup></p>
|
||||
|
||||
<p>The District has the authority to grant either a general permit or an environmental resource permit to a development project, depending on the type of project involved.<sup>84</sup> When granting an environmental resource permit, the District must consider "[t]he impact a proposed development will have on wildlife" as a factor; it does not have to do so when it grants a general permit.<sup>85</sup> The District granted South</p>
|
||||
</main>
|
||||
|
||||
<footer class="footnotes">
|
||||
<div class="footnote">78. As of December 14, 2015, a search of the "Citing References" on WestLaw shows that <span class="italic">Save the Manatee Club</span> has been cited by forty court opinions. <span class="italic">Day Cruise</span>, by comparison, has been cited by fifteen court opinions. These numbers do not include citations to either case in DOAH decisions.</div>
|
||||
<div class="footnote">79. 773 So. 2d 594 (Fla. 1st DCA 2000).</div>
|
||||
<div class="footnote">80. <span class="italic">Id.</span> at 596.</div>
|
||||
<div class="footnote">81. <span class="italic">Id.</span></div>
|
||||
<div class="footnote">82. <span class="italic">Id.</span></div>
|
||||
<div class="footnote">83. <span class="italic">Id.</span></div>
|
||||
<div class="footnote">84. <span class="italic">See id.</span></div>
|
||||
<div class="footnote">85. <span class="italic">Id.</span></div>
|
||||
</footer>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
tests = generate_tests_from_html(html_content, "0", 1)
|
||||
|
||||
for test in tests:
|
||||
print(test)
|
||||
|
Loading…
x
Reference in New Issue
Block a user