mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-12 08:43:32 +00:00
Add a --force flag to the review app; fix the max_diffs (difference) comparison in table tests
This commit is contained in:
parent
e856e9de1d
commit
d4d87f7c65
@ -18,7 +18,7 @@ DATASET_FILE = None
|
|||||||
CURRENT_PDF = None
|
CURRENT_PDF = None
|
||||||
PDF_TESTS = {}
|
PDF_TESTS = {}
|
||||||
ALL_PDFS = []
|
ALL_PDFS = []
|
||||||
|
FORCE = False # When True (set by --force), never show the "All done" page; fall back to the first PDF instead
|
||||||
|
|
||||||
def find_next_unchecked_pdf() -> Optional[str]:
|
def find_next_unchecked_pdf() -> Optional[str]:
|
||||||
"""Find the next PDF with at least one unchecked test."""
|
"""Find the next PDF with at least one unchecked test."""
|
||||||
@ -31,7 +31,6 @@ def find_next_unchecked_pdf() -> Optional[str]:
|
|||||||
return pdf_name
|
return pdf_name
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def calculate_stats() -> dict:
|
def calculate_stats() -> dict:
|
||||||
"""Calculate statistics for all tests in the dataset."""
|
"""Calculate statistics for all tests in the dataset."""
|
||||||
global PDF_TESTS
|
global PDF_TESTS
|
||||||
@ -59,7 +58,6 @@ def calculate_stats() -> dict:
|
|||||||
|
|
||||||
return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
|
return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
|
||||||
|
|
||||||
|
|
||||||
def save_dataset(jsonl_file: str) -> None:
|
def save_dataset(jsonl_file: str) -> None:
|
||||||
"""Save the tests to a JSONL file, using temp file for atomic write."""
|
"""Save the tests to a JSONL file, using temp file for atomic write."""
|
||||||
global PDF_TESTS
|
global PDF_TESTS
|
||||||
@ -77,26 +75,27 @@ def save_dataset(jsonl_file: str) -> None:
|
|||||||
# Atomic replace
|
# Atomic replace
|
||||||
shutil.move(temp_file.name, jsonl_file)
|
shutil.move(temp_file.name, jsonl_file)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/pdf/<path:pdf_name>")
|
@app.route("/pdf/<path:pdf_name>")
|
||||||
def serve_pdf(pdf_name):
|
def serve_pdf(pdf_name):
|
||||||
"""Serve the PDF file directly."""
|
"""Serve the PDF file directly."""
|
||||||
pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
|
pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
|
||||||
return send_file(pdf_path, mimetype="application/pdf")
|
return send_file(pdf_path, mimetype="application/pdf")
|
||||||
|
|
||||||
|
|
||||||
@app.route("/")
|
@app.route("/")
|
||||||
def index():
|
def index():
|
||||||
"""Main page displaying the current PDF and its tests."""
|
"""Main page displaying the current PDF and its tests."""
|
||||||
global CURRENT_PDF, PDF_TESTS, DATASET_DIR
|
global CURRENT_PDF, PDF_TESTS, DATASET_DIR, ALL_PDFS, FORCE
|
||||||
|
|
||||||
# If no current PDF is set, find the next one with unchecked tests
|
# If no current PDF is set, find the next one with unchecked tests
|
||||||
if CURRENT_PDF is None:
|
if CURRENT_PDF is None:
|
||||||
CURRENT_PDF = find_next_unchecked_pdf()
|
CURRENT_PDF = find_next_unchecked_pdf()
|
||||||
|
|
||||||
# If still no PDF, all tests have been checked
|
# If still no PDF, either show the "All done" page or force display the first PDF
|
||||||
if CURRENT_PDF is None:
|
if CURRENT_PDF is None:
|
||||||
return render_template("all_done.html")
|
if FORCE and ALL_PDFS:
|
||||||
|
CURRENT_PDF = ALL_PDFS[0]
|
||||||
|
else:
|
||||||
|
return render_template("all_done.html")
|
||||||
|
|
||||||
# Get the tests for the current PDF
|
# Get the tests for the current PDF
|
||||||
current_tests = PDF_TESTS.get(CURRENT_PDF, [])
|
current_tests = PDF_TESTS.get(CURRENT_PDF, [])
|
||||||
@ -117,7 +116,6 @@ def index():
|
|||||||
stats=stats,
|
stats=stats,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/update_test", methods=["POST"])
|
@app.route("/update_test", methods=["POST"])
|
||||||
def update_test():
|
def update_test():
|
||||||
"""API endpoint to update a test."""
|
"""API endpoint to update a test."""
|
||||||
@ -140,7 +138,6 @@ def update_test():
|
|||||||
|
|
||||||
return jsonify({"status": "success"})
|
return jsonify({"status": "success"})
|
||||||
|
|
||||||
|
|
||||||
@app.route("/reject_all", methods=["POST"])
|
@app.route("/reject_all", methods=["POST"])
|
||||||
def reject_all():
|
def reject_all():
|
||||||
"""API endpoint to reject all tests for a PDF."""
|
"""API endpoint to reject all tests for a PDF."""
|
||||||
@ -161,24 +158,29 @@ def reject_all():
|
|||||||
|
|
||||||
return jsonify({"status": "error", "message": "PDF not found"})
|
return jsonify({"status": "error", "message": "PDF not found"})
|
||||||
|
|
||||||
|
|
||||||
@app.route("/next_pdf", methods=["POST"])
|
@app.route("/next_pdf", methods=["POST"])
|
||||||
def next_pdf():
|
def next_pdf():
|
||||||
"""Move to the next PDF in the list."""
|
"""Move to the next PDF in the list."""
|
||||||
global CURRENT_PDF, ALL_PDFS
|
global CURRENT_PDF, ALL_PDFS, FORCE
|
||||||
|
|
||||||
if CURRENT_PDF in ALL_PDFS:
|
if CURRENT_PDF in ALL_PDFS:
|
||||||
current_index = ALL_PDFS.index(CURRENT_PDF)
|
current_index = ALL_PDFS.index(CURRENT_PDF)
|
||||||
if current_index < len(ALL_PDFS) - 1:
|
if current_index < len(ALL_PDFS) - 1:
|
||||||
CURRENT_PDF = ALL_PDFS[current_index + 1]
|
CURRENT_PDF = ALL_PDFS[current_index + 1]
|
||||||
else:
|
else:
|
||||||
CURRENT_PDF = find_next_unchecked_pdf()
|
# If in force mode, cycle back to the beginning instead of checking for an unchecked PDF
|
||||||
|
if FORCE and ALL_PDFS:
|
||||||
|
CURRENT_PDF = ALL_PDFS[0]
|
||||||
|
else:
|
||||||
|
CURRENT_PDF = find_next_unchecked_pdf()
|
||||||
else:
|
else:
|
||||||
CURRENT_PDF = find_next_unchecked_pdf()
|
if FORCE and ALL_PDFS:
|
||||||
|
CURRENT_PDF = ALL_PDFS[0]
|
||||||
|
else:
|
||||||
|
CURRENT_PDF = find_next_unchecked_pdf()
|
||||||
|
|
||||||
return redirect(url_for("index"))
|
return redirect(url_for("index"))
|
||||||
|
|
||||||
|
|
||||||
@app.route("/prev_pdf", methods=["POST"])
|
@app.route("/prev_pdf", methods=["POST"])
|
||||||
def prev_pdf():
|
def prev_pdf():
|
||||||
"""Move to the previous PDF in the list."""
|
"""Move to the previous PDF in the list."""
|
||||||
@ -191,7 +193,6 @@ def prev_pdf():
|
|||||||
|
|
||||||
return redirect(url_for("index"))
|
return redirect(url_for("index"))
|
||||||
|
|
||||||
|
|
||||||
@app.route("/goto_pdf/<int:index>", methods=["POST"])
|
@app.route("/goto_pdf/<int:index>", methods=["POST"])
|
||||||
def goto_pdf(index):
|
def goto_pdf(index):
|
||||||
"""Go to a specific PDF by index."""
|
"""Go to a specific PDF by index."""
|
||||||
@ -202,7 +203,6 @@ def goto_pdf(index):
|
|||||||
|
|
||||||
return redirect(url_for("index"))
|
return redirect(url_for("index"))
|
||||||
|
|
||||||
|
|
||||||
def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||||
"""Load tests from the dataset file and organize them by PDF."""
|
"""Load tests from the dataset file and organize them by PDF."""
|
||||||
if not os.path.exists(dataset_file):
|
if not os.path.exists(dataset_file):
|
||||||
@ -228,24 +228,24 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
|||||||
|
|
||||||
return pdf_tests, all_pdfs
|
return pdf_tests, all_pdfs
|
||||||
|
|
||||||
|
|
||||||
def create_templates_directory():
|
def create_templates_directory():
|
||||||
"""Create templates directory for Flask if it doesn't exist."""
|
"""Create templates directory for Flask if it doesn't exist."""
|
||||||
templates_dir = os.path.join(os.path.dirname(__file__), "templates")
|
templates_dir = os.path.join(os.path.dirname(__file__), "templates")
|
||||||
os.makedirs(templates_dir, exist_ok=True)
|
os.makedirs(templates_dir, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Main entry point with command-line arguments."""
|
"""Main entry point with command-line arguments."""
|
||||||
global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF
|
global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Interactive Test Review App")
|
parser = argparse.ArgumentParser(description="Interactive Test Review App")
|
||||||
parser.add_argument("dataset_file", help="Path to the dataset jsonl file")
|
parser.add_argument("dataset_file", help="Path to the dataset jsonl file")
|
||||||
parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
|
parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
|
||||||
parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
|
parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
|
||||||
parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
|
parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
|
||||||
|
parser.add_argument("--force", action="store_true", help="Force show each file one by one and never do the 'All done' page")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
FORCE = args.force # Set the global FORCE flag
|
||||||
|
|
||||||
# Validate dataset directory
|
# Validate dataset directory
|
||||||
if not os.path.exists(args.dataset_file):
|
if not os.path.exists(args.dataset_file):
|
||||||
@ -280,6 +280,5 @@ def main():
|
|||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
@ -600,6 +600,24 @@ Some text before the table.
|
|||||||
result, explanation = test.run(table)
|
result, explanation = test.run(table)
|
||||||
self.assertTrue(result, explanation)
|
self.assertTrue(result, explanation)
|
||||||
|
|
||||||
|
def test_diffs(self):
|
||||||
|
table = """| CATEGORY | POINTS EARNED |
|
||||||
|
|------------------------------|------------------|
|
||||||
|
| Sustainable Sites | 9 |
|
||||||
|
| Water Efficiency | 3 |
|
||||||
|
| Energy & Atmosphere | 12 |
|
||||||
|
| Materials & Resources | 6 |
|
||||||
|
| Indoor Environmental Quality | 11 |
|
||||||
|
| Innovation & Design Process | 5 |
|
||||||
|
| TOTAL | 46 |"""
|
||||||
|
test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="9", left="Sustl Sie", max_diffs=2)
|
||||||
|
result, explanation = test.run(table)
|
||||||
|
self.assertFalse(result, explanation)
|
||||||
|
|
||||||
|
test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="9", left="Sustainable Site", max_diffs=2)
|
||||||
|
result, explanation = test.run(table)
|
||||||
|
self.assertTrue(result, explanation)
|
||||||
|
|
||||||
def test_markdown_marker2(self):
|
def test_markdown_marker2(self):
|
||||||
table = """| Concentration
|
table = """| Concentration
|
||||||
level | [CO] | [SO2] | [NOx] |
|
level | [CO] | [SO2] | [NOx] |
|
||||||
|
@ -658,6 +658,7 @@ class TableTest(BasePDFTest):
|
|||||||
|
|
||||||
# Threshold for fuzzy matching derived from max_diffs
|
# Threshold for fuzzy matching derived from max_diffs
|
||||||
threshold = 1.0 - (self.max_diffs / (len(self.cell) if len(self.cell) > 0 else 1))
|
threshold = 1.0 - (self.max_diffs / (len(self.cell) if len(self.cell) > 0 else 1))
|
||||||
|
threshold = max(0.5, threshold)
|
||||||
|
|
||||||
# Parse tables based on content_type
|
# Parse tables based on content_type
|
||||||
md_tables = parse_markdown_tables(content)
|
md_tables = parse_markdown_tables(content)
|
||||||
@ -700,7 +701,7 @@ class TableTest(BasePDFTest):
|
|||||||
if self.up and row_idx > 0:
|
if self.up and row_idx > 0:
|
||||||
up_cell = normalize_text(table_array[row_idx - 1, col_idx])
|
up_cell = normalize_text(table_array[row_idx - 1, col_idx])
|
||||||
up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
|
up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
|
||||||
if up_similarity < threshold:
|
if up_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.up) if len(self.up) > 0 else 1))):
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(f"Cell above '{up_cell}' doesn't match expected '{self.up}' (similarity: {up_similarity:.2f})")
|
current_failed_reasons.append(f"Cell above '{up_cell}' doesn't match expected '{self.up}' (similarity: {up_similarity:.2f})")
|
||||||
|
|
||||||
@ -708,7 +709,7 @@ class TableTest(BasePDFTest):
|
|||||||
if self.down and row_idx < table_array.shape[0] - 1:
|
if self.down and row_idx < table_array.shape[0] - 1:
|
||||||
down_cell = normalize_text(table_array[row_idx + 1, col_idx])
|
down_cell = normalize_text(table_array[row_idx + 1, col_idx])
|
||||||
down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
|
down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
|
||||||
if down_similarity < threshold:
|
if down_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.down) if len(self.down) > 0 else 1))):
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(f"Cell below '{down_cell}' doesn't match expected '{self.down}' (similarity: {down_similarity:.2f})")
|
current_failed_reasons.append(f"Cell below '{down_cell}' doesn't match expected '{self.down}' (similarity: {down_similarity:.2f})")
|
||||||
|
|
||||||
@ -716,7 +717,7 @@ class TableTest(BasePDFTest):
|
|||||||
if self.left and col_idx > 0:
|
if self.left and col_idx > 0:
|
||||||
left_cell = normalize_text(table_array[row_idx, col_idx - 1])
|
left_cell = normalize_text(table_array[row_idx, col_idx - 1])
|
||||||
left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
|
left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
|
||||||
if left_similarity < threshold:
|
if left_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.left) if len(self.left) > 0 else 1))):
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(
|
current_failed_reasons.append(
|
||||||
f"Cell to the left '{left_cell}' doesn't match expected '{self.left}' (similarity: {left_similarity:.2f})"
|
f"Cell to the left '{left_cell}' doesn't match expected '{self.left}' (similarity: {left_similarity:.2f})"
|
||||||
@ -726,7 +727,7 @@ class TableTest(BasePDFTest):
|
|||||||
if self.right and col_idx < table_array.shape[1] - 1:
|
if self.right and col_idx < table_array.shape[1] - 1:
|
||||||
right_cell = normalize_text(table_array[row_idx, col_idx + 1])
|
right_cell = normalize_text(table_array[row_idx, col_idx + 1])
|
||||||
right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
|
right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
|
||||||
if right_similarity < threshold:
|
if right_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.right) if len(self.right) > 0 else 1))):
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(
|
current_failed_reasons.append(
|
||||||
f"Cell to the right '{right_cell}' doesn't match expected '{self.right}' (similarity: {right_similarity:.2f})"
|
f"Cell to the right '{right_cell}' doesn't match expected '{self.right}' (similarity: {right_similarity:.2f})"
|
||||||
@ -747,7 +748,7 @@ class TableTest(BasePDFTest):
|
|||||||
if similarity > best_similarity:
|
if similarity > best_similarity:
|
||||||
best_similarity = similarity
|
best_similarity = similarity
|
||||||
best_match = header_text
|
best_match = header_text
|
||||||
if best_similarity >= threshold:
|
if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.top_heading) if len(self.top_heading) > 0 else 1))):
|
||||||
top_heading_found = True
|
top_heading_found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -760,7 +761,7 @@ class TableTest(BasePDFTest):
|
|||||||
if similarity > best_similarity:
|
if similarity > best_similarity:
|
||||||
best_similarity = similarity
|
best_similarity = similarity
|
||||||
best_match = header_text
|
best_match = header_text
|
||||||
if best_similarity >= threshold:
|
if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.top_heading) if len(self.top_heading) > 0 else 1))):
|
||||||
top_heading_found = True
|
top_heading_found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -777,7 +778,7 @@ class TableTest(BasePDFTest):
|
|||||||
if not best_match:
|
if not best_match:
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(f"No top heading found for cell at ({row_idx}, {col_idx})")
|
current_failed_reasons.append(f"No top heading found for cell at ({row_idx}, {col_idx})")
|
||||||
elif best_similarity < threshold:
|
elif best_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.top_heading) if len(self.top_heading) > 0 else 1))):
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(
|
current_failed_reasons.append(
|
||||||
f"Top heading '{best_match}' doesn't match expected '{self.top_heading}' (similarity: {best_similarity:.2f})"
|
f"Top heading '{best_match}' doesn't match expected '{self.top_heading}' (similarity: {best_similarity:.2f})"
|
||||||
@ -798,7 +799,7 @@ class TableTest(BasePDFTest):
|
|||||||
if similarity > best_similarity:
|
if similarity > best_similarity:
|
||||||
best_similarity = similarity
|
best_similarity = similarity
|
||||||
best_match = header_text
|
best_match = header_text
|
||||||
if best_similarity >= threshold:
|
if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.left_heading) if len(self.left_heading) > 0 else 1))):
|
||||||
left_heading_found = True
|
left_heading_found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -811,7 +812,7 @@ class TableTest(BasePDFTest):
|
|||||||
if similarity > best_similarity:
|
if similarity > best_similarity:
|
||||||
best_similarity = similarity
|
best_similarity = similarity
|
||||||
best_match = header_text
|
best_match = header_text
|
||||||
if best_similarity >= threshold:
|
if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.left_heading) if len(self.left_heading) > 0 else 1))):
|
||||||
left_heading_found = True
|
left_heading_found = True
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -828,7 +829,7 @@ class TableTest(BasePDFTest):
|
|||||||
if not best_match:
|
if not best_match:
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(f"No left heading found for cell at ({row_idx}, {col_idx})")
|
current_failed_reasons.append(f"No left heading found for cell at ({row_idx}, {col_idx})")
|
||||||
elif best_similarity < threshold:
|
elif best_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.left_heading) if len(self.left_heading) > 0 else 1))):
|
||||||
all_relationships_satisfied = False
|
all_relationships_satisfied = False
|
||||||
current_failed_reasons.append(
|
current_failed_reasons.append(
|
||||||
f"Left heading '{best_match}' doesn't match expected '{self.left_heading}' (similarity: {best_similarity:.2f})"
|
f"Left heading '{best_match}' doesn't match expected '{self.left_heading}' (similarity: {best_similarity:.2f})"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user