Clean up review app

This commit is contained in:
Jake Poznanski 2025-03-20 16:36:10 +00:00
parent 063d4f556a
commit f79bd0d248
3 changed files with 24 additions and 28 deletions

View File

@ -40,15 +40,16 @@ if __name__ == "__main__":
assert j["url"] assert j["url"]
hash = parse_pdf_hash(j["url"]) hash = parse_pdf_hash(j["url"])
print(j["url"], hash) if hash:
assert hash url = get_uri_from_db(args.db, hash)
url = get_uri_from_db(args.db, hash)
if url: if url:
j["url"] = url j["url"] = url
data.append(j) data.append(j)
else:
skipped += 1
else: else:
skipped += 1 data.append(j)
print(data) print(data)

View File

@ -14,6 +14,7 @@ app = Flask(__name__)
# Global state # Global state
DATASET_DIR = "" DATASET_DIR = ""
DATASET_FILE = None
CURRENT_PDF = None CURRENT_PDF = None
PDF_TESTS = {} PDF_TESTS = {}
ALL_PDFS = [] ALL_PDFS = []
@ -120,7 +121,7 @@ def index():
@app.route("/update_test", methods=["POST"]) @app.route("/update_test", methods=["POST"])
def update_test(): def update_test():
"""API endpoint to update a test.""" """API endpoint to update a test."""
global PDF_TESTS, DATASET_DIR global PDF_TESTS, DATASET_DIR, DATASET_FILE
data = request.json data = request.json
pdf_name = data.get("pdf") pdf_name = data.get("pdf")
@ -135,8 +136,7 @@ def update_test():
break break
# Save the updated tests # Save the updated tests
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl") save_dataset(DATASET_FILE)
save_dataset(dataset_file)
return jsonify({"status": "success"}) return jsonify({"status": "success"})
@ -144,7 +144,7 @@ def update_test():
@app.route("/reject_all", methods=["POST"]) @app.route("/reject_all", methods=["POST"])
def reject_all(): def reject_all():
"""API endpoint to reject all tests for a PDF.""" """API endpoint to reject all tests for a PDF."""
global PDF_TESTS, DATASET_DIR global PDF_TESTS, DATASET_DIR, DATASET_FILE
data = request.json data = request.json
pdf_name = data.get("pdf") pdf_name = data.get("pdf")
@ -155,8 +155,7 @@ def reject_all():
test["checked"] = "rejected" test["checked"] = "rejected"
# Save the updated tests # Save the updated tests
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl") save_dataset(DATASET_FILE)
save_dataset(dataset_file)
return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])}) return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])})
@ -204,10 +203,8 @@ def goto_pdf(index):
return redirect(url_for("index")) return redirect(url_for("index"))
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]: def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
"""Load tests from the dataset file and organize them by PDF.""" """Load tests from the dataset file and organize them by PDF."""
dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")
if not os.path.exists(dataset_file): if not os.path.exists(dataset_file):
raise FileNotFoundError(f"Dataset file not found: {dataset_file}") raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
@ -240,10 +237,10 @@ def create_templates_directory():
def main(): def main():
"""Main entry point with command-line arguments.""" """Main entry point with command-line arguments."""
global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF
parser = argparse.ArgumentParser(description="Interactive Test Review App") parser = argparse.ArgumentParser(description="Interactive Test Review App")
parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder") parser.add_argument("dataset_file", help="Path to the dataset jsonl file")
parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app") parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app") parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode") parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
@ -251,21 +248,22 @@ def main():
args = parser.parse_args() args = parser.parse_args()
# Validate dataset directory # Validate dataset directory
if not os.path.isdir(args.dataset_dir): if not os.path.exists(args.dataset_file):
print(f"Error: Dataset directory not found: {args.dataset_dir}") print(f"Error: Dataset not found: {args.dataset_file}")
return 1 return 1
pdf_dir = os.path.join(args.dataset_dir, "pdfs") # Store dataset directory globally
DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
DATASET_FILE = args.dataset_file
pdf_dir = os.path.join(DATASET_DIR, "pdfs")
if not os.path.isdir(pdf_dir): if not os.path.isdir(pdf_dir):
print(f"Error: PDF directory not found: {pdf_dir}") print(f"Error: PDF directory not found: {pdf_dir}")
return 1 return 1
# Store dataset directory globally
DATASET_DIR = args.dataset_dir
# Load dataset # Load dataset
try: try:
PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir) PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_file)
except Exception as e: except Exception as e:
print(f"Error loading dataset: {str(e)}") print(f"Error loading dataset: {str(e)}")
return 1 return 1

View File

@ -1,6 +1,5 @@
import unittest import unittest
from olmocr.bench.tests import ( from olmocr.bench.tests import (
BaselineTest, BaselineTest,
BasePDFTest, BasePDFTest,
@ -877,8 +876,6 @@ consignatiediensten | 19816 | 1,0 | 6,0 | 2,8 | 1,2 |
result, explanation = test.run(table) result, explanation = test.run(table)
self.assertTrue(result, explanation) self.assertTrue(result, explanation)
def test_multiple_markdown_tables(self): def test_multiple_markdown_tables(self):
"""Test that we can find and verify cells in multiple markdown tables in one document""" """Test that we can find and verify cells in multiple markdown tables in one document"""
content = """ content = """