mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-13 09:12:18 +00:00
Cleanup review app
This commit is contained in:
parent
063d4f556a
commit
f79bd0d248
@ -40,8 +40,7 @@ if __name__ == "__main__":
|
||||
|
||||
assert j["url"]
|
||||
hash = parse_pdf_hash(j["url"])
|
||||
print(j["url"], hash)
|
||||
assert hash
|
||||
if hash:
|
||||
url = get_uri_from_db(args.db, hash)
|
||||
|
||||
if url:
|
||||
@ -49,6 +48,8 @@ if __name__ == "__main__":
|
||||
data.append(j)
|
||||
else:
|
||||
skipped += 1
|
||||
else:
|
||||
data.append(j)
|
||||
|
||||
print(data)
|
||||
|
||||
|
@ -14,6 +14,7 @@ app = Flask(__name__)
|
||||
|
||||
# Global state
|
||||
DATASET_DIR = ""
|
||||
DATASET_FILE = None
|
||||
CURRENT_PDF = None
|
||||
PDF_TESTS = {}
|
||||
ALL_PDFS = []
|
||||
@ -120,7 +121,7 @@ def index():
|
||||
@app.route("/update_test", methods=["POST"])
|
||||
def update_test():
|
||||
"""API endpoint to update a test."""
|
||||
global PDF_TESTS, DATASET_DIR
|
||||
global PDF_TESTS, DATASET_DIR, DATASET_FILE
|
||||
|
||||
data = request.json
|
||||
pdf_name = data.get("pdf")
|
||||
@ -135,8 +136,7 @@ def update_test():
|
||||
break
|
||||
|
||||
# Save the updated tests
|
||||
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
|
||||
save_dataset(dataset_file)
|
||||
save_dataset(DATASET_FILE)
|
||||
|
||||
return jsonify({"status": "success"})
|
||||
|
||||
@ -144,7 +144,7 @@ def update_test():
|
||||
@app.route("/reject_all", methods=["POST"])
|
||||
def reject_all():
|
||||
"""API endpoint to reject all tests for a PDF."""
|
||||
global PDF_TESTS, DATASET_DIR
|
||||
global PDF_TESTS, DATASET_DIR, DATASET_FILE
|
||||
|
||||
data = request.json
|
||||
pdf_name = data.get("pdf")
|
||||
@ -155,8 +155,7 @@ def reject_all():
|
||||
test["checked"] = "rejected"
|
||||
|
||||
# Save the updated tests
|
||||
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
|
||||
save_dataset(dataset_file)
|
||||
save_dataset(DATASET_FILE)
|
||||
|
||||
return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])})
|
||||
|
||||
@ -204,10 +203,8 @@ def goto_pdf(index):
|
||||
return redirect(url_for("index"))
|
||||
|
||||
|
||||
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||
def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||
"""Load tests from the dataset file and organize them by PDF."""
|
||||
dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")
|
||||
|
||||
if not os.path.exists(dataset_file):
|
||||
raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
|
||||
|
||||
@ -240,10 +237,10 @@ def create_templates_directory():
|
||||
|
||||
def main():
|
||||
"""Main entry point with command-line arguments."""
|
||||
global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF
|
||||
global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF
|
||||
|
||||
parser = argparse.ArgumentParser(description="Interactive Test Review App")
|
||||
parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
|
||||
parser.add_argument("dataset_file", help="Path to the dataset jsonl file")
|
||||
parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
|
||||
parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
|
||||
parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
|
||||
@ -251,21 +248,22 @@ def main():
|
||||
args = parser.parse_args()
|
||||
|
||||
# Validate dataset directory
|
||||
if not os.path.isdir(args.dataset_dir):
|
||||
print(f"Error: Dataset directory not found: {args.dataset_dir}")
|
||||
if not os.path.exists(args.dataset_file):
|
||||
print(f"Error: Dataset not found: {args.dataset_file}")
|
||||
return 1
|
||||
|
||||
pdf_dir = os.path.join(args.dataset_dir, "pdfs")
|
||||
# Store dataset directory globally
|
||||
DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
|
||||
DATASET_FILE = args.dataset_file
|
||||
|
||||
pdf_dir = os.path.join(DATASET_DIR, "pdfs")
|
||||
if not os.path.isdir(pdf_dir):
|
||||
print(f"Error: PDF directory not found: {pdf_dir}")
|
||||
return 1
|
||||
|
||||
# Store dataset directory globally
|
||||
DATASET_DIR = args.dataset_dir
|
||||
|
||||
# Load dataset
|
||||
try:
|
||||
PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir)
|
||||
PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_file)
|
||||
except Exception as e:
|
||||
print(f"Error loading dataset: {str(e)}")
|
||||
return 1
|
||||
|
@ -1,6 +1,5 @@
|
||||
import unittest
|
||||
|
||||
|
||||
from olmocr.bench.tests import (
|
||||
BaselineTest,
|
||||
BasePDFTest,
|
||||
@ -877,8 +876,6 @@ consignatiediensten | 19816 | 1,0 | 6,0 | 2,8 | 1,2 |
|
||||
result, explanation = test.run(table)
|
||||
self.assertTrue(result, explanation)
|
||||
|
||||
|
||||
|
||||
def test_multiple_markdown_tables(self):
|
||||
"""Test that we can find and verify cells in multiple markdown tables in one document"""
|
||||
content = """
|
||||
|
Loading…
x
Reference in New Issue
Block a user