Cleanup review app

2025-10-13 09:12:18 +00:00 · 2025-03-20 16:36:10 +00:00 · 2025-03-20 16:36:10 +00:00 · f79bd0d248
commit f79bd0d248
parent 063d4f556a
3 changed files with 24 additions and 28 deletions
--- a/olmocr/bench/miners/cleanup_urls.py
+++ b/olmocr/bench/miners/cleanup_urls.py
@@ -40,15 +40,16 @@ if __name__ == "__main__":
                assert j["url"]
                hash = parse_pdf_hash(j["url"])
-                print(j["url"], hash)
+                if hash:
-                assert hash
+                    url = get_uri_from_db(args.db, hash)
-                url = get_uri_from_db(args.db, hash)
-                if url:
+                    if url:
-                    j["url"] = url
+                        j["url"] = url
-                    data.append(j)
+                        data.append(j)
+                    else:
+                        skipped += 1
                else:
-                    skipped += 1
+                    data.append(j)
    print(data)
--- a/olmocr/bench/review_app.py
+++ b/olmocr/bench/review_app.py
@@ -14,6 +14,7 @@ app = Flask(__name__)
 # Global state
 DATASET_DIR = ""
 DATASET_FILE = None
 CURRENT_PDF = None
 PDF_TESTS = {}
 ALL_PDFS = []
@@ -120,7 +121,7 @@ def index():
@app.route("/update_test", methods=["POST"])
 def update_test():
    """API endpoint to update a test."""
-    global PDF_TESTS, DATASET_DIR
+    global PDF_TESTS, DATASET_DIR, DATASET_FILE
    data = request.json
    pdf_name = data.get("pdf")
@@ -135,8 +136,7 @@ def update_test():
            break
    # Save the updated tests
-    dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
+    save_dataset(DATASET_FILE)
-    save_dataset(dataset_file)
    return jsonify({"status": "success"})
@@ -144,7 +144,7 @@ def update_test():
@app.route("/reject_all", methods=["POST"])
 def reject_all():
    """API endpoint to reject all tests for a PDF."""
-    global PDF_TESTS, DATASET_DIR
+    global PDF_TESTS, DATASET_DIR, DATASET_FILE
    data = request.json
    pdf_name = data.get("pdf")
@@ -155,8 +155,7 @@ def reject_all():
            test["checked"] = "rejected"
        # Save the updated tests
-        dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
+        save_dataset(DATASET_FILE)
-        save_dataset(dataset_file)
        return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])})
@@ -204,10 +203,8 @@ def goto_pdf(index):
    return redirect(url_for("index"))
-def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
+def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
    """Load tests from the dataset file and organize them by PDF."""
-    dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")
    if not os.path.exists(dataset_file):
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
@@ -240,10 +237,10 @@ def create_templates_directory():
 def main():
    """Main entry point with command-line arguments."""
-    global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF
+    global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF
    parser = argparse.ArgumentParser(description="Interactive Test Review App")
-    parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
+    parser.add_argument("dataset_file", help="Path to the dataset jsonl file")
    parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
    parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
    parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
@@ -251,21 +248,22 @@ def main():
    args = parser.parse_args()
    # Validate dataset directory
-    if not os.path.isdir(args.dataset_dir):
+    if not os.path.exists(args.dataset_file):
-        print(f"Error: Dataset directory not found: {args.dataset_dir}")
+        print(f"Error: Dataset not found: {args.dataset_file}")
        return 1
-    pdf_dir = os.path.join(args.dataset_dir, "pdfs")
+    # Store dataset directory globally
+    DATASET_DIR = os.path.dirname(os.path.abspath(args.dataset_file))
+    DATASET_FILE = args.dataset_file
+    pdf_dir = os.path.join(DATASET_DIR, "pdfs")
    if not os.path.isdir(pdf_dir):
        print(f"Error: PDF directory not found: {pdf_dir}")
        return 1
-    # Store dataset directory globally
-    DATASET_DIR = args.dataset_dir
    # Load dataset
    try:
-        PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir)
+        PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_file)
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return 1
--- a/olmocr/bench/test_tests.py
+++ b/olmocr/bench/test_tests.py
@@ -1,6 +1,5 @@
 import unittest
 from olmocr.bench.tests import (
    BaselineTest,
    BasePDFTest,
@@ -877,8 +876,6 @@ consignatiediensten | 19816 | 1,0     | 6,0     | 2,8        | 1,2 |
        result, explanation = test.run(table)
        self.assertTrue(result, explanation)
    def test_multiple_markdown_tables(self):
        """Test that we can find and verify cells in multiple markdown tables in one document"""
        content = """