From d620722a0e2943f89f1b3513a5be698ab530f13f Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Tue, 18 Mar 2025 18:57:50 +0000
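Subject: [PATCH] Review app is much nicer now

Formatting cleanup across the table miners and review_app.py (double
quotes, trailing commas, collapsed literals, no trailing whitespace,
newline at end of file). mine_tables_gpt.py drops its base64 and json
imports, and review_app.py drops the flask, secure_filename,
render_pdf_to_base64png and tests imports.

main() in review_app.py now lists CURRENT_PDF in its global statement, so
the assignment from find_next_unchecked_pdf() updates the module-level
variable instead of a local one.

In review.html, isEditMode is reset to false after the textarea is
replaced by its span, and pressing Enter calls the save function
directly instead of going through blur().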

---
 olmocr/bench/miners/mine_tables_gemini.py |   4 +-
 olmocr/bench/miners/mine_tables_gpt.py    |  37 ++---
 olmocr/bench/review_app.py                | 174 ++++++++++------------
 olmocr/bench/templates/review.html        |  10 +-
 4 files changed, 101 insertions(+), 124 deletions(-)

diff --git a/olmocr/bench/miners/mine_tables_gemini.py b/olmocr/bench/miners/mine_tables_gemini.py
index 9f229d8..5df22c3 100644
--- a/olmocr/bench/miners/mine_tables_gemini.py
+++ b/olmocr/bench/miners/mine_tables_gemini.py
@@ -138,7 +138,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
             parts=[
                 image_part,
                 types.Part.from_text(
-                    text=(                           
+                    text=(
                         "Analyze the document attached and output it in markdown format. "
                         "Output equations as Latex escaped with $$. "
                         "Output tables in valid HTML format that preserves the structure and content exactly. "
@@ -415,4 +415,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/olmocr/bench/miners/mine_tables_gpt.py b/olmocr/bench/miners/mine_tables_gpt.py
index 522f7fc..53010bb 100644
--- a/olmocr/bench/miners/mine_tables_gpt.py
+++ b/olmocr/bench/miners/mine_tables_gpt.py
@@ -15,8 +15,6 @@ Usage:
 """
 
 import argparse
-import base64
-import json
 import os
 import random
 from typing import Dict, List, Optional, Tuple
@@ -136,13 +134,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{image_base64}",
-                                "detail": "high"
-                            }
-                        },
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}", "detail": "high"}},
                         {
                             "type": "text",
                             "text": (
@@ -150,9 +142,9 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
                                 "Output equations as Latex escaped with $$. "
                                 "Output tables in valid HTML format that preserves the structure and content exactly. "
                                 "Output figures with just a simple markdown image placeholder."
-                            )
-                        }
-                    ]
+                            ),
+                        },
+                    ],
                 }
             ],
             temperature=0.2,
@@ -278,26 +270,17 @@ def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str,
                         {
                             "role": "user",
                             "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {
-                                        "url": f"data:image/png;base64,{pdf_image}",
-                                        "detail": "high"
-                                    }
-                                },
-                                {
-                                    "type": "text",
-                                    "text": prompt
-                                }
-                            ]
+                                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{pdf_image}", "detail": "high"}},
+                                {"type": "text", "text": prompt},
+                            ],
                         }
                     ],
                     temperature=0.2,
                 )
-                
+
                 if not response.choices or len(response.choices) == 0:
                     continue
-                    
+
                 answer_text = response.choices[0].message.content.strip()
                 if answer_text and "null" not in answer_text:
                     test_data = {"cell": cell_value, relationship: answer_text}
@@ -432,4 +415,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/olmocr/bench/review_app.py b/olmocr/bench/review_app.py
index cf92929..9ed1a59 100644
--- a/olmocr/bench/review_app.py
+++ b/olmocr/bench/review_app.py
@@ -8,12 +8,9 @@ import tempfile
 from collections import defaultdict
 from typing import Dict, List, Optional, Tuple
 
-import flask
-from flask import Flask, render_template, request, jsonify, redirect, url_for, send_file
-from werkzeug.utils import secure_filename
+from flask import Flask, jsonify, redirect, render_template, request, send_file, url_for
+
 
-from olmocr.data.renderpdf import render_pdf_to_base64png
-from . import tests
 
 app = Flask(__name__)
 
@@ -27,7 +24,7 @@ ALL_PDFS = []
 def find_next_unchecked_pdf() -> Optional[str]:
     """Find the next PDF with at least one unchecked test."""
     global PDF_TESTS, ALL_PDFS
-    
+
     for pdf_name in ALL_PDFS:
         pdf_tests = PDF_TESTS[pdf_name]
         for test in pdf_tests:
@@ -39,149 +36,140 @@ def find_next_unchecked_pdf() -> Optional[str]:
 def calculate_stats() -> dict:
     """Calculate statistics for all tests in the dataset."""
     global PDF_TESTS
-    
+
     total_tests = 0
     null_status = 0
     verified_status = 0
     rejected_status = 0
-    
+
     for pdf_tests in PDF_TESTS.values():
         total_tests += len(pdf_tests)
-        
+
         for test in pdf_tests:
-            status = test.get('checked')
+            status = test.get("checked")
             if status is None:
                 null_status += 1
-            elif status == 'verified':
+            elif status == "verified":
                 verified_status += 1
-            elif status == 'rejected':
+            elif status == "rejected":
                 rejected_status += 1
-    
+
     completion = 0
     if total_tests > 0:
         completion = (verified_status + rejected_status) / total_tests * 100
-    
-    return {
-        'total': total_tests,
-        'null': null_status,
-        'verified': verified_status,
-        'rejected': rejected_status,
-        'completion': completion
-    }
+
+    return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
 
 
 def save_dataset(jsonl_file: str) -> None:
     """Save the tests to a JSONL file, using temp file for atomic write."""
     global PDF_TESTS
-    
+
     # Flatten all tests
     all_tests = []
     for pdf_tests in PDF_TESTS.values():
         all_tests.extend(pdf_tests)
-    
+
     # Create temp file and write updated content
-    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
+    with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
         for test in all_tests:
             temp_file.write(json.dumps(test) + "\n")
-    
+
     # Atomic replace
     shutil.move(temp_file.name, jsonl_file)
 
 
-@app.route('/pdf/<path:pdf_name>')
+@app.route("/pdf/<path:pdf_name>")
 def serve_pdf(pdf_name):
     """Serve the PDF file directly."""
     pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
-    return send_file(pdf_path, mimetype='application/pdf')
+    return send_file(pdf_path, mimetype="application/pdf")
 
 
-@app.route('/')
+@app.route("/")
 def index():
     """Main page displaying the current PDF and its tests."""
     global CURRENT_PDF, PDF_TESTS, DATASET_DIR
-    
+
     # If no current PDF is set, find the next one with unchecked tests
     if CURRENT_PDF is None:
         CURRENT_PDF = find_next_unchecked_pdf()
-    
+
     # If still no PDF, all tests have been checked
     if CURRENT_PDF is None:
-        return render_template('all_done.html')
-    
+        return render_template("all_done.html")
+
     # Get the tests for the current PDF
     current_tests = PDF_TESTS.get(CURRENT_PDF, [])
-    
+
     # Create PDF URL for pdf.js to load
-    pdf_url = url_for('serve_pdf', pdf_name=CURRENT_PDF)
-    
+    pdf_url = url_for("serve_pdf", pdf_name=CURRENT_PDF)
+
     # Calculate statistics
     stats = calculate_stats()
-    
+
     return render_template(
-        'review.html', 
+        "review.html",
         pdf_name=CURRENT_PDF,
         tests=current_tests,
         pdf_path=pdf_url,
         pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
         total_pdfs=len(ALL_PDFS),
-        stats=stats
+        stats=stats,
     )
 
 
-@app.route('/update_test', methods=['POST'])
+@app.route("/update_test", methods=["POST"])
 def update_test():
     """API endpoint to update a test."""
     global PDF_TESTS, DATASET_DIR
-    
+
     data = request.json
-    pdf_name = data.get('pdf')
-    test_id = data.get('id')
-    field = data.get('field')
-    value = data.get('value')
-    
+    pdf_name = data.get("pdf")
+    test_id = data.get("id")
+    field = data.get("field")
+    value = data.get("value")
+
     # Find and update the test
     for test in PDF_TESTS.get(pdf_name, []):
-        if test.get('id') == test_id:
+        if test.get("id") == test_id:
             test[field] = value
             break
-    
+
     # Save the updated tests
     dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
     save_dataset(dataset_file)
-    
+
     return jsonify({"status": "success"})
 
 
-@app.route('/reject_all', methods=['POST'])
+@app.route("/reject_all", methods=["POST"])
 def reject_all():
     """API endpoint to reject all tests for a PDF."""
     global PDF_TESTS, DATASET_DIR
-    
+
     data = request.json
-    pdf_name = data.get('pdf')
-    
+    pdf_name = data.get("pdf")
+
     if pdf_name and pdf_name in PDF_TESTS:
         # Update all tests for this PDF to rejected
         for test in PDF_TESTS[pdf_name]:
-            test['checked'] = 'rejected'
-        
+            test["checked"] = "rejected"
+
         # Save the updated tests
         dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
         save_dataset(dataset_file)
-        
-        return jsonify({
-            "status": "success",
-            "count": len(PDF_TESTS[pdf_name])
-        })
-    
+
+        return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])})
+
     return jsonify({"status": "error", "message": "PDF not found"})
 
 
-@app.route('/next_pdf', methods=['POST'])
+@app.route("/next_pdf", methods=["POST"])
 def next_pdf():
     """Move to the next PDF in the list."""
     global CURRENT_PDF, ALL_PDFS
-    
+
     if CURRENT_PDF in ALL_PDFS:
         current_index = ALL_PDFS.index(CURRENT_PDF)
         if current_index < len(ALL_PDFS) - 1:
@@ -190,112 +178,112 @@ def next_pdf():
             CURRENT_PDF = find_next_unchecked_pdf()
     else:
         CURRENT_PDF = find_next_unchecked_pdf()
-    
-    return redirect(url_for('index'))
+
+    return redirect(url_for("index"))
 
 
-@app.route('/prev_pdf', methods=['POST'])
+@app.route("/prev_pdf", methods=["POST"])
 def prev_pdf():
     """Move to the previous PDF in the list."""
     global CURRENT_PDF, ALL_PDFS
-    
+
     if CURRENT_PDF in ALL_PDFS:
         current_index = ALL_PDFS.index(CURRENT_PDF)
         if current_index > 0:
             CURRENT_PDF = ALL_PDFS[current_index - 1]
-    
-    return redirect(url_for('index'))
+
+    return redirect(url_for("index"))
 
 
-@app.route('/goto_pdf/<int:index>', methods=['POST'])
+@app.route("/goto_pdf/<int:index>", methods=["POST"])
 def goto_pdf(index):
     """Go to a specific PDF by index."""
     global CURRENT_PDF, ALL_PDFS
-    
+
     if 0 <= index < len(ALL_PDFS):
         CURRENT_PDF = ALL_PDFS[index]
-    
-    return redirect(url_for('index'))
+
+    return redirect(url_for("index"))
 
 
 def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
     """Load tests from the dataset file and organize them by PDF."""
     dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")
-    
+
     if not os.path.exists(dataset_file):
         raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
-    
+
     pdf_tests = defaultdict(list)
-    
+
     with open(dataset_file, "r") as f:
         for line in f:
             line = line.strip()
             if not line:
                 continue
-                
+
             try:
                 test = json.loads(line)
-                pdf_name = test.get('pdf')
+                pdf_name = test.get("pdf")
                 if pdf_name:
                     pdf_tests[pdf_name].append(test)
             except json.JSONDecodeError:
                 print(f"Warning: Could not parse line as JSON: {line}")
-    
+
     all_pdfs = list(pdf_tests.keys())
-    
+
     return pdf_tests, all_pdfs
 
 
 def create_templates_directory():
     """Create templates directory for Flask if it doesn't exist."""
-    templates_dir = os.path.join(os.path.dirname(__file__), 'templates')
+    templates_dir = os.path.join(os.path.dirname(__file__), "templates")
     os.makedirs(templates_dir, exist_ok=True)
-    
+
 
 def main():
     """Main entry point with command-line arguments."""
-    global DATASET_DIR, PDF_TESTS, ALL_PDFS
-    
+    global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF
+
     parser = argparse.ArgumentParser(description="Interactive Test Review App")
     parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
     parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
     parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
     parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
-    
+
     args = parser.parse_args()
-    
+
     # Validate dataset directory
     if not os.path.isdir(args.dataset_dir):
         print(f"Error: Dataset directory not found: {args.dataset_dir}")
         return 1
-    
+
     pdf_dir = os.path.join(args.dataset_dir, "pdfs")
     if not os.path.isdir(pdf_dir):
         print(f"Error: PDF directory not found: {pdf_dir}")
         return 1
-    
+
     # Store dataset directory globally
     DATASET_DIR = args.dataset_dir
-    
+
     # Load dataset
     try:
         PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir)
     except Exception as e:
         print(f"Error loading dataset: {str(e)}")
         return 1
-    
+
     # Create templates directory
     create_templates_directory()
-    
+
     # Find first PDF with unchecked tests
     CURRENT_PDF = find_next_unchecked_pdf()
-    
+
     # Start Flask app
     print(f"Starting server at http://{args.host}:{args.port}")
     app.run(host=args.host, port=args.port, debug=args.debug)
-    
+
     return 0
 
 
 if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
diff --git a/olmocr/bench/templates/review.html b/olmocr/bench/templates/review.html
index ba048d0..965ed77 100644
--- a/olmocr/bench/templates/review.html
+++ b/olmocr/bench/templates/review.html
@@ -782,13 +782,16 @@
                     if (textarea.parentNode) {
                         textarea.parentNode.replaceChild(span, textarea);
                     }
+                    
+                    // Important: Reset edit mode flag
+                    isEditMode = false;
                 }
                 
                 // Add keydown event to handle Enter key
                 textarea.addEventListener('keydown', function(e) {
                     if (e.key === 'Enter' && !e.shiftKey) {
                         e.preventDefault(); // Prevent default Enter behavior
-                        this.blur(); // Will trigger the blur event
+                        saveAndExitForField(); // Save directly instead of blur
                     }
                 });
                 
@@ -839,13 +842,16 @@
                 if (textarea.parentNode) {
                     textarea.parentNode.replaceChild(span, textarea);
                 }
+                
+                // Important: Reset edit mode flag
+                isEditMode = false;
             }
             
             // Add keydown event to handle Enter key
             textarea.addEventListener('keydown', function(e) {
                 if (e.key === 'Enter' && !e.shiftKey) {
                     e.preventDefault(); // Prevent default Enter behavior
-                    this.blur(); // Will trigger the blur event
+                    saveAndExit(); // Save directly rather than triggering blur
                 }
             });