From d620722a0e2943f89f1b3513a5be698ab530f13f Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Tue, 18 Mar 2025 18:57:50 +0000 Subject: [PATCH] Review app is much nicer now --- olmocr/bench/miners/mine_tables_gemini.py | 4 +- olmocr/bench/miners/mine_tables_gpt.py | 37 ++--- olmocr/bench/review_app.py | 174 ++++++++++------------ olmocr/bench/templates/review.html | 10 +- 4 files changed, 101 insertions(+), 124 deletions(-) diff --git a/olmocr/bench/miners/mine_tables_gemini.py b/olmocr/bench/miners/mine_tables_gemini.py index 9f229d8..5df22c3 100644 --- a/olmocr/bench/miners/mine_tables_gemini.py +++ b/olmocr/bench/miners/mine_tables_gemini.py @@ -138,7 +138,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[ parts=[ image_part, types.Part.from_text( - text=( + text=( "Analyze the document attached and output it in markdown format. " "Output equations as Latex escaped with $$. " "Output tables in valid HTML format that preserves the structure and content exactly. " @@ -415,4 +415,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/olmocr/bench/miners/mine_tables_gpt.py b/olmocr/bench/miners/mine_tables_gpt.py index 522f7fc..53010bb 100644 --- a/olmocr/bench/miners/mine_tables_gpt.py +++ b/olmocr/bench/miners/mine_tables_gpt.py @@ -15,8 +15,6 @@ Usage: """ import argparse -import base64 -import json import os import random from typing import Dict, List, Optional, Tuple @@ -136,13 +134,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[ { "role": "user", "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_base64}", - "detail": "high" - } - }, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}", "detail": "high"}}, { "type": "text", "text": ( @@ -150,9 +142,9 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[ "Output equations as Latex escaped with $$. " "Output tables in valid HTML format that preserves the structure and content exactly. " "Output figures with just a simple markdown image placeholder." - ) - } - ] + ), + }, + ], } ], temperature=0.2, @@ -278,26 +270,17 @@ def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str, { "role": "user", "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{pdf_image}", - "detail": "high" - } - }, - { - "type": "text", - "text": prompt - } - ] + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{pdf_image}", "detail": "high"}}, + {"type": "text", "text": prompt}, + ], } ], temperature=0.2, ) - + if not response.choices or len(response.choices) == 0: continue - + answer_text = response.choices[0].message.content.strip() if answer_text and "null" not in answer_text: test_data = {"cell": cell_value, relationship: answer_text} @@ -432,4 +415,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/olmocr/bench/review_app.py b/olmocr/bench/review_app.py index cf92929..9ed1a59 100644 --- a/olmocr/bench/review_app.py +++ b/olmocr/bench/review_app.py @@ -8,12 +8,9 @@ import tempfile from collections import defaultdict from typing import Dict, List, Optional, Tuple -import flask -from flask import Flask, render_template, request, jsonify, redirect, url_for, send_file -from werkzeug.utils import secure_filename +from flask import Flask, jsonify, redirect, render_template, request, send_file, url_for + -from olmocr.data.renderpdf import render_pdf_to_base64png -from . import tests app = Flask(__name__) @@ -27,7 +24,7 @@ ALL_PDFS = [] def find_next_unchecked_pdf() -> Optional[str]: """Find the next PDF with at least one unchecked test.""" global PDF_TESTS, ALL_PDFS - + for pdf_name in ALL_PDFS: pdf_tests = PDF_TESTS[pdf_name] for test in pdf_tests: @@ -39,149 +36,140 @@ def find_next_unchecked_pdf() -> Optional[str]: def calculate_stats() -> dict: """Calculate statistics for all tests in the dataset.""" global PDF_TESTS - + total_tests = 0 null_status = 0 verified_status = 0 rejected_status = 0 - + for pdf_tests in PDF_TESTS.values(): total_tests += len(pdf_tests) - + for test in pdf_tests: - status = test.get('checked') + status = test.get("checked") if status is None: null_status += 1 - elif status == 'verified': + elif status == "verified": verified_status += 1 - elif status == 'rejected': + elif status == "rejected": rejected_status += 1 - + completion = 0 if total_tests > 0: completion = (verified_status + rejected_status) / total_tests * 100 - - return { - 'total': total_tests, - 'null': null_status, - 'verified': verified_status, - 'rejected': rejected_status, - 'completion': completion - } + + return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion} def save_dataset(jsonl_file: str) -> None: """Save the tests to a JSONL file, using temp file for atomic write.""" global PDF_TESTS - + # Flatten all tests all_tests = [] for pdf_tests in PDF_TESTS.values(): all_tests.extend(pdf_tests) - + # Create temp file and write updated content - with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: + with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file: for test in all_tests: temp_file.write(json.dumps(test) + "\n") - + # Atomic replace shutil.move(temp_file.name, jsonl_file) -@app.route('/pdf/') +@app.route("/pdf/") def serve_pdf(pdf_name): """Serve the PDF file directly.""" pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name) - return send_file(pdf_path, mimetype='application/pdf') + return send_file(pdf_path, mimetype="application/pdf") -@app.route('/') +@app.route("/") def index(): """Main page displaying the current PDF and its tests.""" global CURRENT_PDF, PDF_TESTS, DATASET_DIR - + # If no current PDF is set, find the next one with unchecked tests if CURRENT_PDF is None: CURRENT_PDF = find_next_unchecked_pdf() - + # If still no PDF, all tests have been checked if CURRENT_PDF is None: - return render_template('all_done.html') - + return render_template("all_done.html") + # Get the tests for the current PDF current_tests = PDF_TESTS.get(CURRENT_PDF, []) - + # Create PDF URL for pdf.js to load - pdf_url = url_for('serve_pdf', pdf_name=CURRENT_PDF) - + pdf_url = url_for("serve_pdf", pdf_name=CURRENT_PDF) + # Calculate statistics stats = calculate_stats() - + return render_template( - 'review.html', + "review.html", pdf_name=CURRENT_PDF, tests=current_tests, pdf_path=pdf_url, pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0, total_pdfs=len(ALL_PDFS), - stats=stats + stats=stats, ) -@app.route('/update_test', methods=['POST']) +@app.route("/update_test", methods=["POST"]) def update_test(): """API endpoint to update a test.""" global PDF_TESTS, DATASET_DIR - + data = request.json - pdf_name = data.get('pdf') - test_id = data.get('id') - field = data.get('field') - value = data.get('value') - + pdf_name = data.get("pdf") + test_id = data.get("id") + field = data.get("field") + value = data.get("value") + # Find and update the test for test in PDF_TESTS.get(pdf_name, []): - if test.get('id') == test_id: + if test.get("id") == test_id: test[field] = value break - + # Save the updated tests dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl") save_dataset(dataset_file) - + return jsonify({"status": "success"}) -@app.route('/reject_all', methods=['POST']) +@app.route("/reject_all", methods=["POST"]) def reject_all(): """API endpoint to reject all tests for a PDF.""" global PDF_TESTS, DATASET_DIR - + data = request.json - pdf_name = data.get('pdf') - + pdf_name = data.get("pdf") + if pdf_name and pdf_name in PDF_TESTS: # Update all tests for this PDF to rejected for test in PDF_TESTS[pdf_name]: - test['checked'] = 'rejected' - + test["checked"] = "rejected" + # Save the updated tests dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl") save_dataset(dataset_file) - - return jsonify({ - "status": "success", - "count": len(PDF_TESTS[pdf_name]) - }) - + + return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])}) + return jsonify({"status": "error", "message": "PDF not found"}) -@app.route('/next_pdf', methods=['POST']) +@app.route("/next_pdf", methods=["POST"]) def next_pdf(): """Move to the next PDF in the list.""" global CURRENT_PDF, ALL_PDFS - + if CURRENT_PDF in ALL_PDFS: current_index = ALL_PDFS.index(CURRENT_PDF) if current_index < len(ALL_PDFS) - 1: @@ -190,112 +178,112 @@ def next_pdf(): CURRENT_PDF = find_next_unchecked_pdf() else: CURRENT_PDF = find_next_unchecked_pdf() - - return redirect(url_for('index')) + + return redirect(url_for("index")) -@app.route('/prev_pdf', methods=['POST']) +@app.route("/prev_pdf", methods=["POST"]) def prev_pdf(): """Move to the previous PDF in the list.""" global CURRENT_PDF, ALL_PDFS - + if CURRENT_PDF in ALL_PDFS: current_index = ALL_PDFS.index(CURRENT_PDF) if current_index > 0: CURRENT_PDF = ALL_PDFS[current_index - 1] - - return redirect(url_for('index')) + + return redirect(url_for("index")) -@app.route('/goto_pdf/', methods=['POST']) +@app.route("/goto_pdf/", methods=["POST"]) def goto_pdf(index): """Go to a specific PDF by index.""" global CURRENT_PDF, ALL_PDFS - + if 0 <= index < len(ALL_PDFS): CURRENT_PDF = ALL_PDFS[index] - - return redirect(url_for('index')) + + return redirect(url_for("index")) def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]: """Load tests from the dataset file and organize them by PDF.""" dataset_file = os.path.join(dataset_dir, "table_tests.jsonl") - + if not os.path.exists(dataset_file): raise FileNotFoundError(f"Dataset file not found: {dataset_file}") - + pdf_tests = defaultdict(list) - + with open(dataset_file, "r") as f: for line in f: line = line.strip() if not line: continue - + try: test = json.loads(line) - pdf_name = test.get('pdf') + pdf_name = test.get("pdf") if pdf_name: pdf_tests[pdf_name].append(test) except json.JSONDecodeError: print(f"Warning: Could not parse line as JSON: {line}") - + all_pdfs = list(pdf_tests.keys()) - + return pdf_tests, all_pdfs def create_templates_directory(): """Create templates directory for Flask if it doesn't exist.""" - templates_dir = os.path.join(os.path.dirname(__file__), 'templates') + templates_dir = os.path.join(os.path.dirname(__file__), "templates") os.makedirs(templates_dir, exist_ok=True) - + def main(): """Main entry point with command-line arguments.""" - global DATASET_DIR, PDF_TESTS, ALL_PDFS - + global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF + parser = argparse.ArgumentParser(description="Interactive Test Review App") parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder") parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app") parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app") parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode") - + args = parser.parse_args() - + # Validate dataset directory if not os.path.isdir(args.dataset_dir): print(f"Error: Dataset directory not found: {args.dataset_dir}") return 1 - + pdf_dir = os.path.join(args.dataset_dir, "pdfs") if not os.path.isdir(pdf_dir): print(f"Error: PDF directory not found: {pdf_dir}") return 1 - + # Store dataset directory globally DATASET_DIR = args.dataset_dir - + # Load dataset try: PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir) except Exception as e: print(f"Error loading dataset: {str(e)}") return 1 - + # Create templates directory create_templates_directory() - + # Find first PDF with unchecked tests CURRENT_PDF = find_next_unchecked_pdf() - + # Start Flask app print(f"Starting server at http://{args.host}:{args.port}") app.run(host=args.host, port=args.port, debug=args.debug) - + return 0 if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/olmocr/bench/templates/review.html b/olmocr/bench/templates/review.html index ba048d0..965ed77 100644 --- a/olmocr/bench/templates/review.html +++ b/olmocr/bench/templates/review.html @@ -782,13 +782,16 @@ if (textarea.parentNode) { textarea.parentNode.replaceChild(span, textarea); } + + // Important: Reset edit mode flag + isEditMode = false; } // Add keydown event to handle Enter key textarea.addEventListener('keydown', function(e) { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); // Prevent default Enter behavior - this.blur(); // Will trigger the blur event + saveAndExitForField(); // Save directly instead of blur } }); @@ -839,13 +842,16 @@ if (textarea.parentNode) { textarea.parentNode.replaceChild(span, textarea); } + + // Important: Reset edit mode flag + isEditMode = false; } // Add keydown event to handle Enter key textarea.addEventListener('keydown', function(e) { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); // Prevent default Enter behavior - this.blur(); // Will trigger the blur event + saveAndExit(); // Save directly rather than triggering blur } });