Review app is much nicer now

This commit is contained in:
Jake Poznanski 2025-03-18 18:57:50 +00:00
parent 5ec96476c9
commit d620722a0e
4 changed files with 101 additions and 124 deletions

View File

@ -15,8 +15,6 @@ Usage:
""" """
import argparse import argparse
import base64
import json
import os import os
import random import random
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
@ -136,13 +134,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}", "detail": "high"}},
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}",
"detail": "high"
}
},
{ {
"type": "text", "type": "text",
"text": ( "text": (
@ -150,9 +142,9 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
"Output equations as Latex escaped with $$. " "Output equations as Latex escaped with $$. "
"Output tables in valid HTML format that preserves the structure and content exactly. " "Output tables in valid HTML format that preserves the structure and content exactly. "
"Output figures with just a simple markdown image placeholder." "Output figures with just a simple markdown image placeholder."
) ),
} },
] ],
} }
], ],
temperature=0.2, temperature=0.2,
@ -278,18 +270,9 @@ def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str,
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{pdf_image}", "detail": "high"}},
"type": "image_url", {"type": "text", "text": prompt},
"image_url": { ],
"url": f"data:image/png;base64,{pdf_image}",
"detail": "high"
}
},
{
"type": "text",
"text": prompt
}
]
} }
], ],
temperature=0.2, temperature=0.2,

View File

@ -8,12 +8,9 @@ import tempfile
from collections import defaultdict from collections import defaultdict
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
import flask from flask import Flask, jsonify, redirect, render_template, request, send_file, url_for
from flask import Flask, render_template, request, jsonify, redirect, url_for, send_file
from werkzeug.utils import secure_filename
from olmocr.data.renderpdf import render_pdf_to_base64png
from . import tests
app = Flask(__name__) app = Flask(__name__)
@ -49,25 +46,19 @@ def calculate_stats() -> dict:
total_tests += len(pdf_tests) total_tests += len(pdf_tests)
for test in pdf_tests: for test in pdf_tests:
status = test.get('checked') status = test.get("checked")
if status is None: if status is None:
null_status += 1 null_status += 1
elif status == 'verified': elif status == "verified":
verified_status += 1 verified_status += 1
elif status == 'rejected': elif status == "rejected":
rejected_status += 1 rejected_status += 1
completion = 0 completion = 0
if total_tests > 0: if total_tests > 0:
completion = (verified_status + rejected_status) / total_tests * 100 completion = (verified_status + rejected_status) / total_tests * 100
return { return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
'total': total_tests,
'null': null_status,
'verified': verified_status,
'rejected': rejected_status,
'completion': completion
}
def save_dataset(jsonl_file: str) -> None: def save_dataset(jsonl_file: str) -> None:
@ -80,7 +71,7 @@ def save_dataset(jsonl_file: str) -> None:
all_tests.extend(pdf_tests) all_tests.extend(pdf_tests)
# Create temp file and write updated content # Create temp file and write updated content
with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
for test in all_tests: for test in all_tests:
temp_file.write(json.dumps(test) + "\n") temp_file.write(json.dumps(test) + "\n")
@ -88,14 +79,14 @@ def save_dataset(jsonl_file: str) -> None:
shutil.move(temp_file.name, jsonl_file) shutil.move(temp_file.name, jsonl_file)
@app.route('/pdf/<path:pdf_name>') @app.route("/pdf/<path:pdf_name>")
def serve_pdf(pdf_name): def serve_pdf(pdf_name):
"""Serve the PDF file directly.""" """Serve the PDF file directly."""
pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name) pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
return send_file(pdf_path, mimetype='application/pdf') return send_file(pdf_path, mimetype="application/pdf")
@app.route('/') @app.route("/")
def index(): def index():
"""Main page displaying the current PDF and its tests.""" """Main page displaying the current PDF and its tests."""
global CURRENT_PDF, PDF_TESTS, DATASET_DIR global CURRENT_PDF, PDF_TESTS, DATASET_DIR
@ -106,42 +97,42 @@ def index():
# If still no PDF, all tests have been checked # If still no PDF, all tests have been checked
if CURRENT_PDF is None: if CURRENT_PDF is None:
return render_template('all_done.html') return render_template("all_done.html")
# Get the tests for the current PDF # Get the tests for the current PDF
current_tests = PDF_TESTS.get(CURRENT_PDF, []) current_tests = PDF_TESTS.get(CURRENT_PDF, [])
# Create PDF URL for pdf.js to load # Create PDF URL for pdf.js to load
pdf_url = url_for('serve_pdf', pdf_name=CURRENT_PDF) pdf_url = url_for("serve_pdf", pdf_name=CURRENT_PDF)
# Calculate statistics # Calculate statistics
stats = calculate_stats() stats = calculate_stats()
return render_template( return render_template(
'review.html', "review.html",
pdf_name=CURRENT_PDF, pdf_name=CURRENT_PDF,
tests=current_tests, tests=current_tests,
pdf_path=pdf_url, pdf_path=pdf_url,
pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0, pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
total_pdfs=len(ALL_PDFS), total_pdfs=len(ALL_PDFS),
stats=stats stats=stats,
) )
@app.route('/update_test', methods=['POST']) @app.route("/update_test", methods=["POST"])
def update_test(): def update_test():
"""API endpoint to update a test.""" """API endpoint to update a test."""
global PDF_TESTS, DATASET_DIR global PDF_TESTS, DATASET_DIR
data = request.json data = request.json
pdf_name = data.get('pdf') pdf_name = data.get("pdf")
test_id = data.get('id') test_id = data.get("id")
field = data.get('field') field = data.get("field")
value = data.get('value') value = data.get("value")
# Find and update the test # Find and update the test
for test in PDF_TESTS.get(pdf_name, []): for test in PDF_TESTS.get(pdf_name, []):
if test.get('id') == test_id: if test.get("id") == test_id:
test[field] = value test[field] = value
break break
@ -152,32 +143,29 @@ def update_test():
return jsonify({"status": "success"}) return jsonify({"status": "success"})
@app.route('/reject_all', methods=['POST']) @app.route("/reject_all", methods=["POST"])
def reject_all(): def reject_all():
"""API endpoint to reject all tests for a PDF.""" """API endpoint to reject all tests for a PDF."""
global PDF_TESTS, DATASET_DIR global PDF_TESTS, DATASET_DIR
data = request.json data = request.json
pdf_name = data.get('pdf') pdf_name = data.get("pdf")
if pdf_name and pdf_name in PDF_TESTS: if pdf_name and pdf_name in PDF_TESTS:
# Update all tests for this PDF to rejected # Update all tests for this PDF to rejected
for test in PDF_TESTS[pdf_name]: for test in PDF_TESTS[pdf_name]:
test['checked'] = 'rejected' test["checked"] = "rejected"
# Save the updated tests # Save the updated tests
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl") dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
save_dataset(dataset_file) save_dataset(dataset_file)
return jsonify({ return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])})
"status": "success",
"count": len(PDF_TESTS[pdf_name])
})
return jsonify({"status": "error", "message": "PDF not found"}) return jsonify({"status": "error", "message": "PDF not found"})
@app.route('/next_pdf', methods=['POST']) @app.route("/next_pdf", methods=["POST"])
def next_pdf(): def next_pdf():
"""Move to the next PDF in the list.""" """Move to the next PDF in the list."""
global CURRENT_PDF, ALL_PDFS global CURRENT_PDF, ALL_PDFS
@ -191,10 +179,10 @@ def next_pdf():
else: else:
CURRENT_PDF = find_next_unchecked_pdf() CURRENT_PDF = find_next_unchecked_pdf()
return redirect(url_for('index')) return redirect(url_for("index"))
@app.route('/prev_pdf', methods=['POST']) @app.route("/prev_pdf", methods=["POST"])
def prev_pdf(): def prev_pdf():
"""Move to the previous PDF in the list.""" """Move to the previous PDF in the list."""
global CURRENT_PDF, ALL_PDFS global CURRENT_PDF, ALL_PDFS
@ -204,10 +192,10 @@ def prev_pdf():
if current_index > 0: if current_index > 0:
CURRENT_PDF = ALL_PDFS[current_index - 1] CURRENT_PDF = ALL_PDFS[current_index - 1]
return redirect(url_for('index')) return redirect(url_for("index"))
@app.route('/goto_pdf/<int:index>', methods=['POST']) @app.route("/goto_pdf/<int:index>", methods=["POST"])
def goto_pdf(index): def goto_pdf(index):
"""Go to a specific PDF by index.""" """Go to a specific PDF by index."""
global CURRENT_PDF, ALL_PDFS global CURRENT_PDF, ALL_PDFS
@ -215,7 +203,7 @@ def goto_pdf(index):
if 0 <= index < len(ALL_PDFS): if 0 <= index < len(ALL_PDFS):
CURRENT_PDF = ALL_PDFS[index] CURRENT_PDF = ALL_PDFS[index]
return redirect(url_for('index')) return redirect(url_for("index"))
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]: def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
@ -235,7 +223,7 @@ def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
try: try:
test = json.loads(line) test = json.loads(line)
pdf_name = test.get('pdf') pdf_name = test.get("pdf")
if pdf_name: if pdf_name:
pdf_tests[pdf_name].append(test) pdf_tests[pdf_name].append(test)
except json.JSONDecodeError: except json.JSONDecodeError:
@ -248,13 +236,13 @@ def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
def create_templates_directory(): def create_templates_directory():
"""Create templates directory for Flask if it doesn't exist.""" """Create templates directory for Flask if it doesn't exist."""
templates_dir = os.path.join(os.path.dirname(__file__), 'templates') templates_dir = os.path.join(os.path.dirname(__file__), "templates")
os.makedirs(templates_dir, exist_ok=True) os.makedirs(templates_dir, exist_ok=True)
def main(): def main():
"""Main entry point with command-line arguments.""" """Main entry point with command-line arguments."""
global DATASET_DIR, PDF_TESTS, ALL_PDFS global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF
parser = argparse.ArgumentParser(description="Interactive Test Review App") parser = argparse.ArgumentParser(description="Interactive Test Review App")
parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder") parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")

View File

@ -782,13 +782,16 @@
if (textarea.parentNode) { if (textarea.parentNode) {
textarea.parentNode.replaceChild(span, textarea); textarea.parentNode.replaceChild(span, textarea);
} }
// Important: Reset edit mode flag
isEditMode = false;
} }
// Add keydown event to handle Enter key // Add keydown event to handle Enter key
textarea.addEventListener('keydown', function(e) { textarea.addEventListener('keydown', function(e) {
if (e.key === 'Enter' && !e.shiftKey) { if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault(); // Prevent default Enter behavior e.preventDefault(); // Prevent default Enter behavior
this.blur(); // Will trigger the blur event saveAndExitForField(); // Save directly instead of blur
} }
}); });
@ -839,13 +842,16 @@
if (textarea.parentNode) { if (textarea.parentNode) {
textarea.parentNode.replaceChild(span, textarea); textarea.parentNode.replaceChild(span, textarea);
} }
// Important: Reset edit mode flag
isEditMode = false;
} }
// Add keydown event to handle Enter key // Add keydown event to handle Enter key
textarea.addEventListener('keydown', function(e) { textarea.addEventListener('keydown', function(e) {
if (e.key === 'Enter' && !e.shiftKey) { if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault(); // Prevent default Enter behavior e.preventDefault(); // Prevent default Enter behavior
this.blur(); // Will trigger the blur event saveAndExit(); // Save directly rather than triggering blur
} }
}); });