Review app is much nicer now

This commit is contained in:
Jake Poznanski 2025-03-18 18:57:50 +00:00
parent 5ec96476c9
commit d620722a0e
4 changed files with 101 additions and 124 deletions

View File

@ -138,7 +138,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
parts=[ parts=[
image_part, image_part,
types.Part.from_text( types.Part.from_text(
text=( text=(
"Analyze the document attached and output it in markdown format. " "Analyze the document attached and output it in markdown format. "
"Output equations as Latex escaped with $$. " "Output equations as Latex escaped with $$. "
"Output tables in valid HTML format that preserves the structure and content exactly. " "Output tables in valid HTML format that preserves the structure and content exactly. "
@ -415,4 +415,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -15,8 +15,6 @@ Usage:
""" """
import argparse import argparse
import base64
import json
import os import os
import random import random
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
@ -136,13 +134,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}", "detail": "high"}},
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}",
"detail": "high"
}
},
{ {
"type": "text", "type": "text",
"text": ( "text": (
@ -150,9 +142,9 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
"Output equations as Latex escaped with $$. " "Output equations as Latex escaped with $$. "
"Output tables in valid HTML format that preserves the structure and content exactly. " "Output tables in valid HTML format that preserves the structure and content exactly. "
"Output figures with just a simple markdown image placeholder." "Output figures with just a simple markdown image placeholder."
) ),
} },
] ],
} }
], ],
temperature=0.2, temperature=0.2,
@ -278,26 +270,17 @@ def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str,
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{pdf_image}", "detail": "high"}},
"type": "image_url", {"type": "text", "text": prompt},
"image_url": { ],
"url": f"data:image/png;base64,{pdf_image}",
"detail": "high"
}
},
{
"type": "text",
"text": prompt
}
]
} }
], ],
temperature=0.2, temperature=0.2,
) )
if not response.choices or len(response.choices) == 0: if not response.choices or len(response.choices) == 0:
continue continue
answer_text = response.choices[0].message.content.strip() answer_text = response.choices[0].message.content.strip()
if answer_text and "null" not in answer_text: if answer_text and "null" not in answer_text:
test_data = {"cell": cell_value, relationship: answer_text} test_data = {"cell": cell_value, relationship: answer_text}
@ -432,4 +415,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -8,12 +8,9 @@ import tempfile
from collections import defaultdict from collections import defaultdict
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
import flask from flask import Flask, jsonify, redirect, render_template, request, send_file, url_for
from flask import Flask, render_template, request, jsonify, redirect, url_for, send_file
from werkzeug.utils import secure_filename
from olmocr.data.renderpdf import render_pdf_to_base64png
from . import tests
app = Flask(__name__) app = Flask(__name__)
@ -27,7 +24,7 @@ ALL_PDFS = []
def find_next_unchecked_pdf() -> Optional[str]: def find_next_unchecked_pdf() -> Optional[str]:
"""Find the next PDF with at least one unchecked test.""" """Find the next PDF with at least one unchecked test."""
global PDF_TESTS, ALL_PDFS global PDF_TESTS, ALL_PDFS
for pdf_name in ALL_PDFS: for pdf_name in ALL_PDFS:
pdf_tests = PDF_TESTS[pdf_name] pdf_tests = PDF_TESTS[pdf_name]
for test in pdf_tests: for test in pdf_tests:
@ -39,149 +36,140 @@ def find_next_unchecked_pdf() -> Optional[str]:
def calculate_stats() -> dict: def calculate_stats() -> dict:
"""Calculate statistics for all tests in the dataset.""" """Calculate statistics for all tests in the dataset."""
global PDF_TESTS global PDF_TESTS
total_tests = 0 total_tests = 0
null_status = 0 null_status = 0
verified_status = 0 verified_status = 0
rejected_status = 0 rejected_status = 0
for pdf_tests in PDF_TESTS.values(): for pdf_tests in PDF_TESTS.values():
total_tests += len(pdf_tests) total_tests += len(pdf_tests)
for test in pdf_tests: for test in pdf_tests:
status = test.get('checked') status = test.get("checked")
if status is None: if status is None:
null_status += 1 null_status += 1
elif status == 'verified': elif status == "verified":
verified_status += 1 verified_status += 1
elif status == 'rejected': elif status == "rejected":
rejected_status += 1 rejected_status += 1
completion = 0 completion = 0
if total_tests > 0: if total_tests > 0:
completion = (verified_status + rejected_status) / total_tests * 100 completion = (verified_status + rejected_status) / total_tests * 100
return { return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
'total': total_tests,
'null': null_status,
'verified': verified_status,
'rejected': rejected_status,
'completion': completion
}
def save_dataset(jsonl_file: str) -> None: def save_dataset(jsonl_file: str) -> None:
"""Save the tests to a JSONL file, using temp file for atomic write.""" """Save the tests to a JSONL file, using temp file for atomic write."""
global PDF_TESTS global PDF_TESTS
# Flatten all tests # Flatten all tests
all_tests = [] all_tests = []
for pdf_tests in PDF_TESTS.values(): for pdf_tests in PDF_TESTS.values():
all_tests.extend(pdf_tests) all_tests.extend(pdf_tests)
# Create temp file and write updated content # Create temp file and write updated content
with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file: with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
for test in all_tests: for test in all_tests:
temp_file.write(json.dumps(test) + "\n") temp_file.write(json.dumps(test) + "\n")
# Atomic replace # Atomic replace
shutil.move(temp_file.name, jsonl_file) shutil.move(temp_file.name, jsonl_file)
@app.route('/pdf/<path:pdf_name>') @app.route("/pdf/<path:pdf_name>")
def serve_pdf(pdf_name): def serve_pdf(pdf_name):
"""Serve the PDF file directly.""" """Serve the PDF file directly."""
pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name) pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
return send_file(pdf_path, mimetype='application/pdf') return send_file(pdf_path, mimetype="application/pdf")
@app.route('/') @app.route("/")
def index(): def index():
"""Main page displaying the current PDF and its tests.""" """Main page displaying the current PDF and its tests."""
global CURRENT_PDF, PDF_TESTS, DATASET_DIR global CURRENT_PDF, PDF_TESTS, DATASET_DIR
# If no current PDF is set, find the next one with unchecked tests # If no current PDF is set, find the next one with unchecked tests
if CURRENT_PDF is None: if CURRENT_PDF is None:
CURRENT_PDF = find_next_unchecked_pdf() CURRENT_PDF = find_next_unchecked_pdf()
# If still no PDF, all tests have been checked # If still no PDF, all tests have been checked
if CURRENT_PDF is None: if CURRENT_PDF is None:
return render_template('all_done.html') return render_template("all_done.html")
# Get the tests for the current PDF # Get the tests for the current PDF
current_tests = PDF_TESTS.get(CURRENT_PDF, []) current_tests = PDF_TESTS.get(CURRENT_PDF, [])
# Create PDF URL for pdf.js to load # Create PDF URL for pdf.js to load
pdf_url = url_for('serve_pdf', pdf_name=CURRENT_PDF) pdf_url = url_for("serve_pdf", pdf_name=CURRENT_PDF)
# Calculate statistics # Calculate statistics
stats = calculate_stats() stats = calculate_stats()
return render_template( return render_template(
'review.html', "review.html",
pdf_name=CURRENT_PDF, pdf_name=CURRENT_PDF,
tests=current_tests, tests=current_tests,
pdf_path=pdf_url, pdf_path=pdf_url,
pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0, pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
total_pdfs=len(ALL_PDFS), total_pdfs=len(ALL_PDFS),
stats=stats stats=stats,
) )
@app.route('/update_test', methods=['POST']) @app.route("/update_test", methods=["POST"])
def update_test(): def update_test():
"""API endpoint to update a test.""" """API endpoint to update a test."""
global PDF_TESTS, DATASET_DIR global PDF_TESTS, DATASET_DIR
data = request.json data = request.json
pdf_name = data.get('pdf') pdf_name = data.get("pdf")
test_id = data.get('id') test_id = data.get("id")
field = data.get('field') field = data.get("field")
value = data.get('value') value = data.get("value")
# Find and update the test # Find and update the test
for test in PDF_TESTS.get(pdf_name, []): for test in PDF_TESTS.get(pdf_name, []):
if test.get('id') == test_id: if test.get("id") == test_id:
test[field] = value test[field] = value
break break
# Save the updated tests # Save the updated tests
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl") dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
save_dataset(dataset_file) save_dataset(dataset_file)
return jsonify({"status": "success"}) return jsonify({"status": "success"})
@app.route('/reject_all', methods=['POST']) @app.route("/reject_all", methods=["POST"])
def reject_all(): def reject_all():
"""API endpoint to reject all tests for a PDF.""" """API endpoint to reject all tests for a PDF."""
global PDF_TESTS, DATASET_DIR global PDF_TESTS, DATASET_DIR
data = request.json data = request.json
pdf_name = data.get('pdf') pdf_name = data.get("pdf")
if pdf_name and pdf_name in PDF_TESTS: if pdf_name and pdf_name in PDF_TESTS:
# Update all tests for this PDF to rejected # Update all tests for this PDF to rejected
for test in PDF_TESTS[pdf_name]: for test in PDF_TESTS[pdf_name]:
test['checked'] = 'rejected' test["checked"] = "rejected"
# Save the updated tests # Save the updated tests
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl") dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
save_dataset(dataset_file) save_dataset(dataset_file)
return jsonify({ return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])})
"status": "success",
"count": len(PDF_TESTS[pdf_name])
})
return jsonify({"status": "error", "message": "PDF not found"}) return jsonify({"status": "error", "message": "PDF not found"})
@app.route('/next_pdf', methods=['POST']) @app.route("/next_pdf", methods=["POST"])
def next_pdf(): def next_pdf():
"""Move to the next PDF in the list.""" """Move to the next PDF in the list."""
global CURRENT_PDF, ALL_PDFS global CURRENT_PDF, ALL_PDFS
if CURRENT_PDF in ALL_PDFS: if CURRENT_PDF in ALL_PDFS:
current_index = ALL_PDFS.index(CURRENT_PDF) current_index = ALL_PDFS.index(CURRENT_PDF)
if current_index < len(ALL_PDFS) - 1: if current_index < len(ALL_PDFS) - 1:
@ -190,112 +178,112 @@ def next_pdf():
CURRENT_PDF = find_next_unchecked_pdf() CURRENT_PDF = find_next_unchecked_pdf()
else: else:
CURRENT_PDF = find_next_unchecked_pdf() CURRENT_PDF = find_next_unchecked_pdf()
return redirect(url_for('index')) return redirect(url_for("index"))
@app.route('/prev_pdf', methods=['POST']) @app.route("/prev_pdf", methods=["POST"])
def prev_pdf(): def prev_pdf():
"""Move to the previous PDF in the list.""" """Move to the previous PDF in the list."""
global CURRENT_PDF, ALL_PDFS global CURRENT_PDF, ALL_PDFS
if CURRENT_PDF in ALL_PDFS: if CURRENT_PDF in ALL_PDFS:
current_index = ALL_PDFS.index(CURRENT_PDF) current_index = ALL_PDFS.index(CURRENT_PDF)
if current_index > 0: if current_index > 0:
CURRENT_PDF = ALL_PDFS[current_index - 1] CURRENT_PDF = ALL_PDFS[current_index - 1]
return redirect(url_for('index')) return redirect(url_for("index"))
@app.route('/goto_pdf/<int:index>', methods=['POST']) @app.route("/goto_pdf/<int:index>", methods=["POST"])
def goto_pdf(index): def goto_pdf(index):
"""Go to a specific PDF by index.""" """Go to a specific PDF by index."""
global CURRENT_PDF, ALL_PDFS global CURRENT_PDF, ALL_PDFS
if 0 <= index < len(ALL_PDFS): if 0 <= index < len(ALL_PDFS):
CURRENT_PDF = ALL_PDFS[index] CURRENT_PDF = ALL_PDFS[index]
return redirect(url_for('index')) return redirect(url_for("index"))
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]: def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
"""Load tests from the dataset file and organize them by PDF.""" """Load tests from the dataset file and organize them by PDF."""
dataset_file = os.path.join(dataset_dir, "table_tests.jsonl") dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")
if not os.path.exists(dataset_file): if not os.path.exists(dataset_file):
raise FileNotFoundError(f"Dataset file not found: {dataset_file}") raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
pdf_tests = defaultdict(list) pdf_tests = defaultdict(list)
with open(dataset_file, "r") as f: with open(dataset_file, "r") as f:
for line in f: for line in f:
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
try: try:
test = json.loads(line) test = json.loads(line)
pdf_name = test.get('pdf') pdf_name = test.get("pdf")
if pdf_name: if pdf_name:
pdf_tests[pdf_name].append(test) pdf_tests[pdf_name].append(test)
except json.JSONDecodeError: except json.JSONDecodeError:
print(f"Warning: Could not parse line as JSON: {line}") print(f"Warning: Could not parse line as JSON: {line}")
all_pdfs = list(pdf_tests.keys()) all_pdfs = list(pdf_tests.keys())
return pdf_tests, all_pdfs return pdf_tests, all_pdfs
def create_templates_directory(): def create_templates_directory():
"""Create templates directory for Flask if it doesn't exist.""" """Create templates directory for Flask if it doesn't exist."""
templates_dir = os.path.join(os.path.dirname(__file__), 'templates') templates_dir = os.path.join(os.path.dirname(__file__), "templates")
os.makedirs(templates_dir, exist_ok=True) os.makedirs(templates_dir, exist_ok=True)
def main(): def main():
"""Main entry point with command-line arguments.""" """Main entry point with command-line arguments."""
global DATASET_DIR, PDF_TESTS, ALL_PDFS global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF
parser = argparse.ArgumentParser(description="Interactive Test Review App") parser = argparse.ArgumentParser(description="Interactive Test Review App")
parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder") parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app") parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app") parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode") parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
args = parser.parse_args() args = parser.parse_args()
# Validate dataset directory # Validate dataset directory
if not os.path.isdir(args.dataset_dir): if not os.path.isdir(args.dataset_dir):
print(f"Error: Dataset directory not found: {args.dataset_dir}") print(f"Error: Dataset directory not found: {args.dataset_dir}")
return 1 return 1
pdf_dir = os.path.join(args.dataset_dir, "pdfs") pdf_dir = os.path.join(args.dataset_dir, "pdfs")
if not os.path.isdir(pdf_dir): if not os.path.isdir(pdf_dir):
print(f"Error: PDF directory not found: {pdf_dir}") print(f"Error: PDF directory not found: {pdf_dir}")
return 1 return 1
# Store dataset directory globally # Store dataset directory globally
DATASET_DIR = args.dataset_dir DATASET_DIR = args.dataset_dir
# Load dataset # Load dataset
try: try:
PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir) PDF_TESTS, ALL_PDFS = load_dataset(args.dataset_dir)
except Exception as e: except Exception as e:
print(f"Error loading dataset: {str(e)}") print(f"Error loading dataset: {str(e)}")
return 1 return 1
# Create templates directory # Create templates directory
create_templates_directory() create_templates_directory()
# Find first PDF with unchecked tests # Find first PDF with unchecked tests
CURRENT_PDF = find_next_unchecked_pdf() CURRENT_PDF = find_next_unchecked_pdf()
# Start Flask app # Start Flask app
print(f"Starting server at http://{args.host}:{args.port}") print(f"Starting server at http://{args.host}:{args.port}")
app.run(host=args.host, port=args.port, debug=args.debug) app.run(host=args.host, port=args.port, debug=args.debug)
return 0 return 0
if __name__ == "__main__": if __name__ == "__main__":
sys.exit(main()) sys.exit(main())

View File

@ -782,13 +782,16 @@
if (textarea.parentNode) { if (textarea.parentNode) {
textarea.parentNode.replaceChild(span, textarea); textarea.parentNode.replaceChild(span, textarea);
} }
// Important: Reset edit mode flag
isEditMode = false;
} }
// Add keydown event to handle Enter key // Add keydown event to handle Enter key
textarea.addEventListener('keydown', function(e) { textarea.addEventListener('keydown', function(e) {
if (e.key === 'Enter' && !e.shiftKey) { if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault(); // Prevent default Enter behavior e.preventDefault(); // Prevent default Enter behavior
this.blur(); // Will trigger the blur event saveAndExitForField(); // Save directly instead of blur
} }
}); });
@ -839,13 +842,16 @@
if (textarea.parentNode) { if (textarea.parentNode) {
textarea.parentNode.replaceChild(span, textarea); textarea.parentNode.replaceChild(span, textarea);
} }
// Important: Reset edit mode flag
isEditMode = false;
} }
// Add keydown event to handle Enter key // Add keydown event to handle Enter key
textarea.addEventListener('keydown', function(e) { textarea.addEventListener('keydown', function(e) {
if (e.key === 'Enter' && !e.shiftKey) { if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault(); // Prevent default Enter behavior e.preventDefault(); // Prevent default Enter behavior
this.blur(); // Will trigger the blur event saveAndExit(); // Save directly rather than triggering blur
} }
}); });