Review app is much nicer now

This commit is contained in:
Jake Poznanski 2025-03-18 18:57:50 +00:00
parent 5ec96476c9
commit d620722a0e
4 changed files with 101 additions and 124 deletions

View File

@ -138,7 +138,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
parts=[
image_part,
types.Part.from_text(
text=(
text=(
"Analyze the document attached and output it in markdown format. "
"Output equations as Latex escaped with $$. "
"Output tables in valid HTML format that preserves the structure and content exactly. "
@ -415,4 +415,4 @@ def main():
if __name__ == "__main__":
main()
main()

View File

@ -15,8 +15,6 @@ Usage:
"""
import argparse
import base64
import json
import os
import random
from typing import Dict, List, Optional, Tuple
@ -136,13 +134,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{image_base64}",
"detail": "high"
}
},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}", "detail": "high"}},
{
"type": "text",
"text": (
@ -150,9 +142,9 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
"Output equations as Latex escaped with $$. "
"Output tables in valid HTML format that preserves the structure and content exactly. "
"Output figures with just a simple markdown image placeholder."
)
}
]
),
},
],
}
],
temperature=0.2,
@ -278,26 +270,17 @@ def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str,
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{pdf_image}",
"detail": "high"
}
},
{
"type": "text",
"text": prompt
}
]
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{pdf_image}", "detail": "high"}},
{"type": "text", "text": prompt},
],
}
],
temperature=0.2,
)
if not response.choices or len(response.choices) == 0:
continue
answer_text = response.choices[0].message.content.strip()
if answer_text and "null" not in answer_text:
test_data = {"cell": cell_value, relationship: answer_text}
@ -432,4 +415,4 @@ def main():
if __name__ == "__main__":
main()
main()

View File

@ -8,12 +8,9 @@ import tempfile
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
import flask
from flask import Flask, render_template, request, jsonify, redirect, url_for, send_file
from werkzeug.utils import secure_filename
from flask import Flask, jsonify, redirect, render_template, request, send_file, url_for
from olmocr.data.renderpdf import render_pdf_to_base64png
from . import tests
app = Flask(__name__)
@ -27,7 +24,7 @@ ALL_PDFS = []
def find_next_unchecked_pdf() -> Optional[str]:
"""Find the next PDF with at least one unchecked test."""
global PDF_TESTS, ALL_PDFS
for pdf_name in ALL_PDFS:
pdf_tests = PDF_TESTS[pdf_name]
for test in pdf_tests:
@ -39,149 +36,140 @@ def find_next_unchecked_pdf() -> Optional[str]:
def calculate_stats() -> dict:
    """Summarise review progress over every test held in ``PDF_TESTS``.

    Returns:
        A dict with the keys ``total`` (number of tests), ``null`` (tests whose
        ``checked`` value is missing or ``None``), ``verified``, ``rejected``
        and ``completion`` (percentage of tests that are verified or rejected,
        ``0`` when there are no tests). Tests with any other ``checked`` value
        count towards ``total`` only.
    """
    global PDF_TESTS

    # Collect every status once, then tally each category from that flat list.
    statuses = [entry.get("checked") for group in PDF_TESTS.values() for entry in group]

    total_tests = len(statuses)
    pending = sum(1 for status in statuses if status is None)
    verified = sum(1 for status in statuses if status == "verified")
    rejected = sum(1 for status in statuses if status == "rejected")

    completion = (verified + rejected) / total_tests * 100 if total_tests > 0 else 0

    return {
        "total": total_tests,
        "null": pending,
        "verified": verified,
        "rejected": rejected,
        "completion": completion,
    }
def save_dataset(jsonl_file: str) -> None:
    """Write every test in ``PDF_TESTS`` to *jsonl_file*, one JSON object per line.

    The content is first written to a temporary file created in the same
    directory as the destination and then swapped in with ``os.replace``.
    Creating the temporary file next to the destination keeps both paths on one
    filesystem, so the swap is a rename instead of a copy and readers never see
    a partially written dataset.

    Args:
        jsonl_file: Path of the JSONL dataset file to create or overwrite.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
    """
    global PDF_TESTS

    target_dir = os.path.dirname(os.path.abspath(jsonl_file))
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", dir=target_dir, suffix=".tmp", delete=False) as temp_file:
            temp_path = temp_file.name
            for pdf_tests in PDF_TESTS.values():
                for test in pdf_tests:
                    temp_file.write(json.dumps(test) + "\n")

        # Rename over the destination; the old content stays intact until this succeeds.
        os.replace(temp_path, jsonl_file)
    finally:
        # After a successful replace the temporary name is gone; if anything
        # failed before that point, remove the leftover file.
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)
@app.route("/pdf/<path:pdf_name>")
def serve_pdf(pdf_name):
    """Serve a PDF from the dataset's ``pdfs`` folder.

    Args:
        pdf_name: Path of the PDF relative to ``<DATASET_DIR>/pdfs``, taken
            from the request URL.

    Returns:
        A Flask response sending the file with the ``application/pdf``
        mimetype. ``send_from_directory`` answers with a 404 when the requested
        path does not exist or would resolve outside the ``pdfs`` folder.
    """
    # Imported here because send_from_directory is not among the names this
    # module pulls in from flask at the top of the file.
    from flask import send_from_directory

    # pdf_name comes straight from the URL, so never join it onto the dataset
    # path by hand: send_from_directory rejects attempts to escape pdf_dir.
    # The directory is made absolute so the lookup uses the same
    # working-directory-relative location that main() validated at startup.
    pdf_dir = os.path.abspath(os.path.join(DATASET_DIR, "pdfs"))
    return send_from_directory(pdf_dir, pdf_name, mimetype="application/pdf")
@app.route("/")
def index():
    """Render the review page for the current PDF.

    When no PDF is selected yet, the next PDF returned by
    ``find_next_unchecked_pdf`` becomes the current one. If that also yields
    nothing, the ``all_done.html`` page is rendered instead.
    """
    global CURRENT_PDF, PDF_TESTS, DATASET_DIR

    if CURRENT_PDF is None:
        CURRENT_PDF = find_next_unchecked_pdf()
        if CURRENT_PDF is None:
            # Nothing left to review.
            return render_template("all_done.html")

    template_context = {
        "pdf_name": CURRENT_PDF,
        "tests": PDF_TESTS.get(CURRENT_PDF, []),
        # URL the page uses to load the PDF from the serve_pdf route.
        "pdf_path": url_for("serve_pdf", pdf_name=CURRENT_PDF),
        "pdf_index": ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
        "total_pdfs": len(ALL_PDFS),
        "stats": calculate_stats(),
    }
    return render_template("review.html", **template_context)
@app.route("/update_test", methods=["POST"])
def update_test():
    """API endpoint that sets one field on one test and saves the dataset.

    Expects a JSON object body with the keys ``pdf`` (PDF name), ``id`` (test
    id), ``field`` (name of the key to set on the test) and ``value`` (the new
    value).

    Returns:
        ``{"status": "success"}`` once the matching test was updated and the
        dataset written. If the body is not a JSON object, ``field`` is missing,
        or no test matches ``pdf``/``id``, an
        ``{"status": "error", "message": ...}`` payload is returned (same shape
        as ``reject_all``) and nothing is written.
    """
    global PDF_TESTS, DATASET_DIR

    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"status": "error", "message": "Expected a JSON object body"})

    pdf_name = data.get("pdf")
    test_id = data.get("id")
    field = data.get("field")
    value = data.get("value")

    if not isinstance(field, str) or not field:
        return jsonify({"status": "error", "message": "Missing field name"})

    # Only string names can be keys of PDF_TESTS; anything else cannot match.
    candidates = PDF_TESTS.get(pdf_name, []) if isinstance(pdf_name, str) else []
    target = next((test for test in candidates if test.get("id") == test_id), None)
    if target is None:
        # Do not report success (or rewrite the file) when nothing was changed.
        return jsonify({"status": "error", "message": "Test not found"})

    target[field] = value

    # Persist the change right away.
    save_dataset(os.path.join(DATASET_DIR, "table_tests.jsonl"))

    return jsonify({"status": "success"})
@app.route("/reject_all", methods=["POST"])
def reject_all():
    """API endpoint that marks every test of one PDF as rejected.

    Reads the PDF name from the ``pdf`` key of the JSON request body. On
    success the dataset is saved and the number of tests for that PDF is
    returned; an unknown or missing PDF name yields an error payload.
    """
    global PDF_TESTS, DATASET_DIR

    payload = request.json
    pdf_name = payload.get("pdf")

    if not pdf_name or pdf_name not in PDF_TESTS:
        return jsonify({"status": "error", "message": "PDF not found"})

    affected = PDF_TESTS[pdf_name]
    for entry in affected:
        entry["checked"] = "rejected"

    # Write the updated tests back to disk.
    save_dataset(os.path.join(DATASET_DIR, "table_tests.jsonl"))

    return jsonify({"status": "success", "count": len(affected)})
@app.route('/next_pdf', methods=['POST'])
@app.route("/next_pdf", methods=["POST"])
def next_pdf():
"""Move to the next PDF in the list."""
global CURRENT_PDF, ALL_PDFS
if CURRENT_PDF in ALL_PDFS:
current_index = ALL_PDFS.index(CURRENT_PDF)
if current_index < len(ALL_PDFS) - 1:
@ -190,112 +178,112 @@ def next_pdf():
CURRENT_PDF = find_next_unchecked_pdf()
else:
CURRENT_PDF = find_next_unchecked_pdf()
return redirect(url_for('index'))
return redirect(url_for("index"))
@app.route("/prev_pdf", methods=["POST"])
def prev_pdf():
    """Step back to the PDF before the current one, then show the main page.

    The selection is left untouched when the current PDF is the first entry of
    ``ALL_PDFS`` or is not in the list at all.
    """
    global CURRENT_PDF, ALL_PDFS

    try:
        position = ALL_PDFS.index(CURRENT_PDF)
    except ValueError:
        # Current PDF is not in the list: treat like "already at the start".
        position = 0

    if position > 0:
        CURRENT_PDF = ALL_PDFS[position - 1]

    return redirect(url_for("index"))
@app.route("/goto_pdf/<int:index>", methods=["POST"])
def goto_pdf(index):
    """Select the PDF at position *index* in ``ALL_PDFS`` and show the main page.

    An index outside the list leaves the current selection unchanged.
    """
    global CURRENT_PDF, ALL_PDFS

    if index in range(len(ALL_PDFS)):
        CURRENT_PDF = ALL_PDFS[index]

    return redirect(url_for("index"))
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
    """Load tests from ``table_tests.jsonl`` and group them by PDF name.

    Blank lines are ignored. Lines that are not valid JSON, or that are valid
    JSON but not an object, are reported with a warning and skipped. Objects
    without a truthy ``pdf`` value are skipped silently.

    Args:
        dataset_dir: Directory containing ``table_tests.jsonl``.

    Returns:
        A tuple ``(pdf_tests, all_pdfs)``: a mapping from PDF name to the list
        of its test dicts, and the PDF names in order of first appearance.

    Raises:
        FileNotFoundError: If ``table_tests.jsonl`` does not exist in *dataset_dir*.
    """
    dataset_file = os.path.join(dataset_dir, "table_tests.jsonl")
    if not os.path.exists(dataset_file):
        raise FileNotFoundError(f"Dataset file not found: {dataset_file}")

    pdf_tests = defaultdict(list)

    with open(dataset_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                test = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {line}")
                continue

            # A line such as `[1, 2]` or `"text"` parses fine but has no .get();
            # skip it instead of crashing the whole load.
            if not isinstance(test, dict):
                print(f"Warning: Skipping JSON line that is not an object: {line}")
                continue

            pdf_name = test.get("pdf")
            if pdf_name:
                pdf_tests[pdf_name].append(test)

    all_pdfs = list(pdf_tests.keys())

    return pdf_tests, all_pdfs
def create_templates_directory():
    """Ensure a ``templates`` folder exists next to this module."""
    module_dir = os.path.dirname(__file__)
    os.makedirs(os.path.join(module_dir, "templates"), exist_ok=True)
def main():
    """Parse the command line, load the dataset and run the Flask review app.

    Returns:
        ``0`` after the server stops, or ``1`` when the dataset directory, its
        ``pdfs`` sub-folder, or the dataset file cannot be used.
    """
    global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF

    cli = argparse.ArgumentParser(description="Interactive Test Review App")
    cli.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
    cli.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
    cli.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
    cli.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
    options = cli.parse_args()

    # Both the dataset directory and its pdfs/ folder must exist up front.
    if not os.path.isdir(options.dataset_dir):
        print(f"Error: Dataset directory not found: {options.dataset_dir}")
        return 1

    pdf_dir = os.path.join(options.dataset_dir, "pdfs")
    if not os.path.isdir(pdf_dir):
        print(f"Error: PDF directory not found: {pdf_dir}")
        return 1

    # The route handlers read the dataset location from this module-level name.
    DATASET_DIR = options.dataset_dir

    try:
        PDF_TESTS, ALL_PDFS = load_dataset(options.dataset_dir)
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        return 1

    create_templates_directory()

    # Start the session on the first PDF that still has unchecked tests.
    CURRENT_PDF = find_next_unchecked_pdf()

    print(f"Starting server at http://{options.host}:{options.port}")
    app.run(host=options.host, port=options.port, debug=options.debug)

    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -782,13 +782,16 @@
if (textarea.parentNode) {
textarea.parentNode.replaceChild(span, textarea);
}
// Important: Reset edit mode flag
isEditMode = false;
}
// Add keydown event to handle Enter key
textarea.addEventListener('keydown', function(e) {
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault(); // Prevent default Enter behavior
this.blur(); // Will trigger the blur event
saveAndExitForField(); // Save directly instead of blur
}
});
@ -839,13 +842,16 @@
if (textarea.parentNode) {
textarea.parentNode.replaceChild(span, textarea);
}
// Important: Reset edit mode flag
isEditMode = false;
}
// Add keydown event to handle Enter key
textarea.addEventListener('keydown', function(e) {
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault(); // Prevent default Enter behavior
this.blur(); // Will trigger the blur event
saveAndExit(); // Save directly rather than triggering blur
}
});