mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-13 01:02:26 +00:00
Review app is much nicer now
This commit is contained in:
parent
5ec96476c9
commit
d620722a0e
@ -15,8 +15,6 @@ Usage:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
@ -136,13 +134,7 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/png;base64,{image_base64}",
|
||||
"detail": "high"
|
||||
}
|
||||
},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}", "detail": "high"}},
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
@ -150,9 +142,9 @@ def detect_tables(pdf_path: str, page_num: int, api_key: str) -> Optional[Tuple[
|
||||
"Output equations as Latex escaped with $$. "
|
||||
"Output tables in valid HTML format that preserves the structure and content exactly. "
|
||||
"Output figures with just a simple markdown image placeholder."
|
||||
)
|
||||
}
|
||||
]
|
||||
),
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
temperature=0.2,
|
||||
@ -278,18 +270,9 @@ def generate_table_tests(tables: List[np.ndarray], pdf_image: str, api_key: str,
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/png;base64,{pdf_image}",
|
||||
"detail": "high"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{pdf_image}", "detail": "high"}},
|
||||
{"type": "text", "text": prompt},
|
||||
],
|
||||
}
|
||||
],
|
||||
temperature=0.2,
|
||||
|
@ -8,12 +8,9 @@ import tempfile
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import flask
|
||||
from flask import Flask, render_template, request, jsonify, redirect, url_for, send_file
|
||||
from werkzeug.utils import secure_filename
|
||||
from flask import Flask, jsonify, redirect, render_template, request, send_file, url_for
|
||||
|
||||
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||
from . import tests
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@ -49,25 +46,19 @@ def calculate_stats() -> dict:
|
||||
total_tests += len(pdf_tests)
|
||||
|
||||
for test in pdf_tests:
|
||||
status = test.get('checked')
|
||||
status = test.get("checked")
|
||||
if status is None:
|
||||
null_status += 1
|
||||
elif status == 'verified':
|
||||
elif status == "verified":
|
||||
verified_status += 1
|
||||
elif status == 'rejected':
|
||||
elif status == "rejected":
|
||||
rejected_status += 1
|
||||
|
||||
completion = 0
|
||||
if total_tests > 0:
|
||||
completion = (verified_status + rejected_status) / total_tests * 100
|
||||
|
||||
return {
|
||||
'total': total_tests,
|
||||
'null': null_status,
|
||||
'verified': verified_status,
|
||||
'rejected': rejected_status,
|
||||
'completion': completion
|
||||
}
|
||||
return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
|
||||
|
||||
|
||||
def save_dataset(jsonl_file: str) -> None:
|
||||
@ -80,7 +71,7 @@ def save_dataset(jsonl_file: str) -> None:
|
||||
all_tests.extend(pdf_tests)
|
||||
|
||||
# Create temp file and write updated content
|
||||
with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
|
||||
with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
|
||||
for test in all_tests:
|
||||
temp_file.write(json.dumps(test) + "\n")
|
||||
|
||||
@ -88,14 +79,14 @@ def save_dataset(jsonl_file: str) -> None:
|
||||
shutil.move(temp_file.name, jsonl_file)
|
||||
|
||||
|
||||
@app.route('/pdf/<path:pdf_name>')
|
||||
@app.route("/pdf/<path:pdf_name>")
|
||||
def serve_pdf(pdf_name):
|
||||
"""Serve the PDF file directly."""
|
||||
pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
|
||||
return send_file(pdf_path, mimetype='application/pdf')
|
||||
return send_file(pdf_path, mimetype="application/pdf")
|
||||
|
||||
|
||||
@app.route('/')
|
||||
@app.route("/")
|
||||
def index():
|
||||
"""Main page displaying the current PDF and its tests."""
|
||||
global CURRENT_PDF, PDF_TESTS, DATASET_DIR
|
||||
@ -106,42 +97,42 @@ def index():
|
||||
|
||||
# If still no PDF, all tests have been checked
|
||||
if CURRENT_PDF is None:
|
||||
return render_template('all_done.html')
|
||||
return render_template("all_done.html")
|
||||
|
||||
# Get the tests for the current PDF
|
||||
current_tests = PDF_TESTS.get(CURRENT_PDF, [])
|
||||
|
||||
# Create PDF URL for pdf.js to load
|
||||
pdf_url = url_for('serve_pdf', pdf_name=CURRENT_PDF)
|
||||
pdf_url = url_for("serve_pdf", pdf_name=CURRENT_PDF)
|
||||
|
||||
# Calculate statistics
|
||||
stats = calculate_stats()
|
||||
|
||||
return render_template(
|
||||
'review.html',
|
||||
"review.html",
|
||||
pdf_name=CURRENT_PDF,
|
||||
tests=current_tests,
|
||||
pdf_path=pdf_url,
|
||||
pdf_index=ALL_PDFS.index(CURRENT_PDF) if CURRENT_PDF in ALL_PDFS else 0,
|
||||
total_pdfs=len(ALL_PDFS),
|
||||
stats=stats
|
||||
stats=stats,
|
||||
)
|
||||
|
||||
|
||||
@app.route('/update_test', methods=['POST'])
|
||||
@app.route("/update_test", methods=["POST"])
|
||||
def update_test():
|
||||
"""API endpoint to update a test."""
|
||||
global PDF_TESTS, DATASET_DIR
|
||||
|
||||
data = request.json
|
||||
pdf_name = data.get('pdf')
|
||||
test_id = data.get('id')
|
||||
field = data.get('field')
|
||||
value = data.get('value')
|
||||
pdf_name = data.get("pdf")
|
||||
test_id = data.get("id")
|
||||
field = data.get("field")
|
||||
value = data.get("value")
|
||||
|
||||
# Find and update the test
|
||||
for test in PDF_TESTS.get(pdf_name, []):
|
||||
if test.get('id') == test_id:
|
||||
if test.get("id") == test_id:
|
||||
test[field] = value
|
||||
break
|
||||
|
||||
@ -152,32 +143,29 @@ def update_test():
|
||||
return jsonify({"status": "success"})
|
||||
|
||||
|
||||
@app.route('/reject_all', methods=['POST'])
|
||||
@app.route("/reject_all", methods=["POST"])
|
||||
def reject_all():
|
||||
"""API endpoint to reject all tests for a PDF."""
|
||||
global PDF_TESTS, DATASET_DIR
|
||||
|
||||
data = request.json
|
||||
pdf_name = data.get('pdf')
|
||||
pdf_name = data.get("pdf")
|
||||
|
||||
if pdf_name and pdf_name in PDF_TESTS:
|
||||
# Update all tests for this PDF to rejected
|
||||
for test in PDF_TESTS[pdf_name]:
|
||||
test['checked'] = 'rejected'
|
||||
test["checked"] = "rejected"
|
||||
|
||||
# Save the updated tests
|
||||
dataset_file = os.path.join(DATASET_DIR, "table_tests.jsonl")
|
||||
save_dataset(dataset_file)
|
||||
|
||||
return jsonify({
|
||||
"status": "success",
|
||||
"count": len(PDF_TESTS[pdf_name])
|
||||
})
|
||||
return jsonify({"status": "success", "count": len(PDF_TESTS[pdf_name])})
|
||||
|
||||
return jsonify({"status": "error", "message": "PDF not found"})
|
||||
|
||||
|
||||
@app.route('/next_pdf', methods=['POST'])
|
||||
@app.route("/next_pdf", methods=["POST"])
|
||||
def next_pdf():
|
||||
"""Move to the next PDF in the list."""
|
||||
global CURRENT_PDF, ALL_PDFS
|
||||
@ -191,10 +179,10 @@ def next_pdf():
|
||||
else:
|
||||
CURRENT_PDF = find_next_unchecked_pdf()
|
||||
|
||||
return redirect(url_for('index'))
|
||||
return redirect(url_for("index"))
|
||||
|
||||
|
||||
@app.route('/prev_pdf', methods=['POST'])
|
||||
@app.route("/prev_pdf", methods=["POST"])
|
||||
def prev_pdf():
|
||||
"""Move to the previous PDF in the list."""
|
||||
global CURRENT_PDF, ALL_PDFS
|
||||
@ -204,10 +192,10 @@ def prev_pdf():
|
||||
if current_index > 0:
|
||||
CURRENT_PDF = ALL_PDFS[current_index - 1]
|
||||
|
||||
return redirect(url_for('index'))
|
||||
return redirect(url_for("index"))
|
||||
|
||||
|
||||
@app.route('/goto_pdf/<int:index>', methods=['POST'])
|
||||
@app.route("/goto_pdf/<int:index>", methods=["POST"])
|
||||
def goto_pdf(index):
|
||||
"""Go to a specific PDF by index."""
|
||||
global CURRENT_PDF, ALL_PDFS
|
||||
@ -215,7 +203,7 @@ def goto_pdf(index):
|
||||
if 0 <= index < len(ALL_PDFS):
|
||||
CURRENT_PDF = ALL_PDFS[index]
|
||||
|
||||
return redirect(url_for('index'))
|
||||
return redirect(url_for("index"))
|
||||
|
||||
|
||||
def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||
@ -235,7 +223,7 @@ def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||
|
||||
try:
|
||||
test = json.loads(line)
|
||||
pdf_name = test.get('pdf')
|
||||
pdf_name = test.get("pdf")
|
||||
if pdf_name:
|
||||
pdf_tests[pdf_name].append(test)
|
||||
except json.JSONDecodeError:
|
||||
@ -248,13 +236,13 @@ def load_dataset(dataset_dir: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||
|
||||
def create_templates_directory():
|
||||
"""Create templates directory for Flask if it doesn't exist."""
|
||||
templates_dir = os.path.join(os.path.dirname(__file__), 'templates')
|
||||
templates_dir = os.path.join(os.path.dirname(__file__), "templates")
|
||||
os.makedirs(templates_dir, exist_ok=True)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point with command-line arguments."""
|
||||
global DATASET_DIR, PDF_TESTS, ALL_PDFS
|
||||
global DATASET_DIR, PDF_TESTS, ALL_PDFS, CURRENT_PDF
|
||||
|
||||
parser = argparse.ArgumentParser(description="Interactive Test Review App")
|
||||
parser.add_argument("dataset_dir", help="Path to the dataset directory containing table_tests.jsonl and pdfs/ folder")
|
||||
|
@ -782,13 +782,16 @@
|
||||
if (textarea.parentNode) {
|
||||
textarea.parentNode.replaceChild(span, textarea);
|
||||
}
|
||||
|
||||
// Important: Reset edit mode flag
|
||||
isEditMode = false;
|
||||
}
|
||||
|
||||
// Add keydown event to handle Enter key
|
||||
textarea.addEventListener('keydown', function(e) {
|
||||
if (e.key === 'Enter' && !e.shiftKey) {
|
||||
e.preventDefault(); // Prevent default Enter behavior
|
||||
this.blur(); // Will trigger the blur event
|
||||
saveAndExitForField(); // Save directly instead of blur
|
||||
}
|
||||
});
|
||||
|
||||
@ -839,13 +842,16 @@
|
||||
if (textarea.parentNode) {
|
||||
textarea.parentNode.replaceChild(span, textarea);
|
||||
}
|
||||
|
||||
// Important: Reset edit mode flag
|
||||
isEditMode = false;
|
||||
}
|
||||
|
||||
// Add keydown event to handle Enter key
|
||||
textarea.addEventListener('keydown', function(e) {
|
||||
if (e.key === 'Enter' && !e.shiftKey) {
|
||||
e.preventDefault(); // Prevent default Enter behavior
|
||||
this.blur(); // Will trigger the blur event
|
||||
saveAndExit(); // Save directly rather than triggering blur
|
||||
}
|
||||
});
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user