Review app

This commit is contained in:
Jake Poznanski 2025-04-04 17:18:19 +00:00
parent e8c14fc496
commit a113fd3015
2 changed files with 32 additions and 14 deletions

View File

@ -20,6 +20,7 @@ PDF_TESTS = {}
ALL_PDFS = [] ALL_PDFS = []
FORCE = False # New global flag FORCE = False # New global flag
def find_next_unchecked_pdf() -> Optional[str]: def find_next_unchecked_pdf() -> Optional[str]:
"""Find the next PDF with at least one unchecked test.""" """Find the next PDF with at least one unchecked test."""
global PDF_TESTS, ALL_PDFS global PDF_TESTS, ALL_PDFS
@ -31,6 +32,7 @@ def find_next_unchecked_pdf() -> Optional[str]:
return pdf_name return pdf_name
return None return None
def calculate_stats() -> dict: def calculate_stats() -> dict:
"""Calculate statistics for all tests in the dataset.""" """Calculate statistics for all tests in the dataset."""
global PDF_TESTS global PDF_TESTS
@ -58,6 +60,7 @@ def calculate_stats() -> dict:
return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion} return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
def save_dataset(jsonl_file: str) -> None: def save_dataset(jsonl_file: str) -> None:
"""Save the tests to a JSONL file, using temp file for atomic write.""" """Save the tests to a JSONL file, using temp file for atomic write."""
global PDF_TESTS global PDF_TESTS
@ -75,12 +78,14 @@ def save_dataset(jsonl_file: str) -> None:
# Atomic replace # Atomic replace
shutil.move(temp_file.name, jsonl_file) shutil.move(temp_file.name, jsonl_file)
@app.route("/pdf/<path:pdf_name>")
def serve_pdf(pdf_name):
    """Serve a PDF from the dataset's ``pdfs`` directory.

    Args:
        pdf_name: Path of the PDF relative to ``<DATASET_DIR>/pdfs``, taken
            directly from the request URL.

    Returns:
        A Flask response carrying the file with the ``application/pdf``
        MIME type.

    Raises:
        werkzeug.exceptions.NotFound: If the file does not exist or
            ``pdf_name`` would resolve outside the ``pdfs`` directory.
    """
    # Imported locally so this block stands on its own regardless of which
    # Flask names the module-level import already pulls in.
    from flask import send_from_directory

    # pdf_name is untrusted URL input. send_from_directory joins it safely,
    # so "../" segments cannot escape the pdfs directory the way a plain
    # os.path.join + send_file would allow.
    pdf_dir = os.path.join(DATASET_DIR, "pdfs")
    return send_from_directory(pdf_dir, pdf_name, mimetype="application/pdf")
@app.route("/") @app.route("/")
def index(): def index():
"""Main page displaying the current PDF and its tests.""" """Main page displaying the current PDF and its tests."""
@ -116,6 +121,7 @@ def index():
stats=stats, stats=stats,
) )
@app.route("/update_test", methods=["POST"]) @app.route("/update_test", methods=["POST"])
def update_test(): def update_test():
"""API endpoint to update a test.""" """API endpoint to update a test."""
@ -138,6 +144,7 @@ def update_test():
return jsonify({"status": "success"}) return jsonify({"status": "success"})
@app.route("/reject_all", methods=["POST"]) @app.route("/reject_all", methods=["POST"])
def reject_all(): def reject_all():
"""API endpoint to reject all tests for a PDF.""" """API endpoint to reject all tests for a PDF."""
@ -158,6 +165,7 @@ def reject_all():
return jsonify({"status": "error", "message": "PDF not found"}) return jsonify({"status": "error", "message": "PDF not found"})
@app.route("/next_pdf", methods=["POST"]) @app.route("/next_pdf", methods=["POST"])
def next_pdf(): def next_pdf():
"""Move to the next PDF in the list.""" """Move to the next PDF in the list."""
@ -181,6 +189,7 @@ def next_pdf():
return redirect(url_for("index")) return redirect(url_for("index"))
@app.route("/prev_pdf", methods=["POST"]) @app.route("/prev_pdf", methods=["POST"])
def prev_pdf(): def prev_pdf():
"""Move to the previous PDF in the list.""" """Move to the previous PDF in the list."""
@ -193,6 +202,7 @@ def prev_pdf():
return redirect(url_for("index")) return redirect(url_for("index"))
@app.route("/goto_pdf/<int:index>", methods=["POST"]) @app.route("/goto_pdf/<int:index>", methods=["POST"])
def goto_pdf(index): def goto_pdf(index):
"""Go to a specific PDF by index.""" """Go to a specific PDF by index."""
@ -203,6 +213,7 @@ def goto_pdf(index):
return redirect(url_for("index")) return redirect(url_for("index"))
def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]: def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
"""Load tests from the dataset file and organize them by PDF.""" """Load tests from the dataset file and organize them by PDF."""
if not os.path.exists(dataset_file): if not os.path.exists(dataset_file):
@ -228,11 +239,13 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
return pdf_tests, all_pdfs return pdf_tests, all_pdfs
def create_templates_directory() -> None:
    """Ensure a ``templates`` directory exists next to this module.

    The directory is created alongside the file that defines this function
    (``os.path.dirname(__file__)``). Calling it when the directory already
    exists is a no-op.
    """
    templates_dir = os.path.join(os.path.dirname(__file__), "templates")
    # exist_ok=True keeps repeated start-ups from failing on an existing dir.
    os.makedirs(templates_dir, exist_ok=True)
def main(): def main():
"""Main entry point with command-line arguments.""" """Main entry point with command-line arguments."""
global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE
@ -280,5 +293,6 @@ def main():
return 0 return 0
if __name__ == "__main__": if __name__ == "__main__":
sys.exit(main()) sys.exit(main())

View File

@ -9,12 +9,12 @@ import re
import sqlite3 import sqlite3
import string import string
import tempfile import tempfile
import tinyhost
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Tuple from typing import Optional
import boto3 import boto3
import tinyhost
from tqdm import tqdm from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64webp from olmocr.data.renderpdf import render_pdf_to_base64webp
@ -45,7 +45,7 @@ def parse_args():
def generate_prolific_code(length=8):
    """Generate a random completion code for Prolific.

    Args:
        length: Number of characters in the code (default 8). A value of 0
            or less yields an empty string.

    Returns:
        A string of ``length`` characters drawn from uppercase ASCII letters
        and digits.
    """
    characters = string.ascii_uppercase + string.digits
    # NOTE(review): `random` is not cryptographically secure. It is kept here
    # so runs that seed the global RNG stay reproducible; consider
    # `secrets.choice` if codes must be unguessable - confirm with callers.
    return "".join(random.choice(characters) for _ in range(length))
def obfuscate_code(code): def obfuscate_code(code):
@ -670,7 +670,8 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
</div> </div>
""" """
html_content += """ html_content += (
"""
</div> </div>
<div class="completion-message" id="completion-message"> <div class="completion-message" id="completion-message">
@ -678,7 +679,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
Your Prolific completion code is: <strong id="prolific-code">Loading...</strong> Your Prolific completion code is: <strong id="prolific-code">Loading...</strong>
</div> </div>
<!-- Store the obfuscated code in a hidden element --> <!-- Store the obfuscated code in a hidden element -->
<div id="obfuscated-code" style="display:none;">""" + obfuscated_code + """</div> <div id="obfuscated-code" style="display:none;">"""
+ obfuscated_code
+ """</div>
<div class="annotation-progress" id="progress-bar"> <div class="annotation-progress" id="progress-bar">
<div class="progress-text"> <div class="progress-text">
@ -880,6 +883,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
</body> </body>
</html> </html>
""" """
)
with open(output_path, "w") as f: with open(output_path, "w") as f:
f.write(html_content) f.write(html_content)
@ -965,9 +969,9 @@ def main():
# Create CSV file with tinyhost links and Prolific codes # Create CSV file with tinyhost links and Prolific codes
csv_path = args.prolific_csv csv_path = args.prolific_csv
print(f"Writing Prolific codes to {csv_path}") print(f"Writing Prolific codes to {csv_path}")
with open(csv_path, 'w', newline='') as csvfile: with open(csv_path, "w", newline="") as csvfile:
writer = csv.writer(csvfile) writer = csv.writer(csvfile)
writer.writerow(['tinyhost_link', 'code']) writer.writerow(["tinyhost_link", "code"])
for link, code in zip(links, prolific_codes): for link, code in zip(links, prolific_codes):
writer.writerow([link, code]) writer.writerow([link, code])