Review app

This commit is contained in:
Jake Poznanski 2025-04-04 17:18:19 +00:00
parent e8c14fc496
commit a113fd3015
2 changed files with 32 additions and 14 deletions

View File

@ -20,6 +20,7 @@ PDF_TESTS = {}
ALL_PDFS = []
FORCE = False # New global flag
def find_next_unchecked_pdf() -> Optional[str]:
"""Find the next PDF with at least one unchecked test."""
global PDF_TESTS, ALL_PDFS
@ -31,6 +32,7 @@ def find_next_unchecked_pdf() -> Optional[str]:
return pdf_name
return None
def calculate_stats() -> dict:
"""Calculate statistics for all tests in the dataset."""
global PDF_TESTS
@ -58,6 +60,7 @@ def calculate_stats() -> dict:
return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
def save_dataset(jsonl_file: str) -> None:
"""Save the tests to a JSONL file, using temp file for atomic write."""
global PDF_TESTS
@ -75,12 +78,14 @@ def save_dataset(jsonl_file: str) -> None:
# Atomic replace
shutil.move(temp_file.name, jsonl_file)
@app.route("/pdf/<path:pdf_name>")
def serve_pdf(pdf_name):
    """Serve a PDF file from the dataset's ``pdfs`` directory.

    ``pdf_name`` is taken verbatim from the request URL, so it must not be
    trusted: joining it onto the dataset path and handing the result to
    ``send_file`` would let a request containing ``..`` segments read files
    outside the dataset. ``send_from_directory`` joins the path safely and
    responds with 404 when the target escapes the directory or does not exist.

    Args:
        pdf_name: Path of the PDF relative to ``<DATASET_DIR>/pdfs``.

    Returns:
        A Flask response streaming the file with the ``application/pdf``
        mimetype.
    """
    # Imported locally so this block does not depend on the module-level
    # flask import list already including send_from_directory.
    from flask import send_from_directory

    pdf_dir = os.path.join(DATASET_DIR, "pdfs")
    return send_from_directory(pdf_dir, pdf_name, mimetype="application/pdf")
@app.route("/")
def index():
"""Main page displaying the current PDF and its tests."""
@ -116,6 +121,7 @@ def index():
stats=stats,
)
@app.route("/update_test", methods=["POST"])
def update_test():
"""API endpoint to update a test."""
@ -138,6 +144,7 @@ def update_test():
return jsonify({"status": "success"})
@app.route("/reject_all", methods=["POST"])
def reject_all():
"""API endpoint to reject all tests for a PDF."""
@ -158,6 +165,7 @@ def reject_all():
return jsonify({"status": "error", "message": "PDF not found"})
@app.route("/next_pdf", methods=["POST"])
def next_pdf():
"""Move to the next PDF in the list."""
@ -181,6 +189,7 @@ def next_pdf():
return redirect(url_for("index"))
@app.route("/prev_pdf", methods=["POST"])
def prev_pdf():
"""Move to the previous PDF in the list."""
@ -193,6 +202,7 @@ def prev_pdf():
return redirect(url_for("index"))
@app.route("/goto_pdf/<int:index>", methods=["POST"])
def goto_pdf(index):
"""Go to a specific PDF by index."""
@ -203,6 +213,7 @@ def goto_pdf(index):
return redirect(url_for("index"))
def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
"""Load tests from the dataset file and organize them by PDF."""
if not os.path.exists(dataset_file):
@ -228,11 +239,13 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
return pdf_tests, all_pdfs
def create_templates_directory():
    """Ensure a ``templates`` folder exists next to this module for Flask."""
    module_dir = os.path.dirname(__file__)
    # exist_ok=True makes repeated calls a no-op when the folder is present.
    os.makedirs(os.path.join(module_dir, "templates"), exist_ok=True)
def main():
"""Main entry point with command-line arguments."""
global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE
@ -243,7 +256,7 @@ def main():
parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
parser.add_argument("--force", action="store_true", help="Force show each file one by one and never do the 'All done' page")
args = parser.parse_args()
FORCE = args.force # Set the global FORCE flag
@ -280,5 +293,6 @@ def main():
return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())

View File

@ -9,12 +9,12 @@ import re
import sqlite3
import string
import tempfile
import tinyhost
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from typing import Optional
import boto3
import tinyhost
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64webp
@ -45,7 +45,7 @@ def parse_args():
def generate_prolific_code(length=8):
    """Generate a random code for Prolific.

    Args:
        length: Number of characters in the generated code (default 8).

    Returns:
        A string of ``length`` characters drawn from uppercase ASCII letters
        and digits. A ``length`` of 0 yields an empty string.
    """
    # NOTE(review): `random` is not a cryptographically secure source; if these
    # codes must be unguessable, consider `secrets.choice` instead.
    characters = string.ascii_uppercase + string.digits
    return "".join(random.choice(characters) for _ in range(length))
def obfuscate_code(code):
@ -201,7 +201,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
"""Create an HTML file with rendered PDF pages."""
# Obfuscate the provided Prolific code
obfuscated_code = obfuscate_code(prolific_code)
# Get current date and time for the report
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -646,7 +646,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
except Exception as e:
# Add CSS class for the first annotation interface to be active by default
active_class = " active" if i == 0 else ""
html_content += f"""
<div class="page-container" data-index="{i}">
<div class="page-info">
@ -670,7 +670,8 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
</div>
"""
html_content += """
html_content += (
"""
</div>
<div class="completion-message" id="completion-message">
@ -678,7 +679,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
Your Prolific completion code is: <strong id="prolific-code">Loading...</strong>
</div>
<!-- Store the obfuscated code in a hidden element -->
<div id="obfuscated-code" style="display:none;">""" + obfuscated_code + """</div>
<div id="obfuscated-code" style="display:none;">"""
+ obfuscated_code
+ """</div>
<div class="annotation-progress" id="progress-bar">
<div class="progress-text">
@ -880,6 +883,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
</body>
</html>
"""
)
with open(output_path, "w") as f:
f.write(html_content)
@ -898,7 +902,7 @@ def generate_sample_set(args, i, s3_client, pdf_s3_client, result_files):
# Generate a unique Prolific code for this sample set
prolific_code = generate_prolific_code()
# Create HTML output with the Prolific code
create_html_output(random_pages, pdf_s3_client, output_filename, args.workspace, args.db_path, prolific_code)
@ -961,18 +965,18 @@ def main():
link = tinyhost.tinyhost([str(output_filename)])
links.append(link[0])
print(link)
# Create CSV file with tinyhost links and Prolific codes
csv_path = args.prolific_csv
print(f"Writing Prolific codes to {csv_path}")
with open(csv_path, 'w', newline='') as csvfile:
with open(csv_path, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['tinyhost_link', 'code'])
writer.writerow(["tinyhost_link", "code"])
for link, code in zip(links, prolific_codes):
writer.writerow([link, code])
print(f"Prolific codes written to {csv_path}")
if __name__ == "__main__":
    # Run the script entry point exactly once.
    main()