Review app

2025-10-30 09:24:25 +00:00 · 2025-04-04 17:18:19 +00:00 · 2025-04-04 17:18:19 +00:00 · a113fd3015
commit a113fd3015
parent e8c14fc496
2 changed files with 32 additions and 14 deletions
--- a/olmocr/bench/review_app.py
+++ b/olmocr/bench/review_app.py
@@ -20,6 +20,7 @@ PDF_TESTS = {}
 ALL_PDFS = []
 FORCE = False  # New global flag
 def find_next_unchecked_pdf() -> Optional[str]:
    """Find the next PDF with at least one unchecked test."""
    global PDF_TESTS, ALL_PDFS
@@ -31,6 +32,7 @@ def find_next_unchecked_pdf() -> Optional[str]:
                return pdf_name
    return None
 def calculate_stats() -> dict:
    """Calculate statistics for all tests in the dataset."""
    global PDF_TESTS
@@ -58,6 +60,7 @@ def calculate_stats() -> dict:
    return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
 def save_dataset(jsonl_file: str) -> None:
    """Save the tests to a JSONL file, using temp file for atomic write."""
    global PDF_TESTS
@@ -75,12 +78,14 @@ def save_dataset(jsonl_file: str) -> None:
    # Atomic replace
    shutil.move(temp_file.name, jsonl_file)
@app.route("/pdf/<path:pdf_name>")
 def serve_pdf(pdf_name):
    """Serve the PDF file directly."""
    pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
    return send_file(pdf_path, mimetype="application/pdf")
@app.route("/")
 def index():
    """Main page displaying the current PDF and its tests."""
@@ -116,6 +121,7 @@ def index():
        stats=stats,
    )
@app.route("/update_test", methods=["POST"])
 def update_test():
    """API endpoint to update a test."""
@@ -138,6 +144,7 @@ def update_test():
    return jsonify({"status": "success"})
@app.route("/reject_all", methods=["POST"])
 def reject_all():
    """API endpoint to reject all tests for a PDF."""
@@ -158,6 +165,7 @@ def reject_all():
    return jsonify({"status": "error", "message": "PDF not found"})
@app.route("/next_pdf", methods=["POST"])
 def next_pdf():
    """Move to the next PDF in the list."""
@@ -181,6 +189,7 @@ def next_pdf():
    return redirect(url_for("index"))
@app.route("/prev_pdf", methods=["POST"])
 def prev_pdf():
    """Move to the previous PDF in the list."""
@@ -193,6 +202,7 @@ def prev_pdf():
    return redirect(url_for("index"))
@app.route("/goto_pdf/<int:index>", methods=["POST"])
 def goto_pdf(index):
    """Go to a specific PDF by index."""
@@ -203,6 +213,7 @@ def goto_pdf(index):
    return redirect(url_for("index"))
 def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
    """Load tests from the dataset file and organize them by PDF."""
    if not os.path.exists(dataset_file):
@@ -228,11 +239,13 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
    return pdf_tests, all_pdfs
 def create_templates_directory():
    """Create templates directory for Flask if it doesn't exist."""
    templates_dir = os.path.join(os.path.dirname(__file__), "templates")
    os.makedirs(templates_dir, exist_ok=True)
 def main():
    """Main entry point with command-line arguments."""
    global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE
@@ -280,5 +293,6 @@ def main():
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/scan_dolmadocs.py
+++ b/scripts/scan_dolmadocs.py
@@ -9,12 +9,12 @@ import re
 import sqlite3
 import string
 import tempfile
 import tinyhost
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Optional
 import boto3
 import tinyhost
 from tqdm import tqdm
 from olmocr.data.renderpdf import render_pdf_to_base64webp
@@ -45,7 +45,7 @@ def parse_args():
 def generate_prolific_code(length=8):
    """Generate a random code for Prolific."""
    characters = string.ascii_uppercase + string.digits
-    return ''.join(random.choice(characters) for _ in range(length))
+    return "".join(random.choice(characters) for _ in range(length))
 def obfuscate_code(code):
@@ -670,7 +670,8 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
            </div>
            """
-    html_content += """
+    html_content += (
        """
            </div>
            <div class="completion-message" id="completion-message">
@@ -678,7 +679,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
                Your Prolific completion code is: <strong id="prolific-code">Loading...</strong>
            </div>
            <!-- Store the obfuscated code in a hidden element -->
-            <div id="obfuscated-code" style="display:none;">""" + obfuscated_code + """</div>
+            <div id="obfuscated-code" style="display:none;">"""
        + obfuscated_code
        + """</div>
            <div class="annotation-progress" id="progress-bar">
                <div class="progress-text">
@@ -880,6 +883,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
    </body>
    </html>
    """
    )
    with open(output_path, "w") as f:
        f.write(html_content)
@@ -965,9 +969,9 @@ def main():
    # Create CSV file with tinyhost links and Prolific codes
    csv_path = args.prolific_csv
    print(f"Writing Prolific codes to {csv_path}")
-    with open(csv_path, 'w', newline='') as csvfile:
+    with open(csv_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
-        writer.writerow(['tinyhost_link', 'code'])
+        writer.writerow(["tinyhost_link", "code"])
        for link, code in zip(links, prolific_codes):
            writer.writerow([link, code])