diff --git a/olmocr/bench/review_app.py b/olmocr/bench/review_app.py index e07e9c6..6776398 100644 --- a/olmocr/bench/review_app.py +++ b/olmocr/bench/review_app.py @@ -20,6 +20,7 @@ PDF_TESTS = {} ALL_PDFS = [] FORCE = False # New global flag + def find_next_unchecked_pdf() -> Optional[str]: """Find the next PDF with at least one unchecked test.""" global PDF_TESTS, ALL_PDFS @@ -31,6 +32,7 @@ def find_next_unchecked_pdf() -> Optional[str]: return pdf_name return None + def calculate_stats() -> dict: """Calculate statistics for all tests in the dataset.""" global PDF_TESTS @@ -58,6 +60,7 @@ def calculate_stats() -> dict: return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion} + def save_dataset(jsonl_file: str) -> None: """Save the tests to a JSONL file, using temp file for atomic write.""" global PDF_TESTS @@ -75,12 +78,14 @@ def save_dataset(jsonl_file: str) -> None: # Atomic replace shutil.move(temp_file.name, jsonl_file) + @app.route("/pdf/") def serve_pdf(pdf_name): """Serve the PDF file directly.""" pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name) return send_file(pdf_path, mimetype="application/pdf") + @app.route("/") def index(): """Main page displaying the current PDF and its tests.""" @@ -116,6 +121,7 @@ def index(): stats=stats, ) + @app.route("/update_test", methods=["POST"]) def update_test(): """API endpoint to update a test.""" @@ -138,6 +144,7 @@ def update_test(): return jsonify({"status": "success"}) + @app.route("/reject_all", methods=["POST"]) def reject_all(): """API endpoint to reject all tests for a PDF.""" @@ -158,6 +165,7 @@ def reject_all(): return jsonify({"status": "error", "message": "PDF not found"}) + @app.route("/next_pdf", methods=["POST"]) def next_pdf(): """Move to the next PDF in the list.""" @@ -181,6 +189,7 @@ def next_pdf(): return redirect(url_for("index")) + @app.route("/prev_pdf", methods=["POST"]) def prev_pdf(): """Move to the previous PDF in the list.""" @@ -193,6 +202,7 @@ def prev_pdf(): return redirect(url_for("index")) + @app.route("/goto_pdf/", methods=["POST"]) def goto_pdf(index): """Go to a specific PDF by index.""" @@ -203,6 +213,7 @@ def goto_pdf(index): return redirect(url_for("index")) + def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]: """Load tests from the dataset file and organize them by PDF.""" if not os.path.exists(dataset_file): @@ -228,11 +239,13 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]: return pdf_tests, all_pdfs + def create_templates_directory(): """Create templates directory for Flask if it doesn't exist.""" templates_dir = os.path.join(os.path.dirname(__file__), "templates") os.makedirs(templates_dir, exist_ok=True) + def main(): """Main entry point with command-line arguments.""" global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE @@ -243,7 +256,7 @@ def main(): parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app") parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode") parser.add_argument("--force", action="store_true", help="Force show each file one by one and never do the 'All done' page") - + args = parser.parse_args() FORCE = args.force # Set the global FORCE flag @@ -280,5 +293,6 @@ def main(): return 0 + if __name__ == "__main__": sys.exit(main()) diff --git a/scripts/scan_dolmadocs.py b/scripts/scan_dolmadocs.py index a0b54ff..dac0b02 100644 --- a/scripts/scan_dolmadocs.py +++ b/scripts/scan_dolmadocs.py @@ -9,12 +9,12 @@ import re import sqlite3 import string import tempfile -import tinyhost from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Optional import boto3 +import tinyhost from tqdm import tqdm from olmocr.data.renderpdf import render_pdf_to_base64webp @@ -45,7 +45,7 @@ def parse_args(): def generate_prolific_code(length=8): """Generate a random code for Prolific.""" characters = string.ascii_uppercase + string.digits - return ''.join(random.choice(characters) for _ in range(length)) + return "".join(random.choice(characters) for _ in range(length)) def obfuscate_code(code): @@ -201,7 +201,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, """Create an HTML file with rendered PDF pages.""" # Obfuscate the provided Prolific code obfuscated_code = obfuscate_code(prolific_code) - + # Get current date and time for the report current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -646,7 +646,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, except Exception as e: # Add CSS class for the first annotation interface to be active by default active_class = " active" if i == 0 else "" - + html_content += f"""
@@ -670,7 +670,8 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
""" - html_content += """ + html_content += ( + """
@@ -678,7 +679,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, Your Prolific completion code is: Loading...
- +
@@ -880,6 +883,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, """ + ) with open(output_path, "w") as f: f.write(html_content) @@ -898,7 +902,7 @@ def generate_sample_set(args, i, s3_client, pdf_s3_client, result_files): # Generate a unique Prolific code for this sample set prolific_code = generate_prolific_code() - + # Create HTML output with the Prolific code create_html_output(random_pages, pdf_s3_client, output_filename, args.workspace, args.db_path, prolific_code) @@ -961,18 +965,18 @@ def main(): link = tinyhost.tinyhost([str(output_filename)]) links.append(link[0]) print(link) - + # Create CSV file with tinyhost links and Prolific codes csv_path = args.prolific_csv print(f"Writing Prolific codes to {csv_path}") - with open(csv_path, 'w', newline='') as csvfile: + with open(csv_path, "w", newline="") as csvfile: writer = csv.writer(csvfile) - writer.writerow(['tinyhost_link', 'code']) + writer.writerow(["tinyhost_link", "code"]) for link, code in zip(links, prolific_codes): writer.writerow([link, code]) - + print(f"Prolific codes written to {csv_path}") if __name__ == "__main__": - main() \ No newline at end of file + main()