mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-26 23:53:31 +00:00
Review app
This commit is contained in:
parent
e8c14fc496
commit
a113fd3015
@ -20,6 +20,7 @@ PDF_TESTS = {}
|
||||
ALL_PDFS = []
|
||||
FORCE = False # New global flag
|
||||
|
||||
|
||||
def find_next_unchecked_pdf() -> Optional[str]:
|
||||
"""Find the next PDF with at least one unchecked test."""
|
||||
global PDF_TESTS, ALL_PDFS
|
||||
@ -31,6 +32,7 @@ def find_next_unchecked_pdf() -> Optional[str]:
|
||||
return pdf_name
|
||||
return None
|
||||
|
||||
|
||||
def calculate_stats() -> dict:
|
||||
"""Calculate statistics for all tests in the dataset."""
|
||||
global PDF_TESTS
|
||||
@ -58,6 +60,7 @@ def calculate_stats() -> dict:
|
||||
|
||||
return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
|
||||
|
||||
|
||||
def save_dataset(jsonl_file: str) -> None:
|
||||
"""Save the tests to a JSONL file, using temp file for atomic write."""
|
||||
global PDF_TESTS
|
||||
@ -75,12 +78,14 @@ def save_dataset(jsonl_file: str) -> None:
|
||||
# Atomic replace
|
||||
shutil.move(temp_file.name, jsonl_file)
|
||||
|
||||
|
||||
@app.route("/pdf/<path:pdf_name>")
def serve_pdf(pdf_name):
    """Serve a PDF file from the dataset's ``pdfs`` directory.

    Args:
        pdf_name: Path of the requested PDF, taken from the URL and
            interpreted relative to ``<DATASET_DIR>/pdfs``.

    Returns:
        The file response produced by ``send_file`` with the
        ``application/pdf`` mimetype, or a ``404`` response when the
        requested path would resolve outside the ``pdfs`` directory.
    """
    pdfs_dir = os.path.join(DATASET_DIR, "pdfs")
    pdf_path = os.path.join(pdfs_dir, pdf_name)

    # pdf_name comes straight from the request URL. ".." segments or an
    # absolute path would make os.path.join point outside the pdfs folder,
    # so compare the normalized locations before touching the filesystem.
    # The check is purely lexical (abspath, not realpath) so symlinked PDFs
    # inside the dataset keep working.
    allowed_root = os.path.abspath(pdfs_dir)
    requested = os.path.abspath(pdf_path)
    if not requested.startswith(allowed_root + os.sep):
        return "PDF not found", 404

    # Pass the same joined path as before so valid requests behave identically.
    return send_file(pdf_path, mimetype="application/pdf")
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def index():
|
||||
"""Main page displaying the current PDF and its tests."""
|
||||
@ -116,6 +121,7 @@ def index():
|
||||
stats=stats,
|
||||
)
|
||||
|
||||
|
||||
@app.route("/update_test", methods=["POST"])
|
||||
def update_test():
|
||||
"""API endpoint to update a test."""
|
||||
@ -138,6 +144,7 @@ def update_test():
|
||||
|
||||
return jsonify({"status": "success"})
|
||||
|
||||
|
||||
@app.route("/reject_all", methods=["POST"])
|
||||
def reject_all():
|
||||
"""API endpoint to reject all tests for a PDF."""
|
||||
@ -158,6 +165,7 @@ def reject_all():
|
||||
|
||||
return jsonify({"status": "error", "message": "PDF not found"})
|
||||
|
||||
|
||||
@app.route("/next_pdf", methods=["POST"])
|
||||
def next_pdf():
|
||||
"""Move to the next PDF in the list."""
|
||||
@ -181,6 +189,7 @@ def next_pdf():
|
||||
|
||||
return redirect(url_for("index"))
|
||||
|
||||
|
||||
@app.route("/prev_pdf", methods=["POST"])
|
||||
def prev_pdf():
|
||||
"""Move to the previous PDF in the list."""
|
||||
@ -193,6 +202,7 @@ def prev_pdf():
|
||||
|
||||
return redirect(url_for("index"))
|
||||
|
||||
|
||||
@app.route("/goto_pdf/<int:index>", methods=["POST"])
|
||||
def goto_pdf(index):
|
||||
"""Go to a specific PDF by index."""
|
||||
@ -203,6 +213,7 @@ def goto_pdf(index):
|
||||
|
||||
return redirect(url_for("index"))
|
||||
|
||||
|
||||
def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||
"""Load tests from the dataset file and organize them by PDF."""
|
||||
if not os.path.exists(dataset_file):
|
||||
@ -228,11 +239,13 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
|
||||
|
||||
return pdf_tests, all_pdfs
|
||||
|
||||
|
||||
def create_templates_directory():
    """Ensure a ``templates`` folder exists next to this module for Flask.

    The directory is created if missing; an already existing directory is
    left untouched.
    """
    module_dir = os.path.dirname(__file__)
    os.makedirs(os.path.join(module_dir, "templates"), exist_ok=True)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point with command-line arguments."""
|
||||
global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE
|
||||
@ -243,7 +256,7 @@ def main():
|
||||
parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
|
||||
parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
|
||||
parser.add_argument("--force", action="store_true", help="Force show each file one by one and never do the 'All done' page")
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
FORCE = args.force # Set the global FORCE flag
|
||||
|
||||
@ -280,5 +293,6 @@ def main():
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
@ -9,12 +9,12 @@ import re
|
||||
import sqlite3
|
||||
import string
|
||||
import tempfile
|
||||
import tinyhost
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Optional
|
||||
|
||||
import boto3
|
||||
import tinyhost
|
||||
from tqdm import tqdm
|
||||
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64webp
|
||||
@ -45,7 +45,7 @@ def parse_args():
|
||||
def generate_prolific_code(length=8):
    """Generate a random code for Prolific.

    Args:
        length: Number of characters in the generated code (default 8).

    Returns:
        A string of ``length`` characters drawn from the uppercase ASCII
        letters and the digits 0-9. A ``length`` of 0 yields ``""``.
    """
    characters = string.ascii_uppercase + string.digits
    # One draw per character via random.choice, so runs that seed the
    # ``random`` module keep producing the same codes as before.
    return "".join(random.choice(characters) for _ in range(length))
|
||||
|
||||
|
||||
def obfuscate_code(code):
|
||||
@ -201,7 +201,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
"""Create an HTML file with rendered PDF pages."""
|
||||
# Obfuscate the provided Prolific code
|
||||
obfuscated_code = obfuscate_code(prolific_code)
|
||||
|
||||
|
||||
# Get current date and time for the report
|
||||
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
@ -646,7 +646,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
except Exception as e:
|
||||
# Add CSS class for the first annotation interface to be active by default
|
||||
active_class = " active" if i == 0 else ""
|
||||
|
||||
|
||||
html_content += f"""
|
||||
<div class="page-container" data-index="{i}">
|
||||
<div class="page-info">
|
||||
@ -670,7 +670,8 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
</div>
|
||||
"""
|
||||
|
||||
html_content += """
|
||||
html_content += (
|
||||
"""
|
||||
</div>
|
||||
|
||||
<div class="completion-message" id="completion-message">
|
||||
@ -678,7 +679,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
Your Prolific completion code is: <strong id="prolific-code">Loading...</strong>
|
||||
</div>
|
||||
<!-- Store the obfuscated code in a hidden element -->
|
||||
<div id="obfuscated-code" style="display:none;">""" + obfuscated_code + """</div>
|
||||
<div id="obfuscated-code" style="display:none;">"""
|
||||
+ obfuscated_code
|
||||
+ """</div>
|
||||
|
||||
<div class="annotation-progress" id="progress-bar">
|
||||
<div class="progress-text">
|
||||
@ -880,6 +883,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
)
|
||||
|
||||
with open(output_path, "w") as f:
|
||||
f.write(html_content)
|
||||
@ -898,7 +902,7 @@ def generate_sample_set(args, i, s3_client, pdf_s3_client, result_files):
|
||||
|
||||
# Generate a unique Prolific code for this sample set
|
||||
prolific_code = generate_prolific_code()
|
||||
|
||||
|
||||
# Create HTML output with the Prolific code
|
||||
create_html_output(random_pages, pdf_s3_client, output_filename, args.workspace, args.db_path, prolific_code)
|
||||
|
||||
@ -961,18 +965,18 @@ def main():
|
||||
link = tinyhost.tinyhost([str(output_filename)])
|
||||
links.append(link[0])
|
||||
print(link)
|
||||
|
||||
|
||||
# Create CSV file with tinyhost links and Prolific codes
|
||||
csv_path = args.prolific_csv
|
||||
print(f"Writing Prolific codes to {csv_path}")
|
||||
with open(csv_path, 'w', newline='') as csvfile:
|
||||
with open(csv_path, "w", newline="") as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow(['tinyhost_link', 'code'])
|
||||
writer.writerow(["tinyhost_link", "code"])
|
||||
for link, code in zip(links, prolific_codes):
|
||||
writer.writerow([link, code])
|
||||
|
||||
|
||||
print(f"Prolific codes written to {csv_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user