mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-31 01:55:06 +00:00 
			
		
		
		
	Review app
This commit is contained in:
		
							parent
							
								
									e8c14fc496
								
							
						
					
					
						commit
						a113fd3015
					
				| @ -20,6 +20,7 @@ PDF_TESTS = {} | |||||||
| ALL_PDFS = [] | ALL_PDFS = [] | ||||||
| FORCE = False  # New global flag | FORCE = False  # New global flag | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def find_next_unchecked_pdf() -> Optional[str]: | def find_next_unchecked_pdf() -> Optional[str]: | ||||||
|     """Find the next PDF with at least one unchecked test.""" |     """Find the next PDF with at least one unchecked test.""" | ||||||
|     global PDF_TESTS, ALL_PDFS |     global PDF_TESTS, ALL_PDFS | ||||||
| @ -31,6 +32,7 @@ def find_next_unchecked_pdf() -> Optional[str]: | |||||||
|                 return pdf_name |                 return pdf_name | ||||||
|     return None |     return None | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def calculate_stats() -> dict: | def calculate_stats() -> dict: | ||||||
|     """Calculate statistics for all tests in the dataset.""" |     """Calculate statistics for all tests in the dataset.""" | ||||||
|     global PDF_TESTS |     global PDF_TESTS | ||||||
| @ -58,6 +60,7 @@ def calculate_stats() -> dict: | |||||||
| 
 | 
 | ||||||
|     return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion} |     return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion} | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def save_dataset(jsonl_file: str) -> None: | def save_dataset(jsonl_file: str) -> None: | ||||||
|     """Save the tests to a JSONL file, using temp file for atomic write.""" |     """Save the tests to a JSONL file, using temp file for atomic write.""" | ||||||
|     global PDF_TESTS |     global PDF_TESTS | ||||||
| @ -75,12 +78,14 @@ def save_dataset(jsonl_file: str) -> None: | |||||||
|     # Atomic replace |     # Atomic replace | ||||||
|     shutil.move(temp_file.name, jsonl_file) |     shutil.move(temp_file.name, jsonl_file) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @app.route("/pdf/<path:pdf_name>") | @app.route("/pdf/<path:pdf_name>") | ||||||
| def serve_pdf(pdf_name): | def serve_pdf(pdf_name): | ||||||
|     """Serve the PDF file directly.""" |     """Serve the PDF file directly.""" | ||||||
|     pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name) |     pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name) | ||||||
|     return send_file(pdf_path, mimetype="application/pdf") |     return send_file(pdf_path, mimetype="application/pdf") | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @app.route("/") | @app.route("/") | ||||||
| def index(): | def index(): | ||||||
|     """Main page displaying the current PDF and its tests.""" |     """Main page displaying the current PDF and its tests.""" | ||||||
| @ -116,6 +121,7 @@ def index(): | |||||||
|         stats=stats, |         stats=stats, | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @app.route("/update_test", methods=["POST"]) | @app.route("/update_test", methods=["POST"]) | ||||||
| def update_test(): | def update_test(): | ||||||
|     """API endpoint to update a test.""" |     """API endpoint to update a test.""" | ||||||
| @ -138,6 +144,7 @@ def update_test(): | |||||||
| 
 | 
 | ||||||
|     return jsonify({"status": "success"}) |     return jsonify({"status": "success"}) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @app.route("/reject_all", methods=["POST"]) | @app.route("/reject_all", methods=["POST"]) | ||||||
| def reject_all(): | def reject_all(): | ||||||
|     """API endpoint to reject all tests for a PDF.""" |     """API endpoint to reject all tests for a PDF.""" | ||||||
| @ -158,6 +165,7 @@ def reject_all(): | |||||||
| 
 | 
 | ||||||
|     return jsonify({"status": "error", "message": "PDF not found"}) |     return jsonify({"status": "error", "message": "PDF not found"}) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @app.route("/next_pdf", methods=["POST"]) | @app.route("/next_pdf", methods=["POST"]) | ||||||
| def next_pdf(): | def next_pdf(): | ||||||
|     """Move to the next PDF in the list.""" |     """Move to the next PDF in the list.""" | ||||||
| @ -181,6 +189,7 @@ def next_pdf(): | |||||||
| 
 | 
 | ||||||
|     return redirect(url_for("index")) |     return redirect(url_for("index")) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @app.route("/prev_pdf", methods=["POST"]) | @app.route("/prev_pdf", methods=["POST"]) | ||||||
| def prev_pdf(): | def prev_pdf(): | ||||||
|     """Move to the previous PDF in the list.""" |     """Move to the previous PDF in the list.""" | ||||||
| @ -193,6 +202,7 @@ def prev_pdf(): | |||||||
| 
 | 
 | ||||||
|     return redirect(url_for("index")) |     return redirect(url_for("index")) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| @app.route("/goto_pdf/<int:index>", methods=["POST"]) | @app.route("/goto_pdf/<int:index>", methods=["POST"]) | ||||||
| def goto_pdf(index): | def goto_pdf(index): | ||||||
|     """Go to a specific PDF by index.""" |     """Go to a specific PDF by index.""" | ||||||
| @ -203,6 +213,7 @@ def goto_pdf(index): | |||||||
| 
 | 
 | ||||||
|     return redirect(url_for("index")) |     return redirect(url_for("index")) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]: | def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]: | ||||||
|     """Load tests from the dataset file and organize them by PDF.""" |     """Load tests from the dataset file and organize them by PDF.""" | ||||||
|     if not os.path.exists(dataset_file): |     if not os.path.exists(dataset_file): | ||||||
| @ -228,11 +239,13 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]: | |||||||
| 
 | 
 | ||||||
|     return pdf_tests, all_pdfs |     return pdf_tests, all_pdfs | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def create_templates_directory(): | def create_templates_directory(): | ||||||
|     """Create templates directory for Flask if it doesn't exist.""" |     """Create templates directory for Flask if it doesn't exist.""" | ||||||
|     templates_dir = os.path.join(os.path.dirname(__file__), "templates") |     templates_dir = os.path.join(os.path.dirname(__file__), "templates") | ||||||
|     os.makedirs(templates_dir, exist_ok=True) |     os.makedirs(templates_dir, exist_ok=True) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def main(): | def main(): | ||||||
|     """Main entry point with command-line arguments.""" |     """Main entry point with command-line arguments.""" | ||||||
|     global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE |     global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE | ||||||
| @ -280,5 +293,6 @@ def main(): | |||||||
| 
 | 
 | ||||||
|     return 0 |     return 0 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     sys.exit(main()) |     sys.exit(main()) | ||||||
|  | |||||||
| @ -9,12 +9,12 @@ import re | |||||||
| import sqlite3 | import sqlite3 | ||||||
| import string | import string | ||||||
| import tempfile | import tempfile | ||||||
| import tinyhost |  | ||||||
| from concurrent.futures import ThreadPoolExecutor | from concurrent.futures import ThreadPoolExecutor | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from typing import Dict, List, Optional, Tuple | from typing import Optional | ||||||
| 
 | 
 | ||||||
| import boto3 | import boto3 | ||||||
|  | import tinyhost | ||||||
| from tqdm import tqdm | from tqdm import tqdm | ||||||
| 
 | 
 | ||||||
| from olmocr.data.renderpdf import render_pdf_to_base64webp | from olmocr.data.renderpdf import render_pdf_to_base64webp | ||||||
| @ -45,7 +45,7 @@ def parse_args(): | |||||||
| def generate_prolific_code(length=8): | def generate_prolific_code(length=8): | ||||||
|     """Generate a random code for Prolific.""" |     """Generate a random code for Prolific.""" | ||||||
|     characters = string.ascii_uppercase + string.digits |     characters = string.ascii_uppercase + string.digits | ||||||
|     return ''.join(random.choice(characters) for _ in range(length)) |     return "".join(random.choice(characters) for _ in range(length)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def obfuscate_code(code): | def obfuscate_code(code): | ||||||
| @ -670,7 +670,8 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, | |||||||
|             </div> |             </div> | ||||||
|             """ |             """ | ||||||
| 
 | 
 | ||||||
|     html_content += """ |     html_content += ( | ||||||
|  |         """ | ||||||
|             </div> |             </div> | ||||||
|              |              | ||||||
|             <div class="completion-message" id="completion-message"> |             <div class="completion-message" id="completion-message"> | ||||||
| @ -678,7 +679,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, | |||||||
|                 Your Prolific completion code is: <strong id="prolific-code">Loading...</strong> |                 Your Prolific completion code is: <strong id="prolific-code">Loading...</strong> | ||||||
|             </div> |             </div> | ||||||
|             <!-- Store the obfuscated code in a hidden element --> |             <!-- Store the obfuscated code in a hidden element --> | ||||||
|             <div id="obfuscated-code" style="display:none;">""" + obfuscated_code + """</div> |             <div id="obfuscated-code" style="display:none;">""" | ||||||
|  |         + obfuscated_code | ||||||
|  |         + """</div> | ||||||
|              |              | ||||||
|             <div class="annotation-progress" id="progress-bar"> |             <div class="annotation-progress" id="progress-bar"> | ||||||
|                 <div class="progress-text"> |                 <div class="progress-text"> | ||||||
| @ -880,6 +883,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, | |||||||
|     </body> |     </body> | ||||||
|     </html> |     </html> | ||||||
|     """ |     """ | ||||||
|  |     ) | ||||||
| 
 | 
 | ||||||
|     with open(output_path, "w") as f: |     with open(output_path, "w") as f: | ||||||
|         f.write(html_content) |         f.write(html_content) | ||||||
| @ -965,9 +969,9 @@ def main(): | |||||||
|     # Create CSV file with tinyhost links and Prolific codes |     # Create CSV file with tinyhost links and Prolific codes | ||||||
|     csv_path = args.prolific_csv |     csv_path = args.prolific_csv | ||||||
|     print(f"Writing Prolific codes to {csv_path}") |     print(f"Writing Prolific codes to {csv_path}") | ||||||
|     with open(csv_path, 'w', newline='') as csvfile: |     with open(csv_path, "w", newline="") as csvfile: | ||||||
|         writer = csv.writer(csvfile) |         writer = csv.writer(csvfile) | ||||||
|         writer.writerow(['tinyhost_link', 'code']) |         writer.writerow(["tinyhost_link", "code"]) | ||||||
|         for link, code in zip(links, prolific_codes): |         for link, code in zip(links, prolific_codes): | ||||||
|             writer.writerow([link, code]) |             writer.writerow([link, code]) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Jake Poznanski
						Jake Poznanski