olmocr/scripts/scan_dolmadocs.py

1514 lines
68 KiB
Python
Raw Normal View History

2025-03-26 18:26:06 +00:00
import argparse
2025-04-04 17:12:46 +00:00
import base64
import csv
import datetime
2025-03-26 18:26:06 +00:00
import json
import os
import random
import re
import sqlite3
import tempfile
from concurrent.futures import ThreadPoolExecutor
2025-03-26 18:26:06 +00:00
from pathlib import Path
2025-04-04 19:44:54 +00:00
from typing import Any, Dict, List, Optional, Tuple
import boto3
2025-04-04 19:44:54 +00:00
import requests
2025-04-04 17:18:19 +00:00
import tinyhost
2025-03-26 18:26:06 +00:00
from tqdm import tqdm
2025-03-26 18:26:06 +00:00
from olmocr.data.renderpdf import render_pdf_to_base64webp
from olmocr.s3_utils import get_s3_bytes, parse_s3_path
2025-03-26 18:26:06 +00:00
def parse_args(argv=None):
    """Parse the command-line options of the Dolma sample scanner.

    Args:
        argv: Optional list of argument strings to parse. When left as
            ``None`` argparse falls back to ``sys.argv[1:]``, so existing
            zero-argument callers behave exactly as before.

    Returns:
        argparse.Namespace with the attributes ``workspace``,
        ``pages_per_output``, ``repeats``, ``pdf_profile``, ``output_dir``,
        ``max_workers``, ``db_path``, ``prolific_code``, ``prolific_csv`` and
        ``read_results``.
    """
    parser = argparse.ArgumentParser(description="Scan OLMO OCR workspace results and create visual samples")
    parser.add_argument("workspace", help="OLMO OCR workspace path (s3://bucket/workspace)")
    parser.add_argument("--pages_per_output", type=int, default=30, help="Number of pages per output file")
    parser.add_argument("--repeats", type=int, default=1, help="Number of output files to generate")
    parser.add_argument("--pdf_profile", help="AWS profile for accessing PDFs")
    parser.add_argument("--output_dir", default="dolma_samples", help="Directory to save output HTML files")
    parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of worker threads")
    parser.add_argument(
        "--db_path",
        default="~/s2pdf_url_data/d65142df-6588-4b68-a12c-d468b3761189.csv.db",
        help="Path to the SQLite database containing PDF hash to URL mapping",
    )
    # The completion code is mandatory: every generated page embeds it.
    parser.add_argument(
        "--prolific_code",
        required=True,
        help="Fixed completion code to use for all outputs",
    )
    parser.add_argument(
        "--prolific_csv",
        default="prolific_codes.csv",
        help="Path to save the file with tinyhost links (one URL per line)",
    )
    parser.add_argument(
        "--read_results",
        help="Path to a CSV file containing previously generated tinyhost links to extract annotations",
    )
    return parser.parse_args(argv)
2025-04-07 21:39:55 +00:00
# Fixed prolific code is now passed in as a command line argument
2025-04-04 17:12:46 +00:00
def obfuscate_code(code):
    """Lightly hide the Prolific code so it is not plainly readable in page source.

    The code is base64-encoded and the resulting text is then reversed.
    This is obfuscation only, not encryption.
    """
    b64_text = base64.b64encode(code.encode()).decode()
    # Emit the base64 characters back-to-front.
    return "".join(reversed(b64_text))
def deobfuscate_code(obfuscated_code):
    """Undo the reverse-then-base64 obfuscation applied to the Prolific code.

    The original note on this function says the decoding is done in
    JavaScript; this is the Python form of the same steps.

    Args:
        obfuscated_code: Reversed base64 text.

    Returns:
        The decoded string, or the sentinel ``"ERROR_DECODING"`` when the
        input is not valid base64 or does not decode to UTF-8 text.
    """
    # Reverse first, then decode from base64.
    reversed_encoded = obfuscated_code[::-1]
    try:
        return base64.b64decode(reversed_encoded).decode()
    except ValueError:
        # binascii.Error (bad base64) and UnicodeDecodeError (bad UTF-8) are
        # both ValueError subclasses. A bare ``except`` would additionally
        # swallow KeyboardInterrupt/SystemExit, which must propagate.
        return "ERROR_DECODING"
2025-03-26 18:26:06 +00:00
def parse_pdf_hash(pretty_pdf_path: str) -> Optional[str]:
    """Extract the PDF hash from an ``s3://ai2-s2-pdfs/<4 hex>/<hex>.pdf`` path.

    Returns the four-character directory component concatenated with the
    file stem, or None when the path does not start with that layout.
    """
    found = re.match(r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf", pretty_pdf_path)
    if found is None:
        return None
    shard, remainder = found.groups()
    return shard + remainder
2025-03-26 18:26:06 +00:00
def get_original_url(pdf_hash: str, db_path: str) -> Optional[str]:
    """Look up the original URL for a PDF hash in the SQLite database.

    Args:
        pdf_hash: Hash to search for in the ``pdf_hash`` column of the
            ``pdf_mapping`` table. An empty value short-circuits to None.
        db_path: Path to the SQLite file; ``~`` is expanded.

    Returns:
        The ``uri`` of the first matching row, or None when the hash is
        empty, the database file is missing, no row matches, or the lookup
        raises (the error is printed rather than propagated).
    """
    if not pdf_hash:
        return None

    try:
        sqlite_db_path = os.path.expanduser(db_path)
        if not os.path.exists(sqlite_db_path):
            print(f"SQLite database not found at {sqlite_db_path}")
            return None

        conn = sqlite3.connect(sqlite_db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,))
            result = cursor.fetchone()
        finally:
            # Always release the connection, even when the query fails
            # (e.g. missing table); previously it leaked on error.
            conn.close()

        if result:
            return result[0]
        return None
    except Exception as e:
        # Deliberately best-effort: a failed lookup only means the original
        # URL is unavailable, so report it and carry on.
        print(f"Error looking up URL for PDF hash {pdf_hash}: {e}")
        return None
def list_result_files(s3_client, workspace_path):
    """Collect the S3 URIs of JSON/JSONL files under ``<workspace>/results/``.

    Listing is paginated with ``list_objects_v2``; pagination stops early
    once more than 1000 matching files have been gathered.
    """
    bucket, prefix = parse_s3_path(workspace_path)
    results_prefix = os.path.join(prefix, "results").rstrip("/") + "/"
    wanted_suffixes = (".jsonl", ".json")

    collected = []
    listing_pages = s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=results_prefix)
    for listing in listing_pages:
        # A page without a "Contents" key contributes nothing.
        for entry in listing.get("Contents", []):
            key = entry["Key"]
            if key.endswith(wanted_suffixes):
                collected.append(f"s3://{bucket}/{key}")

        # Cap the scan; a sample does not need the full listing.
        if len(collected) > 1000:
            break

    return collected
2025-03-26 18:26:06 +00:00
def get_random_pages(s3_client, result_files, count=30):
    """Sample up to ``count`` random pages from the given result files.

    Each attempt picks a random result file, a random line (one JSON
    document) within it, and a random page span within that document.
    At most ``count * 3`` attempts are made, so fewer than ``count`` pages
    may come back when documents are invalid or downloads fail.

    Returns:
        A list of ``(pdf_path, page_num, page_text, result_file)`` tuples.
    """
    sampled = []
    tries = 0
    # Allow extra attempts so that failed or unusable picks can be retried.
    try_budget = count * 3

    while tries < try_budget and len(sampled) < count:
        tries += 1

        if not result_files:
            print("No result files found!")
            break

        result_file = random.choice(result_files)

        try:
            raw = get_s3_bytes(s3_client, result_file)
            records = raw.decode("utf-8").strip().split("\n")
            if not records:
                continue

            # Every line holds one complete document.
            doc = json.loads(random.choice(records))

            # A Dolma document carries "text", "metadata" and "attributes".
            if any(field not in doc for field in ("text", "metadata", "attributes")):
                print(f"Document in {result_file} is not a valid Dolma document")
                continue

            # The original PDF location lives in the metadata.
            pdf_path = doc["metadata"].get("Source-File")
            if not pdf_path:
                continue

            spans = doc["attributes"].get("pdf_page_numbers", [])
            if not spans:
                continue

            # Spans are laid out as [start_pos, end_pos, page_num].
            span = random.choice(spans)
            if len(span) < 3:
                continue

            page_num = span[2]
            start_pos, end_pos = span[0], span[1]
            page_text = doc["text"][start_pos:end_pos].strip()
            sampled.append((pdf_path, page_num, page_text, result_file))
        except Exception as e:
            # Best-effort sampling: report the failure and try another pick.
            print(f"Error processing {result_file}: {e}")

    print(f"Found {len(sampled)} random pages from Dolma documents")
    return sampled
def create_presigned_url(s3_client, pdf_path, expiration=3600 * 24 * 7):
    """Build a presigned ``get_object`` URL for an S3 path.

    ``expiration`` is forwarded as ``ExpiresIn``. Returns None, after
    printing the error, when the URL cannot be generated.
    """
    try:
        bucket, key = parse_s3_path(pdf_path)
        object_ref = {"Bucket": bucket, "Key": key}
        return s3_client.generate_presigned_url("get_object", Params=object_ref, ExpiresIn=expiration)
    except Exception as err:
        print(f"Error creating presigned URL for {pdf_path}: {err}")
        return None
2025-04-04 17:12:46 +00:00
def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path, db_path, prolific_code, resolution=2048):
2025-03-26 18:26:06 +00:00
"""Create an HTML file with rendered PDF pages."""
2025-04-04 17:12:46 +00:00
# Obfuscate the provided Prolific code
obfuscated_code = obfuscate_code(prolific_code)
2025-04-04 17:18:19 +00:00
2025-03-26 18:26:06 +00:00
# Get current date and time for the report
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
2025-03-26 18:26:06 +00:00
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OLMO OCR Samples</title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
<style>
:root {{
--primary-color: #2563eb;
--secondary-color: #4b5563;
--border-color: #e5e7eb;
--bg-color: #f9fafb;
--text-color: #111827;
--text-light: #6b7280;
--card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
2025-04-04 16:19:04 +00:00
--success-color: #10b981;
2025-03-26 18:26:06 +00:00
}}
* {{
box-sizing: border-box;
margin: 0;
padding: 0;
}}
body {{
font-family: 'Inter', sans-serif;
line-height: 1.6;
color: var(--text-color);
background-color: var(--bg-color);
padding: 2rem;
2025-04-08 20:50:00 +00:00
display: flex;
flex-direction: row;
gap: 2rem;
2025-03-26 18:26:06 +00:00
}}
2025-04-07 20:27:32 +00:00
ul {{
margin-left: 2em;
}}
2025-03-26 18:26:06 +00:00
.container {{
2025-04-08 20:50:00 +00:00
flex: 2;
max-width: 750px;
2025-03-26 18:26:06 +00:00
}}
header {{
2025-04-08 20:50:00 +00:00
position: sticky;
top: 2rem;
flex: 1;
min-width: 380px;
max-width: 420px;
max-height: calc(100vh - 4rem);
overflow-y: auto;
padding: 1.5rem;
background-color: white;
border-radius: 0.5rem;
box-shadow: var(--card-shadow);
align-self: flex-start;
2025-04-07 20:27:32 +00:00
font-size: small;
2025-03-26 18:26:06 +00:00
}}
2025-04-07 20:27:32 +00:00
header h2 {{
margin-top: 1em;
2025-03-26 18:26:06 +00:00
}}
2025-04-07 20:27:32 +00:00
2025-03-26 18:26:06 +00:00
.info-bar {{
background-color: white;
padding: 1rem;
border-radius: 0.5rem;
margin-bottom: 2rem;
box-shadow: var(--card-shadow);
display: flex;
justify-content: space-between;
flex-wrap: wrap;
gap: 1rem;
}}
.info-item {{
flex: 1;
min-width: 200px;
}}
.info-item h3 {{
font-size: 0.6rem;
2025-03-26 18:26:06 +00:00
color: var(--text-light);
margin-bottom: 0.25rem;
}}
.info-item p {{
font-size: 0.6rem;
2025-03-26 18:26:06 +00:00
}}
.page-grid {{
display: grid;
2025-04-08 20:50:00 +00:00
grid-template-columns: 1fr;
2025-03-26 18:26:06 +00:00
gap: 2rem;
}}
.page-container {{
background-color: white;
border-radius: 0.5rem;
overflow: hidden;
box-shadow: var(--card-shadow);
2025-04-04 19:38:59 +00:00
transition: all 0.3s ease;
}}
.page-container.editing {{
box-shadow: 0 0 0 3px var(--primary-color), var(--card-shadow);
2025-03-26 18:26:06 +00:00
}}
.page-info {{
padding: 1rem;
border-bottom: 1px solid var(--border-color);
}}
.page-info h2 {{
font-size: 1rem;
margin-bottom: 0.5rem;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}}
.page-info p {{
font-size: 0.875rem;
color: var(--text-light);
}}
.page-image-wrapper {{
padding: 1rem;
display: flex;
justify-content: center;
align-items: center;
background-color: #f3f4f6;
}}
.page-image {{
max-width: 100%;
height: auto;
border: 1px solid var(--border-color);
}}
.s3-link {{
padding: 1rem;
background-color: #f8fafc;
border-top: 1px solid var(--border-color);
font-size: 0.875rem;
color: var(--secondary-color);
word-break: break-all;
}}
.s3-link a {{
color: var(--primary-color);
text-decoration: none;
font-weight: 500;
}}
.s3-link a:hover {{
text-decoration: underline;
}}
2025-04-04 16:19:04 +00:00
/* Annotation elements */
.annotation-interface {{
display: none; /* Hide annotation interface by default */
margin-top: 1rem;
padding: 0.5rem;
border-top: 1px solid var(--border-color);
border-radius: 0.25rem;
background-color: #f8fafc;
}}
.annotation-interface.active {{
display: block; /* Show only the active annotation interface */
}}
2025-04-08 21:04:56 +00:00
.question-container {{
margin-bottom: 1rem;
}}
.question-text {{
font-weight: 500;
margin-bottom: 0.5rem;
}}
2025-04-04 16:19:04 +00:00
/* Button group styling for connected buttons */
.btn-group {{
display: inline-flex;
margin-bottom: 0.5rem;
}}
.btn-group .toggle-button {{
padding: 0.5rem 1rem;
border: 1px solid var(--border-color);
background-color: #f8fafc;
cursor: pointer;
margin: 0;
/* Remove individual border radius so we can set unified ones */
border-radius: 0;
}}
.btn-group .toggle-button:first-child {{
border-right: none;
border-top-left-radius: 0.25rem;
border-bottom-left-radius: 0.25rem;
}}
.btn-group .toggle-button:last-child {{
border-top-right-radius: 0.25rem;
border-bottom-right-radius: 0.25rem;
}}
2025-04-04 16:19:04 +00:00
.btn-group .toggle-button:not(:first-child):not(:last-child) {{
border-right: none;
}}
.toggle-button.active {{
background-color: var(--primary-color);
color: white;
2025-04-01 18:35:04 +00:00
}}
2025-04-08 21:04:56 +00:00
.checkbox-group {{
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 1rem;
}}
.checkbox-group label {{
display: flex;
align-items: center;
padding: 0.25rem 0.5rem;
background-color: #f1f5f9;
border-radius: 0.25rem;
cursor: pointer;
font-size: 0.875rem;
}}
.checkbox-group label:hover {{
background-color: #e2e8f0;
}}
.checkbox-group input[type="checkbox"] {{
margin-right: 0.5rem;
}}
.continue-button {{
padding: 0.5rem 1rem;
background-color: var(--primary-color);
color: white;
border: none;
border-radius: 0.25rem;
cursor: pointer;
font-weight: 500;
}}
.continue-button:hover {{
background-color: #1d4ed8;
}}
2025-04-04 16:19:04 +00:00
.annotation-interface textarea {{
display: none; /* Hide textarea by default */
2025-04-01 18:35:04 +00:00
width: 100%;
margin-top: 0.5rem;
2025-04-08 21:04:56 +00:00
margin-bottom: 1rem;
2025-04-01 18:35:04 +00:00
padding: 0.5rem;
font-size: 0.875rem;
border: 1px solid var(--border-color);
border-radius: 0.25rem;
}}
2025-04-04 16:19:04 +00:00
.annotation-status {{
display: inline-block;
margin-left: 1rem;
padding: 0.25rem 0.5rem;
border-radius: 0.25rem;
font-size: 0.75rem;
font-weight: 600;
}}
.status-complete {{
background-color: #ecfdf5;
color: var(--success-color);
2025-04-04 19:38:59 +00:00
cursor: pointer;
transition: all 0.2s ease;
}}
.status-complete:hover {{
background-color: #d1fae5;
box-shadow: 0 0 0 2px rgba(16, 185, 129, 0.3);
2025-04-04 16:19:04 +00:00
}}
.status-pending {{
background-color: #fff7ed;
color: #ea580c;
}}
.status-current {{
background-color: #eff6ff;
color: var(--primary-color);
animation: pulse 2s infinite;
}}
@keyframes pulse {{
0% {{ opacity: 0.6; }}
50% {{ opacity: 1; }}
100% {{ opacity: 0.6; }}
}}
2025-03-26 18:26:06 +00:00
.error {{
color: #dc2626;
padding: 1rem;
background-color: #fee2e2;
border-radius: 0.25rem;
}}
2025-04-04 21:41:36 +00:00
2025-04-04 16:19:04 +00:00
.completion-message {{
display: none;
margin: 2rem auto;
padding: 1.5rem;
background-color: #ecfdf5;
border: 1px solid #A7F3D0;
border-radius: 0.5rem;
text-align: center;
color: var(--success-color);
font-weight: 600;
max-width: 500px;
}}
2025-03-26 18:26:06 +00:00
footer {{
margin-top: 3rem;
text-align: center;
color: var(--text-light);
font-size: 0.875rem;
border-top: 1px solid var(--border-color);
padding-top: 1rem;
}}
@media (max-width: 768px) {{
body {{
padding: 1rem;
2025-04-08 20:50:00 +00:00
flex-direction: column;
}}
header {{
position: static;
max-width: 100%;
margin-left: 0;
margin-bottom: 2rem;
2025-03-26 18:26:06 +00:00
}}
2025-04-08 20:50:00 +00:00
.container {{
max-width: 100%;
2025-03-26 18:26:06 +00:00
}}
}}
</style>
</head>
<body>
2025-04-08 20:50:00 +00:00
<header>
2025-04-07 20:27:32 +00:00
<h2>Task Instructions</h2>
2025-04-08 20:50:00 +00:00
<p>Your task is to review {len(random_pages)} document pages and determine whether they contain any <strong>Personally Identifiable Information (PII)</strong>. Carefully but efficiently inspect each page and select the appropriate response. You do not need to read every word - quickly scan the page and look for any obvious PII. The time expected to complete this task is 10-15 minutes.</p>
<h2>How to Annotate</h2>
2025-04-08 21:04:56 +00:00
<p>The page you are currently annotating will be highlighted with a blue outline and a set of questions will be displayed directly below it.</p>
2025-04-08 20:50:00 +00:00
<br/>
2025-04-08 21:04:56 +00:00
<p><strong>First question:</strong> Is this document meant for public dissemination?</p>
<ul>
<li><strong>Yes</strong> - If the document appears to be a publication, research paper, public information, etc.</li>
<li><strong>No</strong> - If the document appears to be private, personal, or not intended for public release</li>
2025-04-08 22:30:59 +00:00
<li><strong>Cannot Read</strong> - If you are unable to read the page (e.g., foreign language, poor quality)</li>
2025-04-08 21:04:56 +00:00
<li><strong>Report Content</strong> - If the content is inappropriate or disturbing</li>
</ul>
<p><strong>Second question:</strong> Depending on your first answer, you'll be asked to identify any PII in the document:</p>
<ul>
<li>For <strong>public</strong> documents, select from: SSN, Bank Info, Credit Card Info, Usernames/Passwords, Other</li>
<li>For <strong>private</strong> documents, select from: Full Names, Addresses, Contact Info, Personal Attributes, SSN, Bank Info, Credit Card Info, Usernames/Passwords, Other</li>
</ul>
<p>You can select multiple PII types. If you select "Other", a text box will appear where you can describe the PII.</p>
2025-04-08 20:50:00 +00:00
<br/>
<p>You may edit your annotations any time before submitting. To do so, press the green Edit button directly above the page.</p>
<p>After completing all the document pages on this screen, you will receive a Prolific completion code.</p>
<h2>What Counts as PII?</h2>
<ul>
<li><strong>Names</strong>: Full names, first names, last names, nicknames, maiden names, aliases</li>
<li><strong>Addresses</strong>: Street addresses, postal codes, cities, states, countries</li>
<li><strong>Contact Information</strong>: Phone numbers, email addresses</li>
<li><strong>Government IDs</strong>: SSNs, passport numbers, driver's license numbers, tax IDs</li>
<li><strong>Financial Information</strong>: Credit card numbers, bank account numbers, routing numbers</li>
<li><strong>Biometric Data</strong>: Fingerprints, retina scans, facial recognition data, voice signatures</li>
<li><strong>Personal Attributes</strong>: Date of birth, place of birth, gender, race, religion</li>
<li><strong>Online Identifiers</strong>: IP addresses, login IDs, usernames, passwords, API keys, URLs</li>
<li><strong>Location Information</strong>: Geolocations, specific coordinates</li>
<li><strong>Employment Information</strong>: Job titles, workplace names, employment history</li>
<li><strong>Education Information</strong>: School names, degrees, transcripts</li>
<li><strong>Medical Information</strong>: Health records, diagnoses</li>
<li><strong>Company Names</strong>: If they are tied to an individual's identity (e.g., a person's personal business)</li>
</ul>
<h2>What NOT to Mark as PII</h2>
<p><strong>Author names, researcher names, citations, or references from published research papers</strong> should NOT be marked as PII. These names are part of the normal publication process and are not considered private or sensitive information for the purposes of this task.
Only mark information as PII if it relates to private, sensitive, or personal details about an individual outside the context of the publication.</p>
</header>
<div class="container">
2025-03-26 18:26:06 +00:00
<div class="info-bar">
<div class="info-item">
<h3>Generated On</h3>
<p>{current_time}</p>
</div>
<div class="info-item">
<h3>Workspace</h3>
<p title="{workspace_path}">{workspace_path}</p>
</div>
<div class="info-item">
<h3>Sample Size</h3>
<p>{len(random_pages)} pages</p>
</div>
</div>
<div class="page-grid">
"""
2025-03-26 18:26:06 +00:00
for i, (pdf_path, page_num, page_text, result_file) in enumerate(tqdm(random_pages, desc="Rendering pages")):
# Get original URL from PDF hash
pdf_hash = parse_pdf_hash(pdf_path)
original_url = get_original_url(pdf_hash, db_path) if pdf_hash else None
# Create a truncated path for display
display_path = pdf_path
if len(display_path) > 60:
display_path = "..." + display_path[-57:]
2025-03-26 18:26:06 +00:00
# Generate presigned URL
presigned_url = create_presigned_url(pdf_s3_client, pdf_path)
2025-03-26 18:26:06 +00:00
try:
# Download PDF to temp file
bucket, key = parse_s3_path(pdf_path)
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
pdf_data = pdf_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
2025-03-26 18:26:06 +00:00
temp_file.write(pdf_data)
temp_file_path = temp_file.name
2025-03-26 18:26:06 +00:00
# Render PDF to base64 webp
base64_image = render_pdf_to_base64webp(temp_file_path, page_num, resolution)
2025-04-04 16:19:04 +00:00
# Add CSS class for the first annotation interface to be active by default
active_class = " active" if i == 0 else ""
# Add to HTML with the annotation interface
html_content += f"""
2025-04-04 16:19:04 +00:00
<div class="page-container" data-index="{i}">
<div class="page-info">
2025-04-08 22:30:59 +00:00
<p>{f'<a href="{presigned_url}#page={page_num}" target="_blank">View Cached PDF (page {page_num})</a>' if presigned_url else pdf_path}</p>
2025-04-04 16:19:04 +00:00
<p>
Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
</p>
2025-03-26 18:26:06 +00:00
</div>
<div class="page-image-wrapper">
<img class="page-image" src="data:image/webp;base64,{base64_image}" alt="PDF Page {page_num}" loading="lazy" />
2025-03-26 18:26:06 +00:00
</div>
2025-04-04 19:36:10 +00:00
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
2025-04-08 21:04:56 +00:00
<div class="question-container" id="question1-{i}">
<p class="question-text">Is this document meant for public dissemination?</p>
<span class="btn-group">
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
2025-04-08 22:30:59 +00:00
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">Cannot Read</button>
2025-04-08 21:04:56 +00:00
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
</span>
</div>
<div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
<p class="question-text">Select any PII found in this public document:</p>
<div class="checkbox-group">
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
</div>
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
2025-04-08 22:30:59 +00:00
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
2025-04-08 21:04:56 +00:00
</div>
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
<p class="question-text">Select any PII found in this private document:</p>
<div class="checkbox-group">
<label><input type="checkbox" class="pii-checkbox" data-value="full-names" onchange="saveCheckboxes(this)"> Full Names</label>
<label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
<label><input type="checkbox" class="pii-checkbox" data-value="contact-info" onchange="saveCheckboxes(this)"> Contact Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="personal-attributes" onchange="saveCheckboxes(this)"> Personal Attributes</label>
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
</div>
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
2025-04-08 22:30:59 +00:00
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
2025-04-08 21:04:56 +00:00
</div>
2025-04-04 16:19:04 +00:00
</div>
</div>
"""
2025-03-26 18:26:06 +00:00
# Clean up temp file
os.unlink(temp_file_path)
2025-03-26 18:26:06 +00:00
except Exception as e:
2025-04-04 16:19:04 +00:00
# Add CSS class for the first annotation interface to be active by default
active_class = " active" if i == 0 else ""
2025-04-04 17:18:19 +00:00
2025-03-26 18:26:06 +00:00
html_content += f"""
2025-04-04 16:19:04 +00:00
<div class="page-container" data-index="{i}">
2025-03-26 18:26:06 +00:00
<div class="page-info">
2025-04-08 22:30:59 +00:00
<p>{f'<a href="{presigned_url}#page={page_num}" target="_blank">View Cached PDF (page {page_num})</a>' if presigned_url else pdf_path}</p>
2025-04-04 16:19:04 +00:00
<p>
Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
</p>
2025-03-26 18:26:06 +00:00
</div>
<div class="error">Error: {str(e)}</div>
2025-04-04 19:36:10 +00:00
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
2025-04-08 21:04:56 +00:00
<div class="question-container" id="question1-{i}">
<p class="question-text">Is this document meant for public dissemination?</p>
<span class="btn-group">
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
2025-04-08 22:30:59 +00:00
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">Cannot Read</button>
2025-04-08 21:04:56 +00:00
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
</span>
</div>
<div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
<p class="question-text">Select any PII found in this public document:</p>
<div class="checkbox-group">
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
</div>
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
2025-04-08 22:30:59 +00:00
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
2025-04-08 21:04:56 +00:00
</div>
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
<p class="question-text">Select any PII found in this private document:</p>
<div class="checkbox-group">
<label><input type="checkbox" class="pii-checkbox" data-value="full-names" onchange="saveCheckboxes(this)"> Full Names</label>
<label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
<label><input type="checkbox" class="pii-checkbox" data-value="contact-info" onchange="saveCheckboxes(this)"> Contact Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="personal-attributes" onchange="saveCheckboxes(this)"> Personal Attributes</label>
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
</div>
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
2025-04-08 22:30:59 +00:00
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
2025-04-08 21:04:56 +00:00
</div>
2025-04-04 16:19:04 +00:00
</div>
2025-03-26 18:26:06 +00:00
</div>
"""
2025-04-04 17:18:19 +00:00
html_content += (
"""
2025-03-26 18:26:06 +00:00
</div>
2025-04-04 16:19:04 +00:00
<div class="completion-message" id="completion-message">
2025-04-04 17:12:46 +00:00
Thank you! All annotations are complete.<br>
Your Prolific completion code is: <strong id="prolific-code">Loading...</strong>
2025-04-04 16:19:04 +00:00
</div>
2025-04-04 17:12:46 +00:00
<!-- Store the obfuscated code in a hidden element -->
2025-04-04 17:18:19 +00:00
<div id="obfuscated-code" style="display:none;">"""
+ obfuscated_code
+ """</div>
2025-04-04 16:19:04 +00:00
2025-03-26 18:26:06 +00:00
</div>
2025-04-01 18:35:04 +00:00
<script>
// Using externally injected async functions: fetchDatastore() and putDatastore()
2025-04-04 16:19:04 +00:00
// Track annotation progress
let currentIndex = 0;
const totalPages = document.querySelectorAll('.page-container').length;
2025-04-04 21:41:36 +00:00
2025-04-04 16:19:04 +00:00
// Update progress bar
function updateProgressBar() {
// Check if all annotations are complete
2025-04-04 17:53:26 +00:00
if (currentIndex >= totalPages) {
2025-04-04 16:19:04 +00:00
document.getElementById('completion-message').style.display = 'block';
}
}
// Update status indicators
function updateStatusIndicators() {
// Reset all status indicators
document.querySelectorAll('.annotation-status').forEach(function(status) {
status.className = 'annotation-status status-pending';
status.textContent = 'Pending';
2025-04-04 19:38:59 +00:00
// Remove any click handlers
status.onclick = null;
2025-04-04 16:19:04 +00:00
});
// Set current item status
const currentStatus = document.getElementById(`status-${currentIndex}`);
if (currentStatus) {
currentStatus.className = 'annotation-status status-current';
currentStatus.textContent = 'Current';
}
// Update completed statuses
for (let i = 0; i < currentIndex; i++) {
const status = document.getElementById(`status-${i}`);
if (status) {
status.className = 'annotation-status status-complete';
2025-04-04 19:38:59 +00:00
status.textContent = 'Edit ✎';
// Add click handler to edit this annotation
status.onclick = function() { editAnnotation(i); };
2025-04-04 16:19:04 +00:00
}
}
}
2025-04-04 19:38:59 +00:00
// Function to enable editing a previously completed annotation
function editAnnotation(index) {
// Hide current annotation interface
document.querySelector(`.annotation-interface[data-id="page-${currentIndex}"]`).classList.remove('active');
// Remove editing class from all containers
document.querySelectorAll('.page-container').forEach(container => {
container.classList.remove('editing');
});
// Show the selected annotation interface
document.querySelector(`.annotation-interface[data-id="page-${index}"]`).classList.add('active');
// Add editing class to the container being edited
const activeContainer = document.querySelector(`.page-container[data-index="${index}"]`);
if (activeContainer) {
activeContainer.classList.add('editing');
activeContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
// Update current index
currentIndex = index;
updateProgressBar();
updateStatusIndicators();
}
2025-04-04 16:19:04 +00:00
// Navigate to the next document
function goToNextDocument() {
// Hide current annotation interface
document.querySelector(`.annotation-interface[data-id="page-${currentIndex}"]`).classList.remove('active');
2025-04-04 19:38:59 +00:00
// Remove editing class from all containers
document.querySelectorAll('.page-container').forEach(container => {
container.classList.remove('editing');
});
2025-04-04 16:19:04 +00:00
// Move to next document if not at the end
if (currentIndex < totalPages - 1) {
currentIndex++;
document.querySelector(`.annotation-interface[data-id="page-${currentIndex}"]`).classList.add('active');
2025-04-04 19:38:59 +00:00
// Add editing class to current container
2025-04-04 16:19:04 +00:00
const activeContainer = document.querySelector(`.page-container[data-index="${currentIndex}"]`);
if (activeContainer) {
2025-04-04 19:38:59 +00:00
activeContainer.classList.add('editing');
2025-04-04 16:19:04 +00:00
activeContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
2025-04-04 19:38:59 +00:00
updateProgressBar();
updateStatusIndicators();
2025-04-04 16:19:04 +00:00
}
2025-04-04 17:53:26 +00:00
else {
// This was the last document, mark as complete
currentIndex = totalPages;
updateProgressBar();
updateStatusIndicators();
// Show completion message and scroll to it
document.getElementById('completion-message').style.display = 'block';
document.getElementById('completion-message').scrollIntoView({ behavior: 'smooth', block: 'center' });
}
2025-04-04 16:19:04 +00:00
}
// Handle text area keydown for Enter key
function handleTextareaKeydown(event, textarea) {
// If Enter key is pressed and not with Shift key, move to next document
if (event.key === 'Enter' && !event.shiftKey) {
event.preventDefault();
2025-04-08 22:30:59 +00:00
saveFeedback(textarea).then(() => {
goToNextDocument();
});
2025-04-04 16:19:04 +00:00
}
}
async function saveFeedback(source) {
2025-04-04 16:19:04 +00:00
const interfaceDiv = source.closest('.annotation-interface');
const id = interfaceDiv.getAttribute('data-id');
2025-04-08 21:04:56 +00:00
// Get the selected primary option
const activePrimaryButton = interfaceDiv.querySelector('button.primary-option.active');
const primaryOption = activePrimaryButton ? activePrimaryButton.getAttribute('data-value') : null;
// Get checkbox selections for public document
const publicPiiOptions = [];
interfaceDiv.querySelectorAll('#public-pii-options-' + id.split('-')[1] + ' input[type="checkbox"]:checked').forEach(checkbox => {
publicPiiOptions.push(checkbox.getAttribute('data-value'));
});
// Get checkbox selections for private document
const privatePiiOptions = [];
interfaceDiv.querySelectorAll('#private-pii-options-' + id.split('-')[1] + ' input[type="checkbox"]:checked').forEach(checkbox => {
privatePiiOptions.push(checkbox.getAttribute('data-value'));
});
// Get any "Other" descriptions
const otherPublicDesc = interfaceDiv.querySelector('#other-pii-public-' + id.split('-')[1])?.value || '';
const otherPrivateDesc = interfaceDiv.querySelector('#other-pii-private-' + id.split('-')[1])?.value || '';
2025-04-04 19:36:10 +00:00
const pdfPath = interfaceDiv.getAttribute('data-pdf-path');
2025-04-01 18:35:04 +00:00
const datastore = await fetchDatastore() || {};
datastore[id] = {
2025-04-08 21:04:56 +00:00
primaryOption: primaryOption,
publicPiiOptions: publicPiiOptions,
privatePiiOptions: privatePiiOptions,
otherPublicDesc: otherPublicDesc,
otherPrivateDesc: otherPrivateDesc,
2025-04-04 19:36:10 +00:00
pdfPath: pdfPath
2025-04-01 18:35:04 +00:00
};
await putDatastore(datastore);
}
2025-04-08 22:30:59 +00:00
function saveThenNext(btn) {
const interfaceDiv = btn.closest('.annotation-interface');
saveFeedback(interfaceDiv).then(() => {
goToNextDocument();
});
}
2025-04-08 21:04:56 +00:00
function togglePrimaryOption(btn, index) {
2025-04-04 16:19:04 +00:00
const interfaceDiv = btn.closest('.annotation-interface');
2025-04-08 21:04:56 +00:00
// Remove active class from all primary option buttons in this group
interfaceDiv.querySelectorAll('button.primary-option').forEach(function(b) {
b.classList.remove('active');
});
2025-04-08 21:04:56 +00:00
// Toggle on the clicked button
btn.classList.add('active');
2025-04-04 16:19:04 +00:00
2025-04-08 21:04:56 +00:00
// Hide all secondary option containers
document.querySelector(`#public-pii-options-${index}`).style.display = 'none';
document.querySelector(`#private-pii-options-${index}`).style.display = 'none';
2025-04-08 22:30:59 +00:00
// Immediately save the primary option selection
saveFeedback(interfaceDiv);
2025-04-08 21:04:56 +00:00
const option = btn.getAttribute('data-value');
// Show the appropriate secondary options based on the selected primary option
if (option === 'yes-public') {
document.querySelector(`#public-pii-options-${index}`).style.display = 'block';
} else if (option === 'no-public') {
document.querySelector(`#private-pii-options-${index}`).style.display = 'block';
} else {
// For "cannot-read" or "report-content", just save and move to next
goToNextDocument();
}
}
function toggleOtherTextarea(checkbox) {
const container = checkbox.closest('.question-container');
const textareaId = container.querySelector('textarea').id;
const textarea = document.getElementById(textareaId);
2025-04-04 16:19:04 +00:00
2025-04-08 21:04:56 +00:00
if (checkbox.checked) {
2025-04-04 16:19:04 +00:00
textarea.style.display = 'block';
textarea.focus();
} else {
textarea.style.display = 'none';
}
2025-04-08 21:04:56 +00:00
saveCheckboxes(checkbox);
}
function saveCheckboxes(input) {
const interfaceDiv = input.closest('.annotation-interface');
2025-04-08 22:30:59 +00:00
return saveFeedback(interfaceDiv);
}
2025-04-04 17:12:46 +00:00
// Function to deobfuscate the Prolific code
function deobfuscateCode(obfuscatedCode) {
// Reverse the string
const reversed = obfuscatedCode.split('').reverse().join('');
// Decode from base64
try {
return atob(reversed);
} catch (e) {
return "ERROR_DECODING";
}
}
2025-04-01 18:35:04 +00:00
document.addEventListener("DOMContentLoaded", async function() {
const datastore = await fetchDatastore() || {};
2025-04-04 19:38:59 +00:00
// Add editing class to the first container by default
const firstContainer = document.querySelector(`.page-container[data-index="0"]`);
if (firstContainer) {
firstContainer.classList.add('editing');
}
2025-04-04 16:19:04 +00:00
updateProgressBar();
updateStatusIndicators();
2025-04-04 17:12:46 +00:00
// Get and deobfuscate the Prolific code
const obfuscatedCode = document.getElementById('obfuscated-code').textContent;
const prolificCode = deobfuscateCode(obfuscatedCode);
document.getElementById('prolific-code').textContent = prolificCode;
2025-04-04 16:19:04 +00:00
document.querySelectorAll('.annotation-interface').forEach(function(interfaceDiv) {
const id = interfaceDiv.getAttribute('data-id');
2025-04-08 21:04:56 +00:00
const pageIndex = id.split('-')[1];
2025-04-01 18:35:04 +00:00
if (datastore[id]) {
const data = datastore[id];
2025-04-08 21:04:56 +00:00
// Set active state for primary option buttons
interfaceDiv.querySelectorAll('button.primary-option').forEach(function(btn) {
if (btn.getAttribute('data-value') === data.primaryOption) {
btn.classList.add('active');
2025-04-04 16:19:04 +00:00
2025-04-08 21:04:56 +00:00
// Show the appropriate secondary options
const option = btn.getAttribute('data-value');
if (option === 'yes-public') {
document.querySelector(`#public-pii-options-${pageIndex}`).style.display = 'block';
} else if (option === 'no-public') {
document.querySelector(`#private-pii-options-${pageIndex}`).style.display = 'block';
2025-04-04 16:19:04 +00:00
}
} else {
btn.classList.remove('active');
}
});
2025-04-08 21:04:56 +00:00
// Restore public PII checkboxes
if (data.publicPiiOptions && data.publicPiiOptions.length > 0) {
const publicContainer = document.querySelector(`#public-pii-options-${pageIndex}`);
data.publicPiiOptions.forEach(option => {
const checkbox = publicContainer.querySelector(`input[data-value="${option}"]`);
if (checkbox) {
checkbox.checked = true;
if (option === 'other') {
document.getElementById(`other-pii-public-${pageIndex}`).style.display = 'block';
}
}
});
}
// Restore private PII checkboxes
if (data.privatePiiOptions && data.privatePiiOptions.length > 0) {
const privateContainer = document.querySelector(`#private-pii-options-${pageIndex}`);
data.privatePiiOptions.forEach(option => {
const checkbox = privateContainer.querySelector(`input[data-value="${option}"]`);
if (checkbox) {
checkbox.checked = true;
if (option === 'other') {
document.getElementById(`other-pii-private-${pageIndex}`).style.display = 'block';
}
}
});
}
// Set the textarea values
if (data.otherPublicDesc) {
document.getElementById(`other-pii-public-${pageIndex}`).value = data.otherPublicDesc;
}
if (data.otherPrivateDesc) {
document.getElementById(`other-pii-private-${pageIndex}`).value = data.otherPrivateDesc;
}
2025-04-01 18:35:04 +00:00
}
});
2025-04-04 16:19:04 +00:00
// If we have stored data, restore the current position
let lastAnnotatedIndex = -1;
for (let i = 0; i < totalPages; i++) {
const pageId = `page-${i}`;
2025-04-08 21:04:56 +00:00
if (datastore[pageId] && datastore[pageId].primaryOption) {
2025-04-04 16:19:04 +00:00
lastAnnotatedIndex = i;
}
}
// If we have annotated pages, go to the first unannotated page
2025-04-07 20:27:32 +00:00
if (lastAnnotatedIndex >= 0) {
2025-04-04 16:19:04 +00:00
document.querySelector(`.annotation-interface.active`).classList.remove('active');
2025-04-07 20:27:32 +00:00
// Check if all pages are annotated
if (lastAnnotatedIndex === totalPages - 1) {
// All pages are annotated, set currentIndex to totalPages to trigger completion
currentIndex = totalPages;
// Show completion message and scroll to it
document.getElementById('completion-message').style.display = 'block';
document.getElementById('completion-message').scrollIntoView({ behavior: 'smooth', block: 'center' });
} else {
// Go to the next unannotated page
currentIndex = lastAnnotatedIndex + 1;
document.querySelector(`.annotation-interface[data-id="page-${currentIndex}"]`).classList.add('active');
// Add editing class and scroll to the active annotation
const activeContainer = document.querySelector(`.page-container[data-index="${currentIndex}"]`);
if (activeContainer) {
// Remove editing class from all containers first
document.querySelectorAll('.page-container').forEach(container => {
container.classList.remove('editing');
});
// Add editing class to current container
activeContainer.classList.add('editing');
activeContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
2025-04-04 16:19:04 +00:00
}
updateProgressBar();
updateStatusIndicators();
}
2025-04-01 18:35:04 +00:00
});
</script>
2025-03-26 18:26:06 +00:00
</body>
</html>
"""
2025-04-04 17:18:19 +00:00
)
with open(output_path, "w") as f:
2025-03-26 18:26:06 +00:00
f.write(html_content)
2025-03-26 18:26:06 +00:00
print(f"Created HTML output at {output_path}")
2025-03-26 18:26:06 +00:00
def generate_sample_set(args, i, s3_client, pdf_s3_client, result_files):
    """Render one HTML sample file and return the path it was written to.

    Args:
        args: Parsed command-line namespace; reads ``output_dir``, ``repeats``,
            ``pages_per_output``, ``prolific_code``, ``workspace`` and ``db_path``.
        i: Zero-based index of this sample set (file names are one-based).
        s3_client: Client handed to ``get_random_pages``.
        pdf_s3_client: Client handed to ``create_html_output``.
        result_files: Result files to sample pages from.

    Returns:
        Path of the generated ``dolma_samples_<n>.html`` file.
    """
    set_number = i + 1
    html_path = Path(args.output_dir).joinpath(f"dolma_samples_{set_number}.html")
    print(f"\nGenerating sample set {set_number} of {args.repeats}")

    sampled_pages = get_random_pages(s3_client, result_files, args.pages_per_output)

    # Every output embeds the same completion code supplied on the command line.
    completion_code = args.prolific_code
    create_html_output(sampled_pages, pdf_s3_client, html_path, args.workspace, args.db_path, completion_code)
    return html_path
2025-03-26 18:26:06 +00:00
2025-04-04 19:44:54 +00:00
def extract_datastore_url(html_content: str) -> Optional[str]:
    """Return the presigned datastore URL embedded in *html_content*, if any.

    The URL is taken from a JavaScript declaration of the form
    ``const presignedGetUrl = "<url>"``. ``None`` is returned when the page
    contains no such declaration (or the quoted value is empty).
    """
    found = re.search(r'const\s+presignedGetUrl\s*=\s*"([^"]+)"', html_content)
    return found.group(1) if found else None
def fetch_annotations(tinyhost_link: str, timeout: float = 30.0) -> Tuple[Dict[str, Any], str]:
    """Fetch the annotation datastore that belongs to a tinyhost page.

    The page HTML is downloaded first, the presigned datastore URL is pulled
    out of it with ``extract_datastore_url``, and the datastore JSON is then
    downloaded from that URL.

    Args:
        tinyhost_link: URL of the hosted annotation page.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled server cannot hang the whole report run.

    Returns:
        ``(annotations, tinyhost_link)``. ``annotations`` is an empty dict when
        the page has no datastore URL, the datastore cannot be fetched or
        decoded, or its payload is not a JSON object.

    Raises:
        requests.RequestException: If the tinyhost page itself cannot be
            fetched (connection error, timeout, or non-2xx status).
    """
    print(f"Fetching annotations from {tinyhost_link}")
    response = requests.get(tinyhost_link, timeout=timeout)
    response.raise_for_status()

    datastore_url = extract_datastore_url(response.text)
    if not datastore_url:
        print(f"Could not find datastore URL in {tinyhost_link}")
        return {}, tinyhost_link

    print(f"Found datastore URL: {datastore_url}")
    try:
        datastore_response = requests.get(datastore_url, timeout=timeout)
        datastore_response.raise_for_status()
        annotations = datastore_response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures of the datastore body.
        print(f"Error fetching datastore from {datastore_url}: {e}")
        return {}, tinyhost_link

    # Callers iterate with .items(); a payload such as `null` or a list would crash them.
    if not isinstance(annotations, dict):
        print(f"Error fetching datastore from {datastore_url}: payload is not a JSON object")
        return {}, tinyhost_link

    return annotations, tinyhost_link
def _pii_document_entry(page_id, link, pdf_path, pii_options, other_desc):
    """Build the result record for a page classified as a public or private document."""
    found_pii = bool(pii_options)
    if found_pii:
        pii_types = pii_options
        # The free-text description only applies when the "other" box was ticked.
        description = other_desc if "other" in pii_options else ""
    else:
        pii_types = []
        description = ""
    return {
        "page_id": page_id,
        "link": link,
        "pdf_path": pdf_path,
        "pii_types": pii_types,
        "has_pii": found_pii,
        "description": description,
    }


def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group page annotations into buckets keyed by the annotator's primary choice.

    Args:
        annotations_by_link: ``(annotations, link)`` pairs, where ``annotations``
            maps a page id to the stored annotation dict for that page.

    Returns:
        A dict with the keys ``public_document``, ``private_document``,
        ``cannot_read``, ``report_content`` and ``no_annotation``. Each value is
        a list of records carrying ``page_id``, ``link`` and ``pdf_path``; the
        two document buckets additionally carry ``pii_types``, ``has_pii`` and
        ``description``. Pages with a missing, empty or unrecognised
        ``primaryOption`` land in ``no_annotation``.
    """
    buckets = {
        "public_document": [],
        "private_document": [],
        "cannot_read": [],
        "report_content": [],
        "no_annotation": [],
    }

    for page_annotations, link in annotations_by_link:
        for page_id, annotation in page_annotations.items():
            if not annotation or "primaryOption" not in annotation:
                missing_path = annotation.get("pdfPath", "Unknown") if annotation else "Unknown"
                buckets["no_annotation"].append({"page_id": page_id, "link": link, "pdf_path": missing_path})
                continue

            choice = annotation["primaryOption"]
            pdf_path = annotation.get("pdfPath", "Unknown")

            if choice == "yes-public":
                record = _pii_document_entry(
                    page_id,
                    link,
                    pdf_path,
                    annotation.get("publicPiiOptions", []),
                    annotation.get("otherPublicDesc", ""),
                )
                buckets["public_document"].append(record)
                continue

            if choice == "no-public":
                record = _pii_document_entry(
                    page_id,
                    link,
                    pdf_path,
                    annotation.get("privatePiiOptions", []),
                    annotation.get("otherPrivateDesc", ""),
                )
                buckets["private_document"].append(record)
                continue

            if choice == "cannot-read":
                target = "cannot_read"
            elif choice == "report-content":
                target = "report_content"
            else:
                target = "no_annotation"
            buckets[target].append({"page_id": page_id, "link": link, "pdf_path": pdf_path})

    return buckets
def _percentage(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, treating an empty whole as 0%."""
    return part / max(1, whole) * 100


def _print_pii_type_counts(heading: str, pages: List[Dict[str, Any]]) -> None:
    """Print how often each PII type occurs across *pages*, most frequent first."""
    counts: Dict[str, int] = {}
    for page in pages:
        for pii_type in page.get("pii_types", []):
            counts[pii_type] = counts.get(pii_type, 0) + 1

    print(heading)
    for pii_type, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
        print(f" - {pii_type}: {count} ({_percentage(count, len(pages)):.1f}%)")


def _print_pii_details(heading: str, pages: List[Dict[str, Any]]) -> None:
    """Print one detail entry per page in *pages*, each followed by a separator line."""
    print(heading)
    print("-" * 80)
    for i, item in enumerate(pages, 1):
        print(f"{i}. PDF: {item['pdf_path']}")
        print(f" Page ID: {item['page_id']}")
        print(f" Link: {item['link']}#{item['page_id']}")
        print(f" PII Types: {', '.join(item['pii_types'])}")
        if item.get("description"):
            print(f" Description: {item['description']}")
        print("-" * 80)


def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]]):
    """Print a summary report of annotations to stdout.

    Args:
        annotation_results: Buckets with the keys ``public_document``,
            ``private_document``, ``cannot_read``, ``report_content`` and
            ``no_annotation``, each mapping to a list of page records.

    The report contains overall counts and percentages, a breakdown of PII
    types for public and private documents that have PII, and a detailed
    listing of those pages. Percentages are reported as 0.0% when there are no
    pages at all, instead of raising ``ZeroDivisionError``.
    """
    total_pages = sum(len(items) for items in annotation_results.values())
    print("\n" + "=" * 80)
    print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
    print("=" * 80)

    public_pages = annotation_results["public_document"]
    private_pages = annotation_results["private_document"]
    cannot_read = annotation_results["cannot_read"]
    reported = annotation_results["report_content"]
    unannotated = annotation_results["no_annotation"]

    # Split each document bucket by whether the annotator flagged any PII.
    public_with_pii = [page for page in public_pages if page.get("has_pii", False)]
    public_without_pii = [page for page in public_pages if not page.get("has_pii", False)]
    private_with_pii = [page for page in private_pages if page.get("has_pii", False)]
    private_without_pii = [page for page in private_pages if not page.get("has_pii", False)]

    print("\nSummary:")
    print(f" Public documents (total): {len(public_pages)} ({_percentage(len(public_pages), total_pages):.1f}% of all pages)")
    print(f" - With PII: {len(public_with_pii)} ({_percentage(len(public_with_pii), len(public_pages)):.1f}% of public docs)")
    print(f" - Without PII: {len(public_without_pii)} ({_percentage(len(public_without_pii), len(public_pages)):.1f}% of public docs)")
    print(f" Private documents (total): {len(private_pages)} ({_percentage(len(private_pages), total_pages):.1f}% of all pages)")
    print(f" - With PII: {len(private_with_pii)} ({_percentage(len(private_with_pii), len(private_pages)):.1f}% of private docs)")
    print(f" - Without PII: {len(private_without_pii)} ({_percentage(len(private_without_pii), len(private_pages)):.1f}% of private docs)")
    print(f" Unreadable pages: {len(cannot_read)} ({_percentage(len(cannot_read), total_pages):.1f}%)")
    print(f" Pages with reported content: {len(reported)} ({_percentage(len(reported), total_pages):.1f}%)")
    print(f" Pages without annotation: {len(unannotated)} ({_percentage(len(unannotated), total_pages):.1f}%)")

    if public_with_pii:
        _print_pii_type_counts("\nPII Types in Public Documents:", public_with_pii)
    if private_with_pii:
        _print_pii_type_counts("\nPII Types in Private Documents:", private_with_pii)

    if public_with_pii:
        _print_pii_details("\nDetailed Report - Public Documents with PII:", public_with_pii)
    if private_with_pii:
        _print_pii_details("\nDetailed Report - Private Documents with PII:", private_with_pii)

    print("\nReport complete.")
def read_and_process_results(args):
    """Read tinyhost links from a file, fetch their annotations and report on them.

    Args:
        args: Parsed command-line namespace; reads ``read_results`` (path of a
            file with one tinyhost URL per line) and ``output_dir`` (where
            ``annotation_report.csv`` is written).

    The summary is printed to stdout and a per-page CSV report is saved. Any
    failure is reported on stdout rather than raised, so this never aborts the
    caller with an exception.
    """
    try:
        with open(args.read_results, "r") as f:
            links = [line.strip() for line in f if line.strip()]

        if not links:
            print(f"No tinyhost links found in {args.read_results}")
            return
        print(f"Found {len(links)} tinyhost links in {args.read_results}")

        # A failure on one link must not prevent the remaining links from being processed.
        annotations_by_link = []
        for link in tqdm(links, desc="Fetching annotations"):
            try:
                annotations, link_url = fetch_annotations(link)
                annotations_by_link.append((annotations, link_url))
            except Exception as e:
                print(f"Error processing {link}: {e}")

        annotation_results = process_annotations(annotations_by_link)
        print_annotation_report(annotation_results)

        output_file = Path(args.output_dir) / "annotation_report.csv"
        # main() returns before creating the output directory in this mode, so create it here.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"\nSaving detailed report to {output_file}")

        doc_types = {"public_document": "Public", "private_document": "Private"}
        # Descriptions are free text typed by annotators, so write UTF-8 explicitly.
        with open(output_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Document Type", "PII Types", "Description"])
            for category, items in annotation_results.items():
                doc_type = doc_types.get(category, "")
                for item in items:
                    if doc_type:
                        pii_types = ", ".join(item.get("pii_types", []))
                        description = item.get("description", "")
                    else:
                        # Only the document categories carry PII details.
                        pii_types = ""
                        description = ""
                    writer.writerow([category, item["pdf_path"], item["page_id"], f"{item['link']}#{item['page_id']}", doc_type, pii_types, description])
        print(f"Report saved to {output_file}")
    except Exception as e:
        print(f"Error processing results: {e}")
2025-03-26 18:26:06 +00:00
def main():
    """Command-line entry point.

    With ``--read_results`` set, only the annotation report is produced.
    Otherwise sample HTML files are generated (in parallel when more than one
    is requested), each is uploaded through tinyhost, and the resulting links
    are written one per line to the ``--prolific_csv`` file.
    """
    args = parse_args()

    # Report mode: process a previous run's links and stop.
    if args.read_results:
        read_and_process_results(args)
        return

    s3_client = boto3.client("s3")
    # PDFs may live behind a different AWS profile than the workspace.
    if not args.pdf_profile:
        pdf_s3_client = s3_client
    else:
        pdf_s3_client = boto3.Session(profile_name=args.pdf_profile).client("s3")

    Path(args.output_dir).mkdir(exist_ok=True, parents=True)

    print(f"Listing result files in {args.workspace}/results...")
    result_files = list_result_files(s3_client, args.workspace)
    print(f"Found {len(result_files)} result files")

    output_files = []
    if args.repeats > 1:
        worker_count = min(args.max_workers, args.repeats)
        print(f"Using ThreadPoolExecutor with {worker_count} workers")
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            pending = [
                executor.submit(generate_sample_set, args, index, s3_client, pdf_s3_client, result_files)
                for index in range(args.repeats)
            ]
            # Collect in submission order; a failed set is reported and skipped.
            for job in pending:
                try:
                    html_path = job.result()
                    output_files.append(html_path)
                    print(f"Completed generation of {html_path}")
                except Exception as e:
                    print(f"Error generating sample set: {e}")
    else:
        # A single sample set does not need a thread pool.
        output_files.append(generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files))

    print("Generated all files, uploading tinyhost links now")
    links = []
    for html_path in output_files:
        # tinyhost is given a one-element list; the first returned link is kept.
        uploaded = tinyhost.tinyhost([str(html_path)])[0]
        links.append(uploaded)
        print(uploaded)

    links_path = args.prolific_csv
    print(f"Writing tinyhost links to {links_path}")
    with open(links_path, "w", newline="") as csvfile:
        csvfile.writelines(f"{uploaded}\n" for uploaded in links)

    print(f"Tinyhost links written to {links_path}")
2025-03-26 18:26:06 +00:00
2025-03-26 18:26:06 +00:00
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()