2025-03-26 18:26:06 +00:00
import argparse
2025-04-04 17:12:46 +00:00
import base64
import csv
2025-03-26 18:49:48 +00:00
import datetime
2025-03-26 18:26:06 +00:00
import json
import os
import random
import re
import sqlite3
2025-03-26 18:49:48 +00:00
import tempfile
from concurrent . futures import ThreadPoolExecutor
2025-03-26 18:26:06 +00:00
from pathlib import Path
2025-04-04 19:44:54 +00:00
from typing import Any , Dict , List , Optional , Tuple
2025-03-26 18:49:48 +00:00
import boto3
2025-04-04 19:44:54 +00:00
import requests
2025-04-04 17:18:19 +00:00
import tinyhost
2025-03-26 18:26:06 +00:00
from tqdm import tqdm
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
from olmocr . data . renderpdf import render_pdf_to_base64webp
2025-03-26 18:49:48 +00:00
from olmocr . s3_utils import get_s3_bytes , parse_s3_path
2025-03-26 18:26:06 +00:00
def parse_args(argv=None):
    """Parse the command-line options of the sample-generation script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behavior callers that pass no argument have always had.

    Returns:
        argparse.Namespace with the attributes ``workspace``,
        ``pages_per_output``, ``repeats``, ``pdf_profile``, ``output_dir``,
        ``max_workers``, ``db_path``, ``prolific_code``, ``prolific_csv``
        and ``read_results``.

    Note:
        ``--prolific_code`` is declared ``required=True``, so argparse exits
        with a usage error when it is missing.
    """
    parser = argparse.ArgumentParser(description="Scan OLMO OCR workspace results and create visual samples")
    parser.add_argument("workspace", help="OLMO OCR workspace path (s3://bucket/workspace)")
    parser.add_argument("--pages_per_output", type=int, default=30, help="Number of pages per output file")
    parser.add_argument("--repeats", type=int, default=1, help="Number of output files to generate")
    parser.add_argument("--pdf_profile", help="AWS profile for accessing PDFs")
    parser.add_argument("--output_dir", default="dolma_samples", help="Directory to save output HTML files")
    parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of worker threads")
    parser.add_argument(
        "--db_path",
        default="~/s2pdf_url_data/d65142df-6588-4b68-a12c-d468b3761189.csv.db",
        help="Path to the SQLite database containing PDF hash to URL mapping",
    )
    parser.add_argument(
        "--prolific_code",
        required=True,
        help="Fixed completion code to use for all outputs",
    )
    parser.add_argument(
        "--prolific_csv",
        default="prolific_codes.csv",
        help="Path to save the file with tinyhost links (one URL per line)",
    )
    parser.add_argument(
        "--read_results",
        help="Path to a CSV file containing previously generated tinyhost links to extract annotations",
    )
    # Passing argv through keeps the default (sys.argv) behavior while letting
    # other code and tests drive the parser with an explicit list.
    return parser.parse_args(argv)
2025-03-26 18:49:48 +00:00
2025-04-07 21:39:55 +00:00
# Fixed prolific code is now passed in as a command line argument
2025-04-04 17:12:46 +00:00
def obfuscate_code(code):
    """Lightly obfuscate the Prolific code so it is not plainly readable in page source.

    The code is base64-encoded and the resulting text is reversed. This is
    obfuscation only, not encryption.
    """
    b64_text = base64.b64encode(code.encode()).decode()
    # Reverse the base64 text character by character.
    return "".join(reversed(b64_text))
def deobfuscate_code(obfuscated_code):
    """Undo the reverse-then-base64 obfuscation applied to a Prolific code.

    Per the original note, the real decoding is done in JavaScript; this is
    the Python counterpart of that step.

    Args:
        obfuscated_code: Reversed base64 text.

    Returns:
        The decoded string, or the sentinel ``"ERROR_DECODING"`` when the
        input is not valid base64 or does not decode to UTF-8 text.
    """
    # Reverse and decode from base64
    reversed_encoded = obfuscated_code[::-1]
    try:
        return base64.b64decode(reversed_encoded).decode()
    except ValueError:
        # binascii.Error (bad padding / non-ASCII input) and UnicodeDecodeError
        # are both ValueError subclasses. A bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return "ERROR_DECODING"
2025-03-26 18:26:06 +00:00
def parse_pdf_hash(pretty_pdf_path: str) -> Optional[str]:
    """Extract the PDF hash from an ``s3://ai2-s2-pdfs/<4 hex>/<hex>.pdf`` path.

    Returns the four-character directory name concatenated with the file
    stem, or ``None`` when the path does not begin with that layout. The
    pattern is anchored at the start only, so trailing text after ``.pdf``
    is ignored.
    """
    found = re.match(r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf", pretty_pdf_path)
    if found is None:
        return None
    head, tail = found.groups()
    return head + tail
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
def get_original_url(pdf_hash: str, db_path: str) -> Optional[str]:
    """Look up the original URL for a PDF hash in the SQLite database.

    Args:
        pdf_hash: Hash to search for in the ``pdf_hash`` column of the
            ``pdf_mapping`` table. An empty or ``None`` value short-circuits
            to ``None``.
        db_path: Path to the SQLite file; a leading ``~`` is expanded.

    Returns:
        The ``uri`` value of the first matching row, or ``None`` when the
        hash is empty, the database file does not exist, no row matches, or
        the lookup fails. Failures are printed rather than raised, so this is
        a best-effort lookup.
    """
    if not pdf_hash:
        return None

    try:
        sqlite_db_path = os.path.expanduser(db_path)
        if not os.path.exists(sqlite_db_path):
            print(f"SQLite database not found at {sqlite_db_path}")
            return None

        conn = sqlite3.connect(sqlite_db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,))
            result = cursor.fetchone()
        finally:
            # Close the connection even when the query raises (e.g. missing
            # table); previously a failed lookup leaked the open connection.
            conn.close()

        if result:
            return result[0]
        return None
    except Exception as e:
        print(f"Error looking up URL for PDF hash {pdf_hash}: {e}")
        return None
def list_result_files(s3_client, workspace_path):
    """Collect S3 URIs of the JSON/JSONL files under ``<workspace>/results/``.

    Listing is paginated and stops after the first page that brings the
    total above 1000 files, so the result may contain slightly more than
    1000 entries.
    """
    bucket, prefix = parse_s3_path(workspace_path)
    results_prefix = os.path.join(prefix, "results").rstrip("/") + "/"

    collected = []
    listing = s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=results_prefix)
    for page in listing:
        if "Contents" in page:
            for entry in page["Contents"]:
                key = entry["Key"]
                if key.endswith((".jsonl", ".json")):
                    collected.append(f"s3://{bucket}/{key}")

        # Cap the amount of listing work; checked once per page.
        if len(collected) > 1000:
            break

    return collected
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
def get_random_pages(s3_client, result_files, count=30):
    """Sample up to ``count`` random pages from the workspace result files.

    Each attempt picks a random result file, a random non-blank line in it
    (one Dolma document per line), and a random entry of the document's
    ``attributes["pdf_page_numbers"]`` list. At most ``count * 3`` attempts
    are made, so fewer than ``count`` pages may be returned.

    Args:
        s3_client: Client passed through to ``get_s3_bytes``.
        result_files: Sequence of result-file paths to sample from.
        count: Number of pages wanted.

    Returns:
        List of ``(pdf_path, page_num, page_text, result_file)`` tuples.
    """
    random_pages = []

    # Try to collect the requested number of pages
    attempts = 0
    max_attempts = count * 3  # Allow extra attempts to handle potential failures

    while len(random_pages) < count and attempts < max_attempts:
        attempts += 1

        # Nothing to sample from: report once and stop.
        if not result_files:
            print("No result files found!")
            break

        result_file = random.choice(result_files)

        try:
            # Get the content of the file
            content = get_s3_bytes(s3_client, result_file)

            # str.split() never returns an empty list, so blank lines must be
            # dropped explicitly; otherwise an empty file or a stray blank
            # line reaches json.loads("") and is reported as an error.
            lines = [ln for ln in content.decode("utf-8").split("\n") if ln.strip()]
            if not lines:
                continue

            # Pick a random line (which contains a complete document)
            doc = json.loads(random.choice(lines))

            # A Dolma document has "text", "metadata", and "attributes" fields
            if "text" not in doc or "metadata" not in doc or "attributes" not in doc:
                print(f"Document in {result_file} is not a valid Dolma document")
                continue

            # Get the original PDF path from metadata
            pdf_path = doc["metadata"].get("Source-File")
            if not pdf_path:
                continue

            # Get page spans from attributes
            page_spans = doc["attributes"].get("pdf_page_numbers", [])
            if not page_spans:
                continue

            # Pick a random page span; spans shorter than 3 items are skipped
            page_span = random.choice(page_spans)
            if len(page_span) >= 3:
                # Page spans are [start_pos, end_pos, page_num]
                start_pos, end_pos, page_num = page_span[0], page_span[1], page_span[2]

                # Extract text for this page
                page_text = doc["text"][start_pos:end_pos].strip()

                # Include the text snippet with the page info
                random_pages.append((pdf_path, page_num, page_text, result_file))

                if len(random_pages) >= count:
                    break

        except Exception as e:
            # Best-effort sampling: a bad file or document only costs one attempt.
            print(f"Error processing {result_file}: {e}")
            continue

    print(f"Found {len(random_pages)} random pages from Dolma documents")
    return random_pages
2025-03-26 18:49:48 +00:00
def create_presigned_url(s3_client, pdf_path, expiration=3600 * 24 * 7):
    """Build a presigned ``get_object`` URL for an S3 path.

    ``expiration`` is forwarded as ``ExpiresIn`` (default ``3600 * 24 * 7``).
    Any failure is printed and reported to the caller as ``None``.
    """
    try:
        bucket, key = parse_s3_path(pdf_path)
        request_params = {"Bucket": bucket, "Key": key}
        return s3_client.generate_presigned_url("get_object", Params=request_params, ExpiresIn=expiration)
    except Exception as err:
        print(f"Error creating presigned URL for {pdf_path}: {err}")
        return None
2025-03-26 18:49:48 +00:00
2025-04-04 17:12:46 +00:00
def create_html_output ( random_pages , pdf_s3_client , output_path , workspace_path , db_path , prolific_code , resolution = 2048 ) :
2025-03-26 18:26:06 +00:00
""" Create an HTML file with rendered PDF pages. """
2025-04-04 17:12:46 +00:00
# Obfuscate the provided Prolific code
obfuscated_code = obfuscate_code ( prolific_code )
2025-04-04 17:18:19 +00:00
2025-03-26 18:26:06 +00:00
# Get current date and time for the report
current_time = datetime . datetime . now ( ) . strftime ( " % Y- % m- %d % H: % M: % S " )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
html_content = f """
< ! DOCTYPE html >
< html lang = " en " >
< head >
< meta charset = " UTF-8 " >
< meta name = " viewport " content = " width=device-width, initial-scale=1.0 " >
< title > OLMO OCR Samples < / title >
< link href = " https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap " rel = " stylesheet " >
< style >
: root { {
- - primary - color : #2563eb;
- - secondary - color : #4b5563;
- - border - color : #e5e7eb;
- - bg - color : #f9fafb;
- - text - color : #111827;
- - text - light : #6b7280;
- - card - shadow : 0 4 px 6 px - 1 px rgba ( 0 , 0 , 0 , 0.1 ) , 0 2 px 4 px - 1 px rgba ( 0 , 0 , 0 , 0.06 ) ;
2025-04-04 16:19:04 +00:00
- - success - color : #10b981;
2025-03-26 18:26:06 +00:00
} }
* { {
box - sizing : border - box ;
margin : 0 ;
padding : 0 ;
} }
body { {
font - family : ' Inter ' , sans - serif ;
line - height : 1.6 ;
color : var ( - - text - color ) ;
background - color : var ( - - bg - color ) ;
padding : 2 rem ;
2025-04-08 20:50:00 +00:00
display : flex ;
flex - direction : row ;
gap : 2 rem ;
2025-03-26 18:26:06 +00:00
} }
2025-04-07 20:27:32 +00:00
ul { {
margin - left : 2 em ;
} }
2025-03-26 18:26:06 +00:00
. container { {
2025-04-08 20:50:00 +00:00
flex : 2 ;
max - width : 750 px ;
2025-03-26 18:26:06 +00:00
} }
header { {
2025-04-08 20:50:00 +00:00
position : sticky ;
top : 2 rem ;
flex : 1 ;
min - width : 380 px ;
max - width : 420 px ;
max - height : calc ( 100 vh - 4 rem ) ;
overflow - y : auto ;
padding : 1.5 rem ;
background - color : white ;
border - radius : 0.5 rem ;
box - shadow : var ( - - card - shadow ) ;
align - self : flex - start ;
2025-04-07 20:27:32 +00:00
font - size : small ;
2025-03-26 18:26:06 +00:00
} }
2025-04-07 20:27:32 +00:00
header h2 { {
margin - top : 1 em ;
2025-03-26 18:26:06 +00:00
} }
2025-04-07 20:27:32 +00:00
2025-03-26 18:26:06 +00:00
. info - bar { {
background - color : white ;
padding : 1 rem ;
border - radius : 0.5 rem ;
margin - bottom : 2 rem ;
box - shadow : var ( - - card - shadow ) ;
display : flex ;
justify - content : space - between ;
flex - wrap : wrap ;
gap : 1 rem ;
} }
. info - item { {
flex : 1 ;
min - width : 200 px ;
} }
. info - item h3 { {
2025-04-01 20:03:15 +00:00
font - size : 0.6 rem ;
2025-03-26 18:26:06 +00:00
color : var ( - - text - light ) ;
margin - bottom : 0.25 rem ;
} }
. info - item p { {
2025-04-01 20:03:15 +00:00
font - size : 0.6 rem ;
2025-03-26 18:26:06 +00:00
} }
. page - grid { {
display : grid ;
2025-04-08 20:50:00 +00:00
grid - template - columns : 1 fr ;
2025-03-26 18:26:06 +00:00
gap : 2 rem ;
} }
. page - container { {
background - color : white ;
border - radius : 0.5 rem ;
overflow : hidden ;
box - shadow : var ( - - card - shadow ) ;
2025-04-04 19:38:59 +00:00
transition : all 0.3 s ease ;
} }
. page - container . editing { {
box - shadow : 0 0 0 3 px var ( - - primary - color ) , var ( - - card - shadow ) ;
2025-03-26 18:26:06 +00:00
} }
. page - info { {
padding : 1 rem ;
border - bottom : 1 px solid var ( - - border - color ) ;
} }
. page - info h2 { {
font - size : 1 rem ;
margin - bottom : 0.5 rem ;
white - space : nowrap ;
overflow : hidden ;
text - overflow : ellipsis ;
} }
. page - info p { {
font - size : 0.875 rem ;
color : var ( - - text - light ) ;
} }
. page - image - wrapper { {
padding : 1 rem ;
display : flex ;
justify - content : center ;
align - items : center ;
background - color : #f3f4f6;
} }
. page - image { {
max - width : 100 % ;
height : auto ;
border : 1 px solid var ( - - border - color ) ;
} }
. s3 - link { {
padding : 1 rem ;
background - color : #f8fafc;
border - top : 1 px solid var ( - - border - color ) ;
font - size : 0.875 rem ;
color : var ( - - secondary - color ) ;
word - break : break - all ;
} }
. s3 - link a { {
color : var ( - - primary - color ) ;
text - decoration : none ;
font - weight : 500 ;
} }
. s3 - link a : hover { {
text - decoration : underline ;
} }
2025-04-04 16:19:04 +00:00
/ * Annotation elements * /
. annotation - interface { {
display : none ; / * Hide annotation interface by default * /
margin - top : 1 rem ;
padding : 0.5 rem ;
border - top : 1 px solid var ( - - border - color ) ;
border - radius : 0.25 rem ;
background - color : #f8fafc;
} }
. annotation - interface . active { {
display : block ; / * Show only the active annotation interface * /
} }
2025-04-08 21:04:56 +00:00
. question - container { {
margin - bottom : 1 rem ;
} }
. question - text { {
font - weight : 500 ;
margin - bottom : 0.5 rem ;
} }
2025-04-04 16:19:04 +00:00
/ * Button group styling for connected buttons * /
2025-04-01 20:03:15 +00:00
. btn - group { {
display : inline - flex ;
margin - bottom : 0.5 rem ;
} }
. btn - group . toggle - button { {
padding : 0.5 rem 1 rem ;
border : 1 px solid var ( - - border - color ) ;
background - color : #f8fafc;
cursor : pointer ;
margin : 0 ;
/ * Remove individual border radius so we can set unified ones * /
border - radius : 0 ;
} }
. btn - group . toggle - button : first - child { {
border - right : none ;
border - top - left - radius : 0.25 rem ;
border - bottom - left - radius : 0.25 rem ;
} }
. btn - group . toggle - button : last - child { {
border - top - right - radius : 0.25 rem ;
border - bottom - right - radius : 0.25 rem ;
} }
2025-04-04 16:19:04 +00:00
. btn - group . toggle - button : not ( : first - child ) : not ( : last - child ) { {
border - right : none ;
2025-04-01 20:03:15 +00:00
} }
. toggle - button . active { {
background - color : var ( - - primary - color ) ;
color : white ;
2025-04-01 18:35:04 +00:00
} }
2025-04-08 21:04:56 +00:00
. checkbox - group { {
display : flex ;
flex - wrap : wrap ;
gap : 0.5 rem ;
margin - bottom : 1 rem ;
} }
. checkbox - group label { {
display : flex ;
align - items : center ;
padding : 0.25 rem 0.5 rem ;
background - color : #f1f5f9;
border - radius : 0.25 rem ;
cursor : pointer ;
font - size : 0.875 rem ;
} }
. checkbox - group label : hover { {
background - color : #e2e8f0;
} }
. checkbox - group input [ type = " checkbox " ] { {
margin - right : 0.5 rem ;
} }
. continue - button { {
padding : 0.5 rem 1 rem ;
background - color : var ( - - primary - color ) ;
color : white ;
border : none ;
border - radius : 0.25 rem ;
cursor : pointer ;
font - weight : 500 ;
} }
. continue - button : hover { {
background - color : #1d4ed8;
} }
2025-04-04 16:19:04 +00:00
. annotation - interface textarea { {
display : none ; / * Hide textarea by default * /
2025-04-01 18:35:04 +00:00
width : 100 % ;
margin - top : 0.5 rem ;
2025-04-08 21:04:56 +00:00
margin - bottom : 1 rem ;
2025-04-01 18:35:04 +00:00
padding : 0.5 rem ;
font - size : 0.875 rem ;
border : 1 px solid var ( - - border - color ) ;
border - radius : 0.25 rem ;
} }
2025-04-04 16:19:04 +00:00
. annotation - status { {
display : inline - block ;
margin - left : 1 rem ;
padding : 0.25 rem 0.5 rem ;
border - radius : 0.25 rem ;
font - size : 0.75 rem ;
font - weight : 600 ;
} }
. status - complete { {
background - color : #ecfdf5;
color : var ( - - success - color ) ;
2025-04-04 19:38:59 +00:00
cursor : pointer ;
transition : all 0.2 s ease ;
} }
. status - complete : hover { {
background - color : #d1fae5;
box - shadow : 0 0 0 2 px rgba ( 16 , 185 , 129 , 0.3 ) ;
2025-04-04 16:19:04 +00:00
} }
. status - pending { {
background - color : #fff7ed;
color : #ea580c;
} }
. status - current { {
background - color : #eff6ff;
color : var ( - - primary - color ) ;
animation : pulse 2 s infinite ;
} }
@keyframes pulse { {
0 % { { opacity : 0.6 ; } }
50 % { { opacity : 1 ; } }
100 % { { opacity : 0.6 ; } }
} }
2025-03-26 18:26:06 +00:00
. error { {
color : #dc2626;
padding : 1 rem ;
background - color : #fee2e2;
border - radius : 0.25 rem ;
} }
2025-04-04 21:41:36 +00:00
2025-04-04 16:19:04 +00:00
. completion - message { {
display : none ;
margin : 2 rem auto ;
padding : 1.5 rem ;
background - color : #ecfdf5;
border : 1 px solid #A7F3D0;
border - radius : 0.5 rem ;
text - align : center ;
color : var ( - - success - color ) ;
font - weight : 600 ;
max - width : 500 px ;
} }
2025-03-26 18:26:06 +00:00
footer { {
margin - top : 3 rem ;
text - align : center ;
color : var ( - - text - light ) ;
font - size : 0.875 rem ;
border - top : 1 px solid var ( - - border - color ) ;
padding - top : 1 rem ;
} }
@media ( max - width : 768 px ) { {
body { {
padding : 1 rem ;
2025-04-08 20:50:00 +00:00
flex - direction : column ;
} }
header { {
position : static ;
max - width : 100 % ;
margin - left : 0 ;
margin - bottom : 2 rem ;
2025-03-26 18:26:06 +00:00
} }
2025-04-08 20:50:00 +00:00
. container { {
max - width : 100 % ;
2025-03-26 18:26:06 +00:00
} }
} }
< / style >
< / head >
< body >
2025-04-08 20:50:00 +00:00
< header >
2025-04-07 20:27:32 +00:00
< h2 > Task Instructions < / h2 >
2025-04-08 20:50:00 +00:00
< p > Your task is to review { len ( random_pages ) } document pages and determine whether they contain any < strong > Personally Identifiable Information ( PII ) < / strong > . Carefully but efficiently inspect each page and select the appropriate response . You do not need to read every word - quickly scan the page and look for any obvious PII . The time expected to complete this task is 10 - 15 minutes . < / p >
< h2 > How to Annotate < / h2 >
2025-04-08 21:04:56 +00:00
< p > The page you are currently annotating will be highlighted with a blue outline and a set of questions will be displayed directly below it . < / p >
2025-04-08 20:50:00 +00:00
< br / >
2025-04-08 21:04:56 +00:00
< p > < strong > First question : < / strong > Is this document meant for public dissemination ? < / p >
< ul >
< li > < strong > Yes < / strong > - If the document appears to be a publication , research paper , public information , etc . < / li >
< li > < strong > No < / strong > - If the document appears to be private , personal , or not intended for public release < / li >
2025-04-08 22:30:59 +00:00
< li > < strong > Cannot Read < / strong > - If you are unable to read the page ( e . g . , foreign language , poor quality ) < / li >
2025-04-08 21:04:56 +00:00
< li > < strong > Report Content < / strong > - If the content is inappropriate or disturbing < / li >
< / ul >
< p > < strong > Second question : < / strong > Depending on your first answer , you ' ll be asked to identify any PII in the document:</p>
< ul >
< li > For < strong > public < / strong > documents , select from : SSN , Bank Info , Credit Card Info , Usernames / Passwords , Other < / li >
< li > For < strong > private < / strong > documents , select from : Full Names , Addresses , Contact Info , Personal Attributes , SSN , Bank Info , Credit Card Info , Usernames / Passwords , Other < / li >
< / ul >
< p > You can select multiple PII types . If you select " Other " , a text box will appear where you can describe the PII . < / p >
2025-04-08 20:50:00 +00:00
< br / >
< p > You may edit your annotations any time before submitting . To do so , press the green Edit button directly above the page . < / p >
< p > After completing all the document pages on this screen , you will receive a Prolific completion code . < / p >
< h2 > What Counts as PII ? < / h2 >
< ul >
< li > < strong > Names < / strong > : Full names , first names , last names , nicknames , maiden names , aliases < / li >
< li > < strong > Addresses < / strong > : Street addresses , postal codes , cities , states , countries < / li >
< li > < strong > Contact Information < / strong > : Phone numbers , email addresses < / li >
< li > < strong > Government IDs < / strong > : SSNs , passport numbers , driver ' s license numbers, tax IDs</li>
< li > < strong > Financial Information < / strong > : Credit card numbers , bank account numbers , routing numbers < / li >
< li > < strong > Biometric Data < / strong > : Fingerprints , retina scans , facial recognition data , voice signatures < / li >
< li > < strong > Personal Attributes < / strong > : Date of birth , place of birth , gender , race , religion < / li >
< li > < strong > Online Identifiers < / strong > : IP addresses , login IDs , usernames , passwords , API keys , URLs < / li >
< li > < strong > Location Information < / strong > : Geolocations , specific coordinates < / li >
< li > < strong > Employment Information < / strong > : Job titles , workplace names , employment history < / li >
< li > < strong > Education Information < / strong > : School names , degrees , transcripts < / li >
< li > < strong > Medical Information < / strong > : Health records , diagnoses < / li >
< li > < strong > Company Names < / strong > : If they are tied to an individual ' s identity (e.g., a person ' s personal business ) < / li >
< / ul >
< h2 > What NOT to Mark as PII < / h2 >
< p > < strong > Author names , researcher names , citations , or references from published research papers < / strong > should NOT be marked as PII . These names are part of the normal publication process and are not considered private or sensitive information for the purposes of this task .
Only mark information as PII if it relates to private , sensitive , or personal details about an individual outside the context of the publication . < / p >
< / header >
< div class = " container " >
2025-03-26 18:26:06 +00:00
< div class = " info-bar " >
< div class = " info-item " >
< h3 > Generated On < / h3 >
< p > { current_time } < / p >
< / div >
< div class = " info-item " >
< h3 > Workspace < / h3 >
< p title = " {workspace_path} " > { workspace_path } < / p >
< / div >
< div class = " info-item " >
< h3 > Sample Size < / h3 >
< p > { len ( random_pages ) } pages < / p >
< / div >
< / div >
< div class = " page-grid " >
"""
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
for i , ( pdf_path , page_num , page_text , result_file ) in enumerate ( tqdm ( random_pages , desc = " Rendering pages " ) ) :
# Get original URL from PDF hash
pdf_hash = parse_pdf_hash ( pdf_path )
original_url = get_original_url ( pdf_hash , db_path ) if pdf_hash else None
# Create a truncated path for display
display_path = pdf_path
if len ( display_path ) > 60 :
display_path = " ... " + display_path [ - 57 : ]
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
# Generate presigned URL
presigned_url = create_presigned_url ( pdf_s3_client , pdf_path )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
try :
# Download PDF to temp file
bucket , key = parse_s3_path ( pdf_path )
with tempfile . NamedTemporaryFile ( suffix = " .pdf " , delete = False ) as temp_file :
2025-03-26 18:49:48 +00:00
pdf_data = pdf_s3_client . get_object ( Bucket = bucket , Key = key ) [ " Body " ] . read ( )
2025-03-26 18:26:06 +00:00
temp_file . write ( pdf_data )
temp_file_path = temp_file . name
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
# Render PDF to base64 webp
2025-04-01 18:25:40 +00:00
base64_image = render_pdf_to_base64webp ( temp_file_path , page_num , resolution )
2025-04-04 16:19:04 +00:00
# Add CSS class for the first annotation interface to be active by default
active_class = " active " if i == 0 else " "
# Add to HTML with the annotation interface
2025-04-01 18:25:40 +00:00
html_content + = f """
2025-04-04 16:19:04 +00:00
< div class = " page-container " data - index = " {i} " >
2025-04-01 18:25:40 +00:00
< div class = " page-info " >
2025-04-08 22:30:59 +00:00
< p > { f ' <a href= " { presigned_url } #page= { page_num } " target= " _blank " >View Cached PDF (page { page_num } )</a> ' if presigned_url else pdf_path } < / p >
2025-04-04 16:19:04 +00:00
< p >
Status : < span class = " annotation-status status-pending " id = " status- {i} " > Pending < / span >
< / p >
2025-03-26 18:26:06 +00:00
< / div >
2025-04-01 18:25:40 +00:00
< div class = " page-image-wrapper " >
< img class = " page-image " src = " data:image/webp;base64, {base64_image} " alt = " PDF Page {page_num} " loading = " lazy " / >
2025-03-26 18:26:06 +00:00
< / div >
2025-04-04 19:36:10 +00:00
< div class = " annotation-interface {active_class} " data - id = " page- {i} " data - pdf - path = " {pdf_path} " >
2025-04-08 21:04:56 +00:00
< div class = " question-container " id = " question1- {i} " >
< p class = " question-text " > Is this document meant for public dissemination ? < / p >
< span class = " btn-group " >
< button type = " button " class = " toggle-button primary-option " data - value = " yes-public " onclick = " togglePrimaryOption(this, {i} ) " > Yes < / button >
< button type = " button " class = " toggle-button primary-option " data - value = " no-public " onclick = " togglePrimaryOption(this, {i} ) " > No < / button >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " cannot-read " onclick = " togglePrimaryOption(this, {i} ) " > Cannot Read < / button >
2025-04-08 21:04:56 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " report-content " onclick = " togglePrimaryOption(this, {i} ) " > Report Content < / button >
< / span >
< / div >
< div class = " question-container " id = " public-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this public document : < / p >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " ssn " onchange = " saveCheckboxes(this) " > SSN < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " bank-info " onchange = " saveCheckboxes(this) " > Bank Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " credit-card " onchange = " saveCheckboxes(this) " > Credit Card Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " usernames-passwords " onchange = " saveCheckboxes(this) " > Usernames / Passwords < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
< textarea id = " other-pii-public- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
< div class = " question-container " id = " private-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this private document : < / p >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " full-names " onchange = " saveCheckboxes(this) " > Full Names < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " addresses " onchange = " saveCheckboxes(this) " > Addresses < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " contact-info " onchange = " saveCheckboxes(this) " > Contact Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " personal-attributes " onchange = " saveCheckboxes(this) " > Personal Attributes < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " ssn " onchange = " saveCheckboxes(this) " > SSN < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " bank-info " onchange = " saveCheckboxes(this) " > Bank Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " credit-card " onchange = " saveCheckboxes(this) " > Credit Card Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " usernames-passwords " onchange = " saveCheckboxes(this) " > Usernames / Passwords < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
< textarea id = " other-pii-private- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
2025-04-04 16:19:04 +00:00
< / div >
2025-04-01 18:25:40 +00:00
< / div >
"""
2025-03-26 18:26:06 +00:00
# Clean up temp file
os . unlink ( temp_file_path )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
except Exception as e :
2025-04-04 16:19:04 +00:00
# Add CSS class for the first annotation interface to be active by default
active_class = " active " if i == 0 else " "
2025-04-04 17:18:19 +00:00
2025-03-26 18:26:06 +00:00
html_content + = f """
2025-04-04 16:19:04 +00:00
< div class = " page-container " data - index = " {i} " >
2025-03-26 18:26:06 +00:00
< div class = " page-info " >
2025-04-08 22:30:59 +00:00
< p > { f ' <a href= " { presigned_url } #page= { page_num } " target= " _blank " >View Cached PDF (page { page_num } )</a> ' if presigned_url else pdf_path } < / p >
2025-04-04 16:19:04 +00:00
< p >
Status : < span class = " annotation-status status-pending " id = " status- {i} " > Pending < / span >
< / p >
2025-03-26 18:26:06 +00:00
< / div >
< div class = " error " > Error : { str ( e ) } < / div >
2025-04-04 19:36:10 +00:00
< div class = " annotation-interface {active_class} " data - id = " page- {i} " data - pdf - path = " {pdf_path} " >
2025-04-08 21:04:56 +00:00
< div class = " question-container " id = " question1- {i} " >
< p class = " question-text " > Is this document meant for public dissemination ? < / p >
< span class = " btn-group " >
< button type = " button " class = " toggle-button primary-option " data - value = " yes-public " onclick = " togglePrimaryOption(this, {i} ) " > Yes < / button >
< button type = " button " class = " toggle-button primary-option " data - value = " no-public " onclick = " togglePrimaryOption(this, {i} ) " > No < / button >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " cannot-read " onclick = " togglePrimaryOption(this, {i} ) " > Cannot Read < / button >
2025-04-08 21:04:56 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " report-content " onclick = " togglePrimaryOption(this, {i} ) " > Report Content < / button >
< / span >
< / div >
< div class = " question-container " id = " public-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this public document : < / p >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " ssn " onchange = " saveCheckboxes(this) " > SSN < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " bank-info " onchange = " saveCheckboxes(this) " > Bank Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " credit-card " onchange = " saveCheckboxes(this) " > Credit Card Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " usernames-passwords " onchange = " saveCheckboxes(this) " > Usernames / Passwords < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
< textarea id = " other-pii-public- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
< div class = " question-container " id = " private-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this private document : < / p >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " full-names " onchange = " saveCheckboxes(this) " > Full Names < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " addresses " onchange = " saveCheckboxes(this) " > Addresses < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " contact-info " onchange = " saveCheckboxes(this) " > Contact Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " personal-attributes " onchange = " saveCheckboxes(this) " > Personal Attributes < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " ssn " onchange = " saveCheckboxes(this) " > SSN < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " bank-info " onchange = " saveCheckboxes(this) " > Bank Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " credit-card " onchange = " saveCheckboxes(this) " > Credit Card Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " usernames-passwords " onchange = " saveCheckboxes(this) " > Usernames / Passwords < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
< textarea id = " other-pii-private- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
2025-04-04 16:19:04 +00:00
< / div >
2025-03-26 18:26:06 +00:00
< / div >
"""
2025-03-26 18:49:48 +00:00
2025-04-04 17:18:19 +00:00
html_content + = (
"""
2025-03-26 18:26:06 +00:00
< / div >
2025-04-04 16:19:04 +00:00
< div class = " completion-message " id = " completion-message " >
2025-04-04 17:12:46 +00:00
Thank you ! All annotations are complete . < br >
Your Prolific completion code is : < strong id = " prolific-code " > Loading . . . < / strong >
2025-04-04 16:19:04 +00:00
< / div >
2025-04-04 17:12:46 +00:00
< ! - - Store the obfuscated code in a hidden element - - >
2025-04-04 17:18:19 +00:00
< div id = " obfuscated-code " style = " display:none; " > """
+ obfuscated_code
+ """ </div>
2025-04-04 16:19:04 +00:00
2025-03-26 18:26:06 +00:00
< / div >
2025-04-01 18:35:04 +00:00
< script >
/ / Using externally injected async functions : fetchDatastore ( ) and putDatastore ( )
2025-04-04 16:19:04 +00:00
/ / Track annotation progress
let currentIndex = 0 ;
const totalPages = document . querySelectorAll ( ' .page-container ' ) . length ;
2025-04-04 21:41:36 +00:00
2025-04-04 16:19:04 +00:00
/ / Update progress bar
function updateProgressBar ( ) {
/ / Check if all annotations are complete
2025-04-04 17:53:26 +00:00
if ( currentIndex > = totalPages ) {
2025-04-04 16:19:04 +00:00
document . getElementById ( ' completion-message ' ) . style . display = ' block ' ;
}
}
/ / Update status indicators
function updateStatusIndicators ( ) {
/ / Reset all status indicators
document . querySelectorAll ( ' .annotation-status ' ) . forEach ( function ( status ) {
status . className = ' annotation-status status-pending ' ;
status . textContent = ' Pending ' ;
2025-04-04 19:38:59 +00:00
/ / Remove any click handlers
status . onclick = null ;
2025-04-04 16:19:04 +00:00
} ) ;
/ / Set current item status
const currentStatus = document . getElementById ( ` status - $ { currentIndex } ` ) ;
if ( currentStatus ) {
currentStatus . className = ' annotation-status status-current ' ;
currentStatus . textContent = ' Current ' ;
}
/ / Update completed statuses
for ( let i = 0 ; i < currentIndex ; i + + ) {
const status = document . getElementById ( ` status - $ { i } ` ) ;
if ( status ) {
status . className = ' annotation-status status-complete ' ;
2025-04-04 19:38:59 +00:00
status . textContent = ' Edit ✎ ' ;
/ / Add click handler to edit this annotation
status . onclick = function ( ) { editAnnotation ( i ) ; } ;
2025-04-04 16:19:04 +00:00
}
}
}
2025-04-04 19:38:59 +00:00
/ / Function to enable editing a previously completed annotation
function editAnnotation ( index ) {
/ / Hide current annotation interface
document . querySelector ( ` . annotation - interface [ data - id = " page-$ {currentIndex} " ] ` ) . classList . remove ( ' active ' ) ;
/ / Remove editing class from all containers
document . querySelectorAll ( ' .page-container ' ) . forEach ( container = > {
container . classList . remove ( ' editing ' ) ;
} ) ;
/ / Show the selected annotation interface
document . querySelector ( ` . annotation - interface [ data - id = " page-$ {index} " ] ` ) . classList . add ( ' active ' ) ;
/ / Add editing class to the container being edited
const activeContainer = document . querySelector ( ` . page - container [ data - index = " $ {index} " ] ` ) ;
if ( activeContainer ) {
activeContainer . classList . add ( ' editing ' ) ;
activeContainer . scrollIntoView ( { behavior : ' smooth ' , block : ' center ' } ) ;
}
/ / Update current index
currentIndex = index ;
updateProgressBar ( ) ;
updateStatusIndicators ( ) ;
}
2025-04-04 16:19:04 +00:00
/ / Navigate to the next document
function goToNextDocument ( ) {
/ / Hide current annotation interface
document . querySelector ( ` . annotation - interface [ data - id = " page-$ {currentIndex} " ] ` ) . classList . remove ( ' active ' ) ;
2025-04-04 19:38:59 +00:00
/ / Remove editing class from all containers
document . querySelectorAll ( ' .page-container ' ) . forEach ( container = > {
container . classList . remove ( ' editing ' ) ;
} ) ;
2025-04-04 16:19:04 +00:00
/ / Move to next document if not at the end
if ( currentIndex < totalPages - 1 ) {
currentIndex + + ;
document . querySelector ( ` . annotation - interface [ data - id = " page-$ {currentIndex} " ] ` ) . classList . add ( ' active ' ) ;
2025-04-04 19:38:59 +00:00
/ / Add editing class to current container
2025-04-04 16:19:04 +00:00
const activeContainer = document . querySelector ( ` . page - container [ data - index = " $ {currentIndex} " ] ` ) ;
if ( activeContainer ) {
2025-04-04 19:38:59 +00:00
activeContainer . classList . add ( ' editing ' ) ;
2025-04-04 16:19:04 +00:00
activeContainer . scrollIntoView ( { behavior : ' smooth ' , block : ' center ' } ) ;
}
2025-04-04 19:38:59 +00:00
updateProgressBar ( ) ;
updateStatusIndicators ( ) ;
2025-04-04 16:19:04 +00:00
}
2025-04-04 17:53:26 +00:00
else {
/ / This was the last document , mark as complete
currentIndex = totalPages ;
updateProgressBar ( ) ;
updateStatusIndicators ( ) ;
/ / Show completion message and scroll to it
document . getElementById ( ' completion-message ' ) . style . display = ' block ' ;
document . getElementById ( ' completion-message ' ) . scrollIntoView ( { behavior : ' smooth ' , block : ' center ' } ) ;
}
2025-04-04 16:19:04 +00:00
}
/ / Handle text area keydown for Enter key
function handleTextareaKeydown ( event , textarea ) {
/ / If Enter key is pressed and not with Shift key , move to next document
if ( event . key == = ' Enter ' & & ! event . shiftKey ) {
event . preventDefault ( ) ;
2025-04-08 22:30:59 +00:00
saveFeedback ( textarea ) . then ( ( ) = > {
goToNextDocument ( ) ;
} ) ;
2025-04-04 16:19:04 +00:00
}
}
2025-04-01 20:03:15 +00:00
async function saveFeedback ( source ) {
2025-04-04 16:19:04 +00:00
const interfaceDiv = source . closest ( ' .annotation-interface ' ) ;
const id = interfaceDiv . getAttribute ( ' data-id ' ) ;
2025-04-08 21:04:56 +00:00
/ / Get the selected primary option
const activePrimaryButton = interfaceDiv . querySelector ( ' button.primary-option.active ' ) ;
const primaryOption = activePrimaryButton ? activePrimaryButton . getAttribute ( ' data-value ' ) : null ;
/ / Get checkbox selections for public document
const publicPiiOptions = [ ] ;
interfaceDiv . querySelectorAll ( ' #public-pii-options- ' + id . split ( ' - ' ) [ 1 ] + ' input[type= " checkbox " ]:checked ' ) . forEach ( checkbox = > {
publicPiiOptions . push ( checkbox . getAttribute ( ' data-value ' ) ) ;
} ) ;
/ / Get checkbox selections for private document
const privatePiiOptions = [ ] ;
interfaceDiv . querySelectorAll ( ' #private-pii-options- ' + id . split ( ' - ' ) [ 1 ] + ' input[type= " checkbox " ]:checked ' ) . forEach ( checkbox = > {
privatePiiOptions . push ( checkbox . getAttribute ( ' data-value ' ) ) ;
} ) ;
/ / Get any " Other " descriptions
const otherPublicDesc = interfaceDiv . querySelector ( ' #other-pii-public- ' + id . split ( ' - ' ) [ 1 ] ) ? . value | | ' ' ;
const otherPrivateDesc = interfaceDiv . querySelector ( ' #other-pii-private- ' + id . split ( ' - ' ) [ 1 ] ) ? . value | | ' ' ;
2025-04-04 19:36:10 +00:00
const pdfPath = interfaceDiv . getAttribute ( ' data-pdf-path ' ) ;
2025-04-01 18:35:04 +00:00
const datastore = await fetchDatastore ( ) | | { } ;
datastore [ id ] = {
2025-04-08 21:04:56 +00:00
primaryOption : primaryOption ,
publicPiiOptions : publicPiiOptions ,
privatePiiOptions : privatePiiOptions ,
otherPublicDesc : otherPublicDesc ,
otherPrivateDesc : otherPrivateDesc ,
2025-04-04 19:36:10 +00:00
pdfPath : pdfPath
2025-04-01 18:35:04 +00:00
} ;
await putDatastore ( datastore ) ;
}
2025-04-08 22:30:59 +00:00
function saveThenNext ( btn ) {
const interfaceDiv = btn . closest ( ' .annotation-interface ' ) ;
saveFeedback ( interfaceDiv ) . then ( ( ) = > {
goToNextDocument ( ) ;
} ) ;
}
2025-04-08 21:04:56 +00:00
function togglePrimaryOption ( btn , index ) {
2025-04-04 16:19:04 +00:00
const interfaceDiv = btn . closest ( ' .annotation-interface ' ) ;
2025-04-08 21:04:56 +00:00
/ / Remove active class from all primary option buttons in this group
interfaceDiv . querySelectorAll ( ' button.primary-option ' ) . forEach ( function ( b ) {
2025-04-01 20:03:15 +00:00
b . classList . remove ( ' active ' ) ;
} ) ;
2025-04-08 21:04:56 +00:00
2025-04-01 20:03:15 +00:00
/ / Toggle on the clicked button
btn . classList . add ( ' active ' ) ;
2025-04-04 16:19:04 +00:00
2025-04-08 21:04:56 +00:00
/ / Hide all secondary option containers
document . querySelector ( ` #public-pii-options-${index}`).style.display = 'none';
document . querySelector ( ` #private-pii-options-${index}`).style.display = 'none';
2025-04-08 22:30:59 +00:00
/ / Immediately save the primary option selection
saveFeedback ( interfaceDiv ) ;
2025-04-08 21:04:56 +00:00
const option = btn . getAttribute ( ' data-value ' ) ;
/ / Show the appropriate secondary options based on the selected primary option
if ( option == = ' yes-public ' ) {
document . querySelector ( ` #public-pii-options-${index}`).style.display = 'block';
} else if ( option == = ' no-public ' ) {
document . querySelector ( ` #private-pii-options-${index}`).style.display = 'block';
} else {
/ / For " cannot-read " or " report-content " , just save and move to next
goToNextDocument ( ) ;
}
}
function toggleOtherTextarea ( checkbox ) {
const container = checkbox . closest ( ' .question-container ' ) ;
const textareaId = container . querySelector ( ' textarea ' ) . id ;
const textarea = document . getElementById ( textareaId ) ;
2025-04-04 16:19:04 +00:00
2025-04-08 21:04:56 +00:00
if ( checkbox . checked ) {
2025-04-04 16:19:04 +00:00
textarea . style . display = ' block ' ;
textarea . focus ( ) ;
} else {
textarea . style . display = ' none ' ;
}
2025-04-08 21:04:56 +00:00
saveCheckboxes ( checkbox ) ;
}
function saveCheckboxes ( input ) {
const interfaceDiv = input . closest ( ' .annotation-interface ' ) ;
2025-04-08 22:30:59 +00:00
return saveFeedback ( interfaceDiv ) ;
2025-04-01 20:03:15 +00:00
}
2025-04-04 17:12:46 +00:00
/ / Function to deobfuscate the Prolific code
function deobfuscateCode ( obfuscatedCode ) {
/ / Reverse the string
const reversed = obfuscatedCode . split ( ' ' ) . reverse ( ) . join ( ' ' ) ;
/ / Decode from base64
try {
return atob ( reversed ) ;
} catch ( e ) {
return " ERROR_DECODING " ;
}
}
2025-04-01 18:35:04 +00:00
document . addEventListener ( " DOMContentLoaded " , async function ( ) {
const datastore = await fetchDatastore ( ) | | { } ;
2025-04-04 19:38:59 +00:00
/ / Add editing class to the first container by default
const firstContainer = document . querySelector ( ` . page - container [ data - index = " 0 " ] ` ) ;
if ( firstContainer ) {
firstContainer . classList . add ( ' editing ' ) ;
}
2025-04-04 16:19:04 +00:00
updateProgressBar ( ) ;
updateStatusIndicators ( ) ;
2025-04-04 17:12:46 +00:00
/ / Get and deobfuscate the Prolific code
const obfuscatedCode = document . getElementById ( ' obfuscated-code ' ) . textContent ;
const prolificCode = deobfuscateCode ( obfuscatedCode ) ;
document . getElementById ( ' prolific-code ' ) . textContent = prolificCode ;
2025-04-04 16:19:04 +00:00
document . querySelectorAll ( ' .annotation-interface ' ) . forEach ( function ( interfaceDiv ) {
const id = interfaceDiv . getAttribute ( ' data-id ' ) ;
2025-04-08 21:04:56 +00:00
const pageIndex = id . split ( ' - ' ) [ 1 ] ;
2025-04-01 18:35:04 +00:00
if ( datastore [ id ] ) {
const data = datastore [ id ] ;
2025-04-08 21:04:56 +00:00
/ / Set active state for primary option buttons
interfaceDiv . querySelectorAll ( ' button.primary-option ' ) . forEach ( function ( btn ) {
if ( btn . getAttribute ( ' data-value ' ) == = data . primaryOption ) {
2025-04-01 20:03:15 +00:00
btn . classList . add ( ' active ' ) ;
2025-04-04 16:19:04 +00:00
2025-04-08 21:04:56 +00:00
/ / Show the appropriate secondary options
const option = btn . getAttribute ( ' data-value ' ) ;
if ( option == = ' yes-public ' ) {
document . querySelector ( ` #public-pii-options-${pageIndex}`).style.display = 'block';
} else if ( option == = ' no-public ' ) {
document . querySelector ( ` #private-pii-options-${pageIndex}`).style.display = 'block';
2025-04-04 16:19:04 +00:00
}
2025-04-01 20:03:15 +00:00
} else {
btn . classList . remove ( ' active ' ) ;
}
} ) ;
2025-04-08 21:04:56 +00:00
/ / Restore public PII checkboxes
if ( data . publicPiiOptions & & data . publicPiiOptions . length > 0 ) {
const publicContainer = document . querySelector ( ` #public-pii-options-${pageIndex}`);
data . publicPiiOptions . forEach ( option = > {
const checkbox = publicContainer . querySelector ( ` input [ data - value = " $ {option} " ] ` ) ;
if ( checkbox ) {
checkbox . checked = true ;
if ( option == = ' other ' ) {
document . getElementById ( ` other - pii - public - $ { pageIndex } ` ) . style . display = ' block ' ;
}
}
} ) ;
}
/ / Restore private PII checkboxes
if ( data . privatePiiOptions & & data . privatePiiOptions . length > 0 ) {
const privateContainer = document . querySelector ( ` #private-pii-options-${pageIndex}`);
data . privatePiiOptions . forEach ( option = > {
const checkbox = privateContainer . querySelector ( ` input [ data - value = " $ {option} " ] ` ) ;
if ( checkbox ) {
checkbox . checked = true ;
if ( option == = ' other ' ) {
document . getElementById ( ` other - pii - private - $ { pageIndex } ` ) . style . display = ' block ' ;
}
}
} ) ;
}
/ / Set the textarea values
if ( data . otherPublicDesc ) {
document . getElementById ( ` other - pii - public - $ { pageIndex } ` ) . value = data . otherPublicDesc ;
}
if ( data . otherPrivateDesc ) {
document . getElementById ( ` other - pii - private - $ { pageIndex } ` ) . value = data . otherPrivateDesc ;
}
2025-04-01 18:35:04 +00:00
}
} ) ;
2025-04-04 16:19:04 +00:00
/ / If we have stored data , restore the current position
let lastAnnotatedIndex = - 1 ;
for ( let i = 0 ; i < totalPages ; i + + ) {
const pageId = ` page - $ { i } ` ;
2025-04-08 21:04:56 +00:00
if ( datastore [ pageId ] & & datastore [ pageId ] . primaryOption ) {
2025-04-04 16:19:04 +00:00
lastAnnotatedIndex = i ;
}
}
/ / If we have annotated pages , go to the first unannotated page
2025-04-07 20:27:32 +00:00
if ( lastAnnotatedIndex > = 0 ) {
2025-04-04 16:19:04 +00:00
document . querySelector ( ` . annotation - interface . active ` ) . classList . remove ( ' active ' ) ;
2025-04-07 20:27:32 +00:00
/ / Check if all pages are annotated
if ( lastAnnotatedIndex == = totalPages - 1 ) {
/ / All pages are annotated , set currentIndex to totalPages to trigger completion
currentIndex = totalPages ;
/ / Show completion message and scroll to it
document . getElementById ( ' completion-message ' ) . style . display = ' block ' ;
document . getElementById ( ' completion-message ' ) . scrollIntoView ( { behavior : ' smooth ' , block : ' center ' } ) ;
} else {
/ / Go to the next unannotated page
currentIndex = lastAnnotatedIndex + 1 ;
document . querySelector ( ` . annotation - interface [ data - id = " page-$ {currentIndex} " ] ` ) . classList . add ( ' active ' ) ;
/ / Add editing class and scroll to the active annotation
const activeContainer = document . querySelector ( ` . page - container [ data - index = " $ {currentIndex} " ] ` ) ;
if ( activeContainer ) {
/ / Remove editing class from all containers first
document . querySelectorAll ( ' .page-container ' ) . forEach ( container = > {
container . classList . remove ( ' editing ' ) ;
} ) ;
/ / Add editing class to current container
activeContainer . classList . add ( ' editing ' ) ;
activeContainer . scrollIntoView ( { behavior : ' smooth ' , block : ' center ' } ) ;
}
2025-04-04 16:19:04 +00:00
}
updateProgressBar ( ) ;
updateStatusIndicators ( ) ;
}
2025-04-01 18:35:04 +00:00
} ) ;
< / script >
2025-03-26 18:26:06 +00:00
< / body >
< / html >
"""
2025-04-04 17:18:19 +00:00
)
2025-03-26 18:49:48 +00:00
with open ( output_path , " w " ) as f :
2025-03-26 18:26:06 +00:00
f . write ( html_content )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
print ( f " Created HTML output at { output_path } " )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
def generate_sample_set(args, i, s3_client, pdf_s3_client, result_files):
    """Render one HTML sample page and return the path it was written to.

    ``i`` is the zero-based index of the sample set; the file name and the
    progress message both use the one-based number ``i + 1``.
    """
    set_number = i + 1
    html_path = Path(args.output_dir) / f"dolma_samples_{set_number}.html"

    print(f"\nGenerating sample set {set_number} of {args.repeats}")

    # Draw the pages that will appear in this output file.
    sampled_pages = get_random_pages(s3_client, result_files, args.pages_per_output)

    # Every output embeds the same completion code supplied on the command line.
    create_html_output(
        sampled_pages,
        pdf_s3_client,
        html_path,
        args.workspace,
        args.db_path,
        args.prolific_code,
    )

    return html_path
2025-03-26 18:26:06 +00:00
2025-03-26 18:49:48 +00:00
2025-04-04 19:44:54 +00:00
def extract_datastore_url(html_content: str) -> Optional[str]:
    """Return the string assigned to ``const presignedGetUrl`` in the page, or None.

    Only a non-empty, double-quoted value is recognised.
    """
    found = re.search(r'const\s+presignedGetUrl\s*=\s*"([^"]+)"', html_content)
    return found.group(1) if found else None
def fetch_annotations(tinyhost_link: str, timeout: float = 30.0) -> Tuple[Dict[str, Any], str]:
    """Fetch and parse annotations from a tinyhost link.

    Args:
        tinyhost_link: URL of a hosted annotation page.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            single unresponsive host cannot stall a whole batch.

    Returns:
        ``(annotations, tinyhost_link)``. ``annotations`` is the datastore's
        JSON object, or ``{}`` when the page has no datastore URL, the
        datastore cannot be fetched or parsed, or its payload is not an object.

    Raises:
        requests.RequestException: If the annotation page itself cannot be
            retrieved (failures on the datastore request are reported and
            swallowed instead).
    """
    # Request the HTML content
    print(f"Fetching annotations from {tinyhost_link}")
    response = requests.get(tinyhost_link, timeout=timeout)
    response.raise_for_status()

    # Extract the datastore URL
    datastore_url = extract_datastore_url(response.text)
    if not datastore_url:
        print(f"Could not find datastore URL in {tinyhost_link}")
        return {}, tinyhost_link

    # Fetch the datastore content
    print(f"Found datastore URL: {datastore_url}")
    try:
        datastore_response = requests.get(datastore_url, timeout=timeout)
        datastore_response.raise_for_status()
        annotations = datastore_response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures from Response.json().
        print(f"Error fetching datastore from {datastore_url}: {e}")
        return {}, tinyhost_link

    if not isinstance(annotations, dict):
        # Callers iterate the result with .items(); anything else is unusable.
        print(f"Error fetching datastore from {datastore_url}: unexpected payload type {type(annotations).__name__}")
        return {}, tinyhost_link

    return annotations, tinyhost_link
def _pii_result_item(page_id: str, link: str, pdf_path: str, pii_options: Any, other_desc: str) -> Dict[str, Any]:
    """Build the result record for a public/private page from its selected PII types."""
    selected = pii_options or []
    return {
        "page_id": page_id,
        "link": link,
        "pdf_path": pdf_path,
        "pii_types": selected,
        "has_pii": bool(selected),
        # The free-text description only counts when "other" was ticked.
        "description": other_desc if "other" in selected else "",
    }


def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str]]) -> Dict[str, List[Dict[str, Any]]]:
    """Process and categorize annotations by feedback type.

    Args:
        annotations_by_link: ``(annotations, link)`` pairs, where
            ``annotations`` maps a page id to that page's stored annotation.

    Returns:
        A dict with the keys ``public_document``, ``private_document``,
        ``cannot_read``, ``report_content`` and ``no_annotation``; each value
        is a list of records carrying ``page_id``, ``link`` and ``pdf_path``.
        Public/private records additionally carry ``pii_types``, ``has_pii``
        and ``description``. Entries that are not dicts, lack
        ``primaryOption``, or hold an unrecognised option are filed under
        ``no_annotation``.
    """
    results: Dict[str, List[Dict[str, Any]]] = {
        "public_document": [],
        "private_document": [],
        "cannot_read": [],
        "report_content": [],
        "no_annotation": [],
    }

    for annotations, link in annotations_by_link:
        for page_id, annotation in annotations.items():
            # Datastore content is external input: treat anything that is not
            # an annotation object as "not annotated" instead of crashing.
            if not isinstance(annotation, dict):
                results["no_annotation"].append({"page_id": page_id, "link": link, "pdf_path": "Unknown"})
                continue

            pdf_path = annotation.get("pdfPath", "Unknown")
            base_item = {"page_id": page_id, "link": link, "pdf_path": pdf_path}

            if "primaryOption" not in annotation:
                results["no_annotation"].append(base_item)
                continue

            primary_option = annotation["primaryOption"]

            if primary_option == "yes-public":
                results["public_document"].append(
                    _pii_result_item(
                        page_id,
                        link,
                        pdf_path,
                        annotation.get("publicPiiOptions", []),
                        annotation.get("otherPublicDesc", ""),
                    )
                )
            elif primary_option == "no-public":
                results["private_document"].append(
                    _pii_result_item(
                        page_id,
                        link,
                        pdf_path,
                        annotation.get("privatePiiOptions", []),
                        annotation.get("otherPrivateDesc", ""),
                    )
                )
            elif primary_option == "cannot-read":
                results["cannot_read"].append(base_item)
            elif primary_option == "report-content":
                results["report_content"].append(base_item)
            else:
                # Includes a null primaryOption (nothing selected yet).
                results["no_annotation"].append(base_item)

    return results
def _percent_of(part: int, whole: int) -> float:
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def _print_pii_type_counts(title: str, pages_with_pii: List[Dict[str, Any]]) -> None:
    """Print how many of the given pages mention each PII type, most frequent first."""
    counts: Dict[str, int] = {}
    for page in pages_with_pii:
        for pii_type in page.get("pii_types", []):
            counts[pii_type] = counts.get(pii_type, 0) + 1

    print(title)
    for pii_type, count in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
        print(f"- {pii_type}: {count} ({_percent_of(count, len(pages_with_pii)):.1f}%)")


def _print_pii_details(title: str, pages_with_pii: List[Dict[str, Any]]) -> None:
    """Print one numbered entry per page, each followed by a separator line."""
    print(title)
    print("-" * 80)
    for i, item in enumerate(pages_with_pii, 1):
        print(f"{i}. PDF: {item['pdf_path']}")
        print(f"Page ID: {item['page_id']}")
        print(f"Link: {item['link']}#{item['page_id']}")
        print(f"PII Types: {', '.join(item['pii_types'])}")
        if item.get("description"):
            print(f"Description: {item['description']}")
        print("-" * 80)


def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]]):
    """Print a summary report of annotations to stdout.

    Args:
        annotation_results: Mapping with the keys ``public_document``,
            ``private_document``, ``cannot_read``, ``report_content`` and
            ``no_annotation``, each holding a list of page records.

    An empty result set (zero pages in total) is reported with 0.0% shares
    instead of raising ``ZeroDivisionError``.
    """
    total_pages = sum(len(items) for items in annotation_results.values())

    print("\n" + "=" * 80)
    print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
    print("=" * 80)

    public_docs = annotation_results["public_document"]
    private_docs = annotation_results["private_document"]

    # Split public/private documents by whether any PII was flagged.
    public_with_pii = [page for page in public_docs if page.get("has_pii", False)]
    public_without_pii = [page for page in public_docs if not page.get("has_pii", False)]
    private_with_pii = [page for page in private_docs if page.get("has_pii", False)]
    private_without_pii = [page for page in private_docs if not page.get("has_pii", False)]

    # Print summary statistics
    print("\nSummary:")
    print(f"Public documents (total): {len(public_docs)} ({_percent_of(len(public_docs), total_pages):.1f}% of all pages)")
    print(f"- With PII: {len(public_with_pii)} ({_percent_of(len(public_with_pii), len(public_docs)):.1f}% of public docs)")
    print(f"- Without PII: {len(public_without_pii)} ({_percent_of(len(public_without_pii), len(public_docs)):.1f}% of public docs)")
    print(f"Private documents (total): {len(private_docs)} ({_percent_of(len(private_docs), total_pages):.1f}% of all pages)")
    print(f"- With PII: {len(private_with_pii)} ({_percent_of(len(private_with_pii), len(private_docs)):.1f}% of private docs)")
    print(f"- Without PII: {len(private_without_pii)} ({_percent_of(len(private_without_pii), len(private_docs)):.1f}% of private docs)")

    unreadable = annotation_results["cannot_read"]
    reported = annotation_results["report_content"]
    unannotated = annotation_results["no_annotation"]
    print(f"Unreadable pages: {len(unreadable)} ({_percent_of(len(unreadable), total_pages):.1f}%)")
    print(f"Pages with reported content: {len(reported)} ({_percent_of(len(reported), total_pages):.1f}%)")
    print(f"Pages without annotation: {len(unannotated)} ({_percent_of(len(unannotated), total_pages):.1f}%)")

    # Per-type breakdowns come first, then the page-by-page listings.
    if public_with_pii:
        _print_pii_type_counts("\nPII Types in Public Documents:", public_with_pii)
    if private_with_pii:
        _print_pii_type_counts("\nPII Types in Private Documents:", private_with_pii)

    if public_with_pii:
        _print_pii_details("\nDetailed Report - Public Documents with PII:", public_with_pii)
    if private_with_pii:
        _print_pii_details("\nDetailed Report - Private Documents with PII:", private_with_pii)

    print("\nReport complete.")
def read_and_process_results(args):
    """Read tinyhost links from a previous run and report on their annotations.

    The file at ``args.read_results`` is read as plain text with one tinyhost
    link per line (blank lines are ignored). Annotations are fetched for each
    link, categorized, summarized on stdout, and written item by item to
    ``<args.output_dir>/annotation_report.csv``.

    Args:
        args: Parsed command-line namespace. Only ``read_results`` and
            ``output_dir`` are used here.

    Returns:
        None. Errors are reported on stdout instead of being raised: a link
        whose annotations cannot be fetched is skipped, and any other failure
        aborts the run with an "Error processing results" message.
    """
    try:
        # One link per line; drop surrounding whitespace and blank lines.
        with open(args.read_results, "r") as f:
            links = [line.strip() for line in f if line.strip()]

        if not links:
            print(f"No tinyhost links found in {args.read_results}")
            return

        print(f"Found {len(links)} tinyhost links in {args.read_results}")

        # Fetch annotations link by link; one bad link must not sink the rest.
        annotations_by_link = []
        for link in tqdm(links, desc="Fetching annotations"):
            try:
                annotations, link_url = fetch_annotations(link)
                annotations_by_link.append((annotations, link_url))
            except Exception as e:
                print(f"Error processing {link}: {e}")

        # Categorize the annotations and print the human-readable summary.
        annotation_results = process_annotations(annotations_by_link)
        print_annotation_report(annotation_results)

        # The output directory is only created by main() on the generation
        # path, so make sure it exists before writing the report here.
        output_file = Path(args.output_dir) / "annotation_report.csv"
        output_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"\nSaving detailed report to {output_file}")

        # Only these two categories carry document type / PII details.
        doc_type_labels = {"public_document": "Public", "private_document": "Private"}

        # Descriptions are free text, so pin the encoding instead of relying
        # on the platform default.
        with open(output_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Document Type", "PII Types", "Description"])

            for category, items in annotation_results.items():
                doc_type = doc_type_labels.get(category, "")
                for item in items:
                    if doc_type:
                        pii_types = ", ".join(item.get("pii_types", []))
                        description = item.get("description", "")
                    else:
                        pii_types = ""
                        description = ""

                    writer.writerow(
                        [
                            category,
                            item["pdf_path"],
                            item["page_id"],
                            f"{item['link']}#{item['page_id']}",
                            doc_type,
                            pii_types,
                            description,
                        ]
                    )

        print(f"Report saved to {output_file}")

    except Exception as e:
        # Top-level boundary for the --read_results mode: report and return.
        print(f"Error processing results: {e}")
2025-03-26 18:26:06 +00:00
def main():
    """Command-line entry point.

    With ``--read_results`` set, the previously written links file is
    processed and nothing else happens. Otherwise the workspace's result
    files are listed, ``args.repeats`` sample sets are generated (in a thread
    pool when there is more than one), each generated file is uploaded with
    tinyhost, and the resulting links are written one per line to
    ``args.prolific_csv``.
    """
    args = parse_args()

    # Check if we're reading results from a previous run
    if args.read_results:
        read_and_process_results(args)
        return

    # Set up S3 clients; PDFs may live behind a different AWS profile.
    s3_client = boto3.client("s3")
    if args.pdf_profile:
        pdf_s3_client = boto3.Session(profile_name=args.pdf_profile).client("s3")
    else:
        pdf_s3_client = s3_client

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # List all result files
    print(f"Listing result files in {args.workspace}/results...")
    result_files = list_result_files(s3_client, args.workspace)
    print(f"Found {len(result_files)} result files")

    output_files = []
    if args.repeats > 1:
        # ThreadPoolExecutor rejects max_workers <= 0, so never go below one.
        worker_count = max(1, min(args.max_workers, args.repeats))
        print(f"Using ThreadPoolExecutor with {worker_count} workers")
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            futures = [
                executor.submit(generate_sample_set, args, i, s3_client, pdf_s3_client, result_files)
                for i in range(args.repeats)
            ]

            # Collect in submission order; a failed set is reported and skipped.
            for future in futures:
                try:
                    output_filename = future.result()
                    output_files.append(output_filename)
                    print(f"Completed generation of {output_filename}")
                except Exception as e:
                    print(f"Error generating sample set: {e}")
    else:
        # If only one repeat, just run it directly
        output_filename = generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files)
        output_files.append(output_filename)

    # Now upload each resulting file into tinyhost
    print("Generated all files, uploading tinyhost links now")
    links = []
    for output_filename in output_files:
        # tinyhost is given a one-element list; the first returned entry is
        # used as the link for this file.
        link = tinyhost.tinyhost([str(output_filename)])[0]
        links.append(link)
        print(link)

    # Create CSV file with just the tinyhost links, one per line
    csv_path = args.prolific_csv
    print(f"Writing tinyhost links to {csv_path}")
    with open(csv_path, "w", newline="") as csvfile:
        csvfile.writelines(f"{link}\n" for link in links)

    print(f"Tinyhost links written to {csv_path}")
2025-03-26 18:26:06 +00:00
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()