2025-03-26 18:26:06 +00:00
import argparse
2025-04-04 17:12:46 +00:00
import base64
import csv
2025-03-26 18:49:48 +00:00
import datetime
2025-03-26 18:26:06 +00:00
import json
import os
import random
import re
import sqlite3
2025-03-26 18:49:48 +00:00
import tempfile
from concurrent . futures import ThreadPoolExecutor
2025-03-26 18:26:06 +00:00
from pathlib import Path
2025-04-04 19:44:54 +00:00
from typing import Any , Dict , List , Optional , Tuple
2025-03-26 18:49:48 +00:00
import boto3
2025-04-04 19:44:54 +00:00
import requests
2025-04-04 17:18:19 +00:00
import tinyhost
2025-03-26 18:26:06 +00:00
from tqdm import tqdm
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
from olmocr . data . renderpdf import render_pdf_to_base64webp
2025-03-26 18:49:48 +00:00
from olmocr . s3_utils import get_s3_bytes , parse_s3_path
2025-03-26 18:26:06 +00:00
def parse_args():
    """Parse the command-line arguments for the sampling script.

    One positional argument (``workspace``) is required, as is the
    ``--prolific_code`` option; every other option has a default or is
    optional.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(description="Scan OLMO OCR workspace results and create visual samples")

    # Declared in display order: argparse lists options in --help in the
    # order they are registered, so the sequence below is significant.
    option_specs = [
        ("workspace", {"help": "OLMO OCR workspace path (s3://bucket/workspace)"}),
        ("--pages_per_output", {"type": int, "default": 30, "help": "Number of pages per output file"}),
        ("--repeats", {"type": int, "default": 1, "help": "Number of output files to generate"}),
        ("--pdf_profile", {"help": "AWS profile for accessing PDFs"}),
        ("--output_dir", {"default": "dolma_samples", "help": "Directory to save output HTML files"}),
        ("--max_workers", {"type": int, "default": 4, "help": "Maximum number of worker threads"}),
        (
            "--db_path",
            {
                "default": "~/s2pdf_url_data/d65142df-6588-4b68-a12c-d468b3761189.csv.db",
                "help": "Path to the SQLite database containing PDF hash to URL mapping",
            },
        ),
        (
            "--prolific_code",
            {
                "required": True,
                "help": "Fixed completion code to use for all outputs",
            },
        ),
        (
            "--prolific_csv",
            {
                "default": "prolific_codes.csv",
                "help": "Path to save the file with tinyhost links (one URL per line)",
            },
        ),
        (
            "--read_results",
            {
                "help": "Path to a CSV file containing previously generated tinyhost links to extract annotations",
            },
        ),
    ]
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
2025-03-26 18:49:48 +00:00
2025-04-07 21:39:55 +00:00
# Fixed prolific code is now passed in as a command line argument
2025-04-04 17:12:46 +00:00
def obfuscate_code(code):
    """Lightly obscure the Prolific code so it is not readable at a glance.

    The code is base64-encoded and the resulting text is reversed. This is
    obfuscation only, not encryption.

    Args:
        code: The plain-text completion code (a ``str``).

    Returns:
        The reversed base64 text as a ``str``.
    """
    b64_text = base64.b64encode(code.encode()).decode()
    # Flip the character order so the base64 payload is not directly decodable.
    return "".join(reversed(b64_text))
def deobfuscate_code(obfuscated_code):
    """Reverse the obfuscation applied by ``obfuscate_code``.

    The input is reversed and then base64-decoded. The existing note on this
    helper says the same decoding is performed in JavaScript on the page;
    this Python version mirrors it.

    Args:
        obfuscated_code: Reversed base64 text.

    Returns:
        The decoded plain-text code, or the sentinel string
        ``"ERROR_DECODING"`` when the input is not valid base64 or does not
        decode to UTF-8.
    """
    reversed_encoded = obfuscated_code[::-1]
    try:
        return base64.b64decode(reversed_encoded).decode()
    except ValueError:
        # binascii.Error (malformed base64) and UnicodeDecodeError (bytes that
        # are not UTF-8) both derive from ValueError. Catching only that keeps
        # KeyboardInterrupt/SystemExit from being swallowed by a bare except.
        return "ERROR_DECODING"
2025-03-26 18:26:06 +00:00
def parse_pdf_hash(pretty_pdf_path: str) -> Optional[str]:
    """Extract the PDF hash from an ``s3://ai2-s2-pdfs/...`` path.

    The path is expected to start with
    ``s3://ai2-s2-pdfs/<4 hex chars>/<hex chars>.pdf``; the two hex groups
    are concatenated to form the hash.

    Args:
        pretty_pdf_path: The S3 path of the PDF.

    Returns:
        The concatenated hex string, or ``None`` when the path does not
        match the expected layout.
    """
    found = re.match(r"s3://ai2-s2-pdfs/([a-f0-9]{4})/([a-f0-9]+)\.pdf", pretty_pdf_path)
    if found is None:
        return None
    dir_part, file_part = found.groups()
    return dir_part + file_part
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
def get_original_url(pdf_hash: str, db_path: str) -> Optional[str]:
    """Look up the original URL for a PDF hash in the SQLite database.

    Args:
        pdf_hash: Hash to look up in the ``pdf_hash`` column of the
            ``pdf_mapping`` table. An empty/falsy value short-circuits to
            ``None``.
        db_path: Path to the SQLite file; ``~`` is expanded.

    Returns:
        The ``uri`` value of the matching row, or ``None`` when the hash is
        empty, the database file is missing, no row matches, or any error
        occurs during the lookup (the error is printed, not raised).
    """
    if not pdf_hash:
        return None

    try:
        sqlite_db_path = os.path.expanduser(db_path)
        if not os.path.exists(sqlite_db_path):
            print(f"SQLite database not found at {sqlite_db_path}")
            return None

        conn = sqlite3.connect(sqlite_db_path)
        try:
            result = conn.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,)).fetchone()
        finally:
            # Always release the connection, including when the query raises
            # (e.g. missing table); otherwise the handle leaks.
            conn.close()

        return result[0] if result else None
    except Exception as e:
        # Best-effort lookup: a failed lookup must not abort page rendering.
        print(f"Error looking up URL for PDF hash {pdf_hash}: {e}")
        return None
def list_result_files(s3_client, workspace_path):
    """List all JSON result files in the workspace results directory.

    Args:
        s3_client: Client object exposing ``get_paginator("list_objects_v2")``.
        workspace_path: Workspace location, split into bucket and key prefix
            by ``parse_s3_path``.

    Returns:
        A list of ``s3://<bucket>/<key>`` strings for every listed object
        under ``<prefix>/results/`` whose key ends in ``.jsonl`` or ``.json``.
    """
    # S3 keys always use "/" as the separator, so build the prefix with
    # posixpath rather than os.path (which follows the host OS separator).
    import posixpath

    bucket, prefix = parse_s3_path(workspace_path)
    results_prefix = posixpath.join(prefix, "results").rstrip("/") + "/"

    all_files = []
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=results_prefix):
        # A page has no "Contents" entry when nothing matched the prefix.
        all_files.extend(
            f"s3://{bucket}/{obj['Key']}"
            for obj in page.get("Contents", [])
            if obj["Key"].endswith((".jsonl", ".json"))
        )

    return all_files
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
def get_random_pages(s3_client, result_files, count=30):
    """Get random pages from the result files.

    Repeatedly picks a random result file, a random non-blank line in it
    (each line is parsed as one JSON document), and a random page span from
    that document, until ``count`` pages are collected or ``count * 3``
    attempts have been made.

    Args:
        s3_client: Client passed through to ``get_s3_bytes``.
        result_files: Sequence of result-file paths to sample from.
        count: Number of pages wanted.

    Returns:
        A list of ``(pdf_path, page_num, page_text, result_file)`` tuples;
        it may be shorter than ``count`` when attempts run out or there are
        no result files.
    """
    random_pages = []

    # Try to collect the requested number of pages
    attempts = 0
    max_attempts = count * 3  # Allow extra attempts to handle potential failures

    while len(random_pages) < count and attempts < max_attempts:
        attempts += 1

        # Nothing to sample from: report and stop.
        if not result_files:
            print("No result files found!")
            break

        result_file = random.choice(result_files)

        try:
            # Get the content of the file
            content = get_s3_bytes(s3_client, result_file)

            # str.split never returns an empty list, so filter out blank lines
            # explicitly; otherwise an empty file or trailing blank line would
            # reach json.loads and be reported as a processing error.
            lines = [ln for ln in content.decode("utf-8").split("\n") if ln.strip()]
            if not lines:
                continue

            # Pick a random line (which contains a complete document)
            doc = json.loads(random.choice(lines))

            # A Dolma document has "text", "metadata", and "attributes" fields
            if "text" not in doc or "metadata" not in doc or "attributes" not in doc:
                print(f"Document in {result_file} is not a valid Dolma document")
                continue

            # Get the original PDF path from metadata
            pdf_path = doc["metadata"].get("Source-File")
            if not pdf_path:
                continue

            # Get page spans from attributes
            page_spans = doc["attributes"].get("pdf_page_numbers", [])
            if not page_spans:
                continue

            # Pick a random page span; spans are [start_pos, end_pos, page_num]
            page_span = random.choice(page_spans)
            if len(page_span) >= 3:
                start_pos, end_pos, page_num = page_span[0], page_span[1], page_span[2]

                # Extract text for this page
                page_text = doc["text"][start_pos:end_pos].strip()

                # Include the text snippet with the page info
                random_pages.append((pdf_path, page_num, page_text, result_file))

        except Exception as e:
            # Best-effort sampling: a bad file or document costs one attempt.
            print(f"Error processing {result_file}: {e}")
            continue

    print(f"Found {len(random_pages)} random pages from Dolma documents")
    return random_pages
2025-03-26 18:49:48 +00:00
def create_presigned_url(s3_client, pdf_path, expiration=3600 * 24 * 7):
    """Create a presigned ``get_object`` URL for the given S3 path.

    Args:
        s3_client: Client object exposing ``generate_presigned_url``.
        pdf_path: S3 path, split into bucket and key by ``parse_s3_path``.
        expiration: Value passed as ``ExpiresIn``; defaults to
            ``3600 * 24 * 7``.

    Returns:
        The presigned URL, or ``None`` if anything raised (the error is
        printed rather than propagated).
    """
    try:
        bucket_name, object_key = parse_s3_path(pdf_path)
        request_params = {"Bucket": bucket_name, "Key": object_key}
        return s3_client.generate_presigned_url("get_object", Params=request_params, ExpiresIn=expiration)
    except Exception as e:
        print(f"Error creating presigned URL for {pdf_path}: {e}")
        return None
2025-03-26 18:49:48 +00:00
2025-04-04 17:12:46 +00:00
def create_html_output ( random_pages , pdf_s3_client , output_path , workspace_path , db_path , prolific_code , resolution = 2048 ) :
2025-03-26 18:26:06 +00:00
""" Create an HTML file with rendered PDF pages. """
2025-04-04 17:12:46 +00:00
# Obfuscate the provided Prolific code
obfuscated_code = obfuscate_code ( prolific_code )
2025-04-04 17:18:19 +00:00
2025-03-26 18:26:06 +00:00
# Get current date and time for the report
current_time = datetime . datetime . now ( ) . strftime ( " % Y- % m- %d % H: % M: % S " )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
html_content = f """
< ! DOCTYPE html >
< html lang = " en " >
< head >
< meta charset = " UTF-8 " >
< meta name = " viewport " content = " width=device-width, initial-scale=1.0 " >
< title > OLMO OCR Samples < / title >
< link href = " https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap " rel = " stylesheet " >
< style >
: root { {
- - primary - color : #2563eb;
- - secondary - color : #4b5563;
- - border - color : #e5e7eb;
- - bg - color : #f9fafb;
- - text - color : #111827;
- - text - light : #6b7280;
- - card - shadow : 0 4 px 6 px - 1 px rgba ( 0 , 0 , 0 , 0.1 ) , 0 2 px 4 px - 1 px rgba ( 0 , 0 , 0 , 0.06 ) ;
2025-04-04 16:19:04 +00:00
- - success - color : #10b981;
2025-04-22 21:33:39 +00:00
- - overlay - bg : rgba ( 0 , 0 , 0 , 0.7 ) ;
2025-03-26 18:26:06 +00:00
} }
* { {
box - sizing : border - box ;
margin : 0 ;
padding : 0 ;
} }
body { {
font - family : ' Inter ' , sans - serif ;
line - height : 1.6 ;
color : var ( - - text - color ) ;
background - color : var ( - - bg - color ) ;
padding : 2 rem ;
2025-04-08 20:50:00 +00:00
display : flex ;
flex - direction : row ;
gap : 2 rem ;
2025-03-26 18:26:06 +00:00
} }
2025-04-07 20:27:32 +00:00
ul { {
margin - left : 2 em ;
} }
2025-04-14 19:07:13 +00:00
ol { {
margin - left : 2 em ;
} }
. highlight { {
background - color : #f8f9fa;
border - left : 3 px solid #3498db;
padding : 10 px 15 px ;
margin : 15 px 0 ;
} }
2025-03-26 18:26:06 +00:00
. container { {
2025-04-08 20:50:00 +00:00
flex : 2 ;
max - width : 750 px ;
2025-03-26 18:26:06 +00:00
} }
header { {
2025-04-08 20:50:00 +00:00
position : sticky ;
top : 2 rem ;
flex : 1 ;
min - width : 380 px ;
max - width : 420 px ;
max - height : calc ( 100 vh - 4 rem ) ;
overflow - y : auto ;
padding : 1.5 rem ;
background - color : white ;
border - radius : 0.5 rem ;
box - shadow : var ( - - card - shadow ) ;
align - self : flex - start ;
2025-04-07 20:27:32 +00:00
font - size : small ;
2025-03-26 18:26:06 +00:00
} }
2025-04-07 20:27:32 +00:00
header h2 { {
margin - top : 1 em ;
2025-03-26 18:26:06 +00:00
} }
2025-04-07 20:27:32 +00:00
2025-04-14 23:27:27 +00:00
. important { {
font - weight : bold ;
} }
2025-03-26 18:26:06 +00:00
. info - bar { {
background - color : white ;
padding : 1 rem ;
border - radius : 0.5 rem ;
margin - bottom : 2 rem ;
box - shadow : var ( - - card - shadow ) ;
display : flex ;
justify - content : space - between ;
flex - wrap : wrap ;
gap : 1 rem ;
} }
. info - item { {
flex : 1 ;
min - width : 200 px ;
} }
. info - item h3 { {
2025-04-01 20:03:15 +00:00
font - size : 0.6 rem ;
2025-03-26 18:26:06 +00:00
color : var ( - - text - light ) ;
margin - bottom : 0.25 rem ;
} }
. info - item p { {
2025-04-01 20:03:15 +00:00
font - size : 0.6 rem ;
2025-03-26 18:26:06 +00:00
} }
. page - grid { {
display : grid ;
2025-04-08 20:50:00 +00:00
grid - template - columns : 1 fr ;
2025-03-26 18:26:06 +00:00
gap : 2 rem ;
} }
. page - container { {
background - color : white ;
border - radius : 0.5 rem ;
overflow : hidden ;
box - shadow : var ( - - card - shadow ) ;
2025-04-04 19:38:59 +00:00
transition : all 0.3 s ease ;
} }
. page - container . editing { {
box - shadow : 0 0 0 3 px var ( - - primary - color ) , var ( - - card - shadow ) ;
2025-03-26 18:26:06 +00:00
} }
. page - info { {
padding : 1 rem ;
border - bottom : 1 px solid var ( - - border - color ) ;
} }
. page - info h2 { {
font - size : 1 rem ;
margin - bottom : 0.5 rem ;
white - space : nowrap ;
overflow : hidden ;
text - overflow : ellipsis ;
} }
. page - info p { {
font - size : 0.875 rem ;
color : var ( - - text - light ) ;
} }
. page - image - wrapper { {
padding : 1 rem ;
display : flex ;
justify - content : center ;
align - items : center ;
background - color : #f3f4f6;
} }
. page - image { {
max - width : 100 % ;
height : auto ;
border : 1 px solid var ( - - border - color ) ;
} }
. s3 - link { {
padding : 1 rem ;
background - color : #f8fafc;
border - top : 1 px solid var ( - - border - color ) ;
font - size : 0.875 rem ;
color : var ( - - secondary - color ) ;
word - break : break - all ;
} }
. s3 - link a { {
color : var ( - - primary - color ) ;
text - decoration : none ;
font - weight : 500 ;
} }
. s3 - link a : hover { {
text - decoration : underline ;
} }
2025-04-04 16:19:04 +00:00
/ * Annotation elements * /
. annotation - interface { {
display : none ; / * Hide annotation interface by default * /
margin - top : 1 rem ;
padding : 0.5 rem ;
border - top : 1 px solid var ( - - border - color ) ;
border - radius : 0.25 rem ;
background - color : #f8fafc;
} }
. annotation - interface . active { {
display : block ; / * Show only the active annotation interface * /
} }
2025-04-08 21:04:56 +00:00
. question - container { {
margin - bottom : 1 rem ;
} }
. question - text { {
font - weight : 500 ;
margin - bottom : 0.5 rem ;
} }
2025-04-04 16:19:04 +00:00
/ * Button group styling for connected buttons * /
2025-04-01 20:03:15 +00:00
. btn - group { {
display : inline - flex ;
margin - bottom : 0.5 rem ;
} }
. btn - group . toggle - button { {
padding : 0.5 rem 1 rem ;
border : 1 px solid var ( - - border - color ) ;
background - color : #f8fafc;
cursor : pointer ;
margin : 0 ;
/ * Remove individual border radius so we can set unified ones * /
border - radius : 0 ;
} }
. btn - group . toggle - button : first - child { {
border - right : none ;
border - top - left - radius : 0.25 rem ;
border - bottom - left - radius : 0.25 rem ;
} }
. btn - group . toggle - button : last - child { {
border - top - right - radius : 0.25 rem ;
border - bottom - right - radius : 0.25 rem ;
} }
2025-04-04 16:19:04 +00:00
. btn - group . toggle - button : not ( : first - child ) : not ( : last - child ) { {
border - right : none ;
2025-04-01 20:03:15 +00:00
} }
. toggle - button . active { {
background - color : var ( - - primary - color ) ;
color : white ;
2025-04-01 18:35:04 +00:00
} }
2025-04-08 21:04:56 +00:00
. checkbox - group { {
display : flex ;
flex - wrap : wrap ;
gap : 0.5 rem ;
margin - bottom : 1 rem ;
} }
. checkbox - group label { {
display : flex ;
align - items : center ;
padding : 0.25 rem 0.5 rem ;
background - color : #f1f5f9;
border - radius : 0.25 rem ;
cursor : pointer ;
font - size : 0.875 rem ;
2025-04-14 20:27:06 +00:00
border - left : 3 px solid transparent ;
2025-04-08 21:04:56 +00:00
} }
. checkbox - group label : hover { {
background - color : #e2e8f0;
} }
. checkbox - group input [ type = " checkbox " ] { {
margin - right : 0.5 rem ;
} }
2025-04-14 20:27:06 +00:00
/ * Styling for checkbox groups with headings * /
. question - container h4 { {
margin - bottom : 0.5 rem ;
font - weight : 600 ;
font - size : 0.9 rem ;
border - bottom : 1 px solid #e5e7eb;
padding - bottom : 0.25 rem ;
} }
/ * Slightly different styling for each group * /
. question - container h4 : nth - of - type ( 1 ) + . checkbox - group label { {
border - left - color : #3b82f6; /* Blue for identifiers */
} }
. question - container h4 : nth - of - type ( 2 ) + . checkbox - group label { {
border - left - color : #10b981; /* Green for PII with identifier */
} }
. question - container h4 : nth - of - type ( 3 ) + . checkbox - group label { {
border - left - color : #f59e0b; /* Amber for always-PII */
} }
2025-04-08 21:04:56 +00:00
. continue - button { {
padding : 0.5 rem 1 rem ;
background - color : var ( - - primary - color ) ;
color : white ;
border : none ;
border - radius : 0.25 rem ;
cursor : pointer ;
font - weight : 500 ;
} }
. continue - button : hover { {
background - color : #1d4ed8;
} }
2025-04-04 16:19:04 +00:00
. annotation - interface textarea { {
display : none ; / * Hide textarea by default * /
2025-04-01 18:35:04 +00:00
width : 100 % ;
margin - top : 0.5 rem ;
2025-04-08 21:04:56 +00:00
margin - bottom : 1 rem ;
2025-04-01 18:35:04 +00:00
padding : 0.5 rem ;
font - size : 0.875 rem ;
border : 1 px solid var ( - - border - color ) ;
border - radius : 0.25 rem ;
} }
2025-04-04 16:19:04 +00:00
. annotation - status { {
display : inline - block ;
margin - left : 1 rem ;
padding : 0.25 rem 0.5 rem ;
border - radius : 0.25 rem ;
font - size : 0.75 rem ;
font - weight : 600 ;
} }
. status - complete { {
background - color : #ecfdf5;
color : var ( - - success - color ) ;
2025-04-04 19:38:59 +00:00
cursor : pointer ;
transition : all 0.2 s ease ;
} }
. status - complete : hover { {
background - color : #d1fae5;
box - shadow : 0 0 0 2 px rgba ( 16 , 185 , 129 , 0.3 ) ;
2025-04-04 16:19:04 +00:00
} }
. status - pending { {
background - color : #fff7ed;
color : #ea580c;
} }
. status - current { {
background - color : #eff6ff;
color : var ( - - primary - color ) ;
animation : pulse 2 s infinite ;
} }
@keyframes pulse { {
0 % { { opacity : 0.6 ; } }
50 % { { opacity : 1 ; } }
100 % { { opacity : 0.6 ; } }
} }
2025-03-26 18:26:06 +00:00
. error { {
color : #dc2626;
padding : 1 rem ;
background - color : #fee2e2;
border - radius : 0.25 rem ;
} }
2025-04-04 21:41:36 +00:00
2025-04-04 16:19:04 +00:00
. completion - message { {
display : none ;
margin : 2 rem auto ;
padding : 1.5 rem ;
background - color : #ecfdf5;
border : 1 px solid #A7F3D0;
border - radius : 0.5 rem ;
text - align : center ;
color : var ( - - success - color ) ;
font - weight : 600 ;
max - width : 500 px ;
} }
2025-03-26 18:26:06 +00:00
footer { {
margin - top : 3 rem ;
text - align : center ;
color : var ( - - text - light ) ;
font - size : 0.875 rem ;
border - top : 1 px solid var ( - - border - color ) ;
padding - top : 1 rem ;
} }
2025-04-22 21:33:39 +00:00
/ * Instructions Modal * /
. instructions - modal - overlay { {
position : fixed ;
top : 0 ;
left : 0 ;
right : 0 ;
bottom : 0 ;
background - color : var ( - - overlay - bg ) ;
display : flex ;
align - items : center ;
justify - content : center ;
z - index : 1000 ;
opacity : 0 ;
visibility : hidden ;
transition : opacity 0.3 s ease , visibility 0.3 s ease ;
backdrop - filter : blur ( 3 px ) ;
} }
. instructions - modal - overlay . visible { {
opacity : 1 ;
visibility : visible ;
} }
. instructions - modal { {
background - color : white ;
border - radius : 8 px ;
width : 90 % ;
max - width : 1000 px ;
max - height : 90 vh ;
overflow - y : auto ;
padding : 2 rem ;
box - shadow : 0 10 px 25 px rgba ( 0 , 0 , 0 , 0.2 ) ;
position : relative ;
animation : modalAppear 0.3 s ease ;
} }
@keyframes modalAppear { {
from { {
opacity : 0 ;
transform : translateY ( - 20 px ) ;
} }
to { {
opacity : 1 ;
transform : translateY ( 0 ) ;
} }
} }
. instructions - modal - header { {
margin - bottom : 1.5 rem ;
text - align : center ;
} }
. instructions - modal - header h2 { {
font - size : 1.5 rem ;
color : var ( - - primary - color ) ;
margin - bottom : 0.5 rem ;
} }
. instructions - modal - content { {
margin - bottom : 2 rem ;
overflow - y : auto ;
max - height : 60 vh ;
padding - right : 10 px ;
border - radius : 4 px ;
scrollbar - width : thin ;
} }
/ * Scrollbar styling for webkit browsers * /
. instructions - modal - content : : - webkit - scrollbar { {
width : 8 px ;
} }
. instructions - modal - content : : - webkit - scrollbar - track { {
background : #f1f1f1;
border - radius : 10 px ;
} }
. instructions - modal - content : : - webkit - scrollbar - thumb { {
background : #c0c0c0;
border - radius : 10 px ;
} }
. instructions - modal - content : : - webkit - scrollbar - thumb : hover { {
background : #a0a0a0;
} }
/ * Styling for the cloned sidebar content in the modal * /
. instructions - modal - content header { {
position : static ;
min - width : unset ;
max - width : unset ;
max - height : unset ;
overflow - y : visible ;
padding : 0 ;
background - color : transparent ;
border - radius : 0 ;
box - shadow : none ;
align - self : auto ;
font - size : inherit ;
} }
. instructions - modal - footer { {
text - align : center ;
} }
. instructions - modal - button { {
padding : 0.75 rem 2 rem ;
background - color : var ( - - primary - color ) ;
color : white ;
border : none ;
border - radius : 4 px ;
font - size : 1 rem ;
font - weight : 600 ;
cursor : pointer ;
transition : background - color 0.2 s ease ;
} }
. instructions - modal - button : hover { {
background - color : #1d4ed8;
} }
. instructions - modal - button : disabled { {
background - color : #9cb3f0;
cursor : not - allowed ;
opacity : 0.7 ;
} }
2025-03-26 18:26:06 +00:00
@media ( max - width : 768 px ) { {
body { {
padding : 1 rem ;
2025-04-08 20:50:00 +00:00
flex - direction : column ;
} }
header { {
position : static ;
max - width : 100 % ;
margin - left : 0 ;
margin - bottom : 2 rem ;
2025-03-26 18:26:06 +00:00
} }
2025-04-08 20:50:00 +00:00
. container { {
max - width : 100 % ;
2025-03-26 18:26:06 +00:00
} }
2025-04-22 21:33:39 +00:00
. instructions - modal { {
padding : 1.5 rem ;
width : 95 % ;
} }
2025-03-26 18:26:06 +00:00
} }
< / style >
< / head >
< body >
2025-04-08 20:50:00 +00:00
< header >
2025-04-14 19:07:13 +00:00
< h2 > Task Overview < / h2 >
2025-04-14 20:27:06 +00:00
< p > In this task , you will review { len ( random_pages ) } document pages and determine whether they contain any < span class = " important " > Personally Identifiable Information ( PII ) < / span > . For each page , please follow the decision flow outlined in the " How to Annotate " section below . < / p >
2025-04-14 19:07:13 +00:00
< p > Carefully but efficiently inspect each page and select the appropriate response . You do < span class = " important " > not < / span > need to read every word . Instead , focus on ascertaining the document ' s intended use and spotting information that would qualify as PII.</p>
2025-04-22 16:16:21 -07:00
< p > The entire task should take about < span class = " important " > 20 - 25 minutes < / span > . < / p >
2025-04-08 20:50:00 +00:00
2025-04-22 21:33:39 +00:00
< button id = " view-instructions-button " style = " background-color: var(--primary-color); color: white; border: none; border-radius: 4px; padding: 0.5rem 1rem; margin: 1rem 0; cursor: pointer; " > View Instructions Popup < / button >
2025-04-08 20:50:00 +00:00
< h2 > How to Annotate < / h2 >
2025-04-22 16:16:21 -07:00
< p > The current annotation will be highlighted with a blue outline and a set of response buttons will be displayed directly below the page preview . If you are having trouble viewing the displayed page , click the “ View Cached PDF ” link for a better look . However , < span class = " important " > DO NOT < / span > examine the entire document ; < span class = " important " > ONLY < / span > review the single page being previewed ( also indicated in the parentheses after “ Viewed Cached PDF ” ) . < / p >
< p > For each page , complete the following steps : < / p >
2025-04-14 19:07:13 +00:00
< ol >
< li >
< p > < span class = " important " > Determine if the document is intended for public release . < / span > < / p >
< p > Inspect the page and answer : " Is this document intended for public release or dissemination? " < / p >
< ul >
< li > < strong > Yes < / strong > - If the document appears to be a publication , research paper , public information , etc . < / li >
< li > < strong > No < / strong > - If the document appears to be private , personal , or not intended for public release < / li >
< li > < strong > Cannot Read < / strong > - If you are unable to read the page ( e . g . , foreign language , no text , etc . ) < / li >
< li > < strong > Report Content < / strong > - If the content is inappropriate or disturbing < / li >
< / ul >
2025-04-14 20:27:06 +00:00
< p > If you selected " Yes, " " Cannot Read, " or " Report Content, " you will automatically move to the next document . If you selected " No, " proceed to Step 2. < / p >
2025-04-14 19:07:13 +00:00
< / li >
< li >
2025-04-14 20:27:06 +00:00
< p > < span class = " important " > Identify the kind of PII found in the private document ( if any ) . < / span > < / p >
2025-04-14 19:07:13 +00:00
< p > You will be shown a checklist with a set of PII options . < / p >
< ul >
< li > Refer to the " How to Identify PII " section below and mark all options that apply . < / li >
< li > If you select " Other, " describe the kind of other PII in the expanded text box . < / li >
< / ul >
< / li >
< li >
< p > < span class = " important " > Press the blue Continue button to complete your annotation . < / span > < / p >
< p > You will automatically be moved to the next annotation . < / p >
< / li >
< / ol >
2025-04-08 21:04:56 +00:00
2025-04-22 16:16:21 -07:00
< p > < span class = " important " > Note < / span > : If you cannot confidently tell that a page is private , treat it as public and do not mark any PII you are unsure about . We anticipate very few private pages or instances of PII in these documents , so erring towards public and no PII minimizes false positives and keeps the review process consistent . < / p >
2025-04-14 19:07:13 +00:00
< p > You may review and edit your previous annotations at any time . To do so , press the green Edit button directly above the page preview for the annotation you want to edit . < / p >
2025-04-14 20:27:06 +00:00
< p > After completing all { len ( random_pages ) } document pages , you will receive a Prolific completion code . < / p >
2025-04-08 20:50:00 +00:00
2025-04-14 19:07:13 +00:00
< h2 > How to Identify PII < / h2 >
2025-04-08 20:50:00 +00:00
2025-04-14 23:21:28 +00:00
< h3 style = " color: #3b82f6; " > Identifiers for PII < / h3 >
2025-04-14 19:07:13 +00:00
< p > Some personal information needs to be accompanied by an < span class = " important " > identifier < / span > to be considered PII . Identifiers that trigger PII include : < / p >
2025-04-08 20:50:00 +00:00
< ul >
2025-04-22 16:16:21 -07:00
< li > Names ( full names , first / last names , maiden names , nicknames , aliases ) < / li >
2025-04-14 19:07:13 +00:00
< li > Email Addresses < / li >
< li > Phone Numbers < / li >
2025-04-08 20:50:00 +00:00
< / ul >
2025-04-22 16:16:21 -07:00
< p > Note that the reverse is also true - an identifier must be accompanied by additional personal information or another identifier ( e . g . , name + email address ) to be considered PII . < / p >
2025-04-14 23:21:28 +00:00
< br / >
2025-04-08 20:50:00 +00:00
2025-04-14 23:21:28 +00:00
< h3 style = " color: #10b981; " > PII that must co - occur with an Identifier < / h3 >
2025-04-14 19:07:13 +00:00
< div class = " highlight " >
< p > The following types of information should < span class = " important " > only < / span > be marked as PII if they occur < span class = " important " > alongside an identifier < / span > ( commonly , a person ' s name):</p>
< ul >
< li > Addresses ( street address , postal code , etc . ) < / li >
< li > Biographical Information ( date of birth , place of birth , gender , sexual orientation , race , ethnicity , citizenship / immigration status , religion ) < / li >
< li > Location Information ( geolocations , specific coordinates ) < / li >
< li > Employment Information ( job titles , workplace names , employment history ) < / li >
< li > Education Information ( school names , degrees , transcripts ) < / li >
2025-04-14 20:27:06 +00:00
< li > Medical Information ( health records , diagnoses , genetic or neural data ) < / li >
2025-04-14 19:07:13 +00:00
< / ul >
< / div >
2025-04-22 16:16:21 -07:00
< p > For example , a street address might be personal information , but is not PII by itself ; however , a street address associated with a name < span class = " important " > is < / span > regulated PII . < / p >
2025-04-14 19:07:13 +00:00
2025-04-14 23:21:28 +00:00
< br / >
< h3 style = " color: #f59e0b; " > PII that occurs even without an Identifier < / h3 >
2025-04-14 19:07:13 +00:00
< div class = " highlight " >
< p > Certain types of sensitive information should always be classified as PII because the information is inherently self - identifying . The following should < span class = " important " > always be marked as PII < / span > even if they do not occur alongside an identifier : < / p >
< ul >
< li > Government IDs ( SSNs , passport numbers , driver ' s license numbers, tax IDs)</li>
< li > Financial Information ( credit card numbers , bank account / routing numbers ) < / li >
< li > Biometric Data ( fingerprints , retina scans , facial recognition data , voice signatures ) < / li >
< li > Login information ( < span class = " important " > only < / span > mark as PII when a < span class = " important " > username , password , and login location < / span > are present together ) < / li >
< / ul >
< / div >
2025-04-08 20:50:00 +00:00
< / header >
< div class = " container " >
2025-03-26 18:26:06 +00:00
< div class = " info-bar " >
< div class = " info-item " >
< h3 > Generated On < / h3 >
< p > { current_time } < / p >
< / div >
< div class = " info-item " >
< h3 > Workspace < / h3 >
< p title = " {workspace_path} " > { workspace_path } < / p >
< / div >
< div class = " info-item " >
< h3 > Sample Size < / h3 >
< p > { len ( random_pages ) } pages < / p >
< / div >
< / div >
< div class = " page-grid " >
"""
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
for i , ( pdf_path , page_num , page_text , result_file ) in enumerate ( tqdm ( random_pages , desc = " Rendering pages " ) ) :
# Get original URL from PDF hash
pdf_hash = parse_pdf_hash ( pdf_path )
2025-04-10 21:50:14 +00:00
_original_url = get_original_url ( pdf_hash , db_path ) if pdf_hash else None
2025-03-26 18:26:06 +00:00
# Create a truncated path for display
display_path = pdf_path
if len ( display_path ) > 60 :
display_path = " ... " + display_path [ - 57 : ]
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
# Generate presigned URL
presigned_url = create_presigned_url ( pdf_s3_client , pdf_path )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
try :
# Download PDF to temp file
bucket , key = parse_s3_path ( pdf_path )
with tempfile . NamedTemporaryFile ( suffix = " .pdf " , delete = False ) as temp_file :
2025-03-26 18:49:48 +00:00
pdf_data = pdf_s3_client . get_object ( Bucket = bucket , Key = key ) [ " Body " ] . read ( )
2025-03-26 18:26:06 +00:00
temp_file . write ( pdf_data )
temp_file_path = temp_file . name
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
# Render PDF to base64 webp
2025-04-01 18:25:40 +00:00
base64_image = render_pdf_to_base64webp ( temp_file_path , page_num , resolution )
2025-04-04 16:19:04 +00:00
# Add CSS class for the first annotation interface to be active by default
active_class = " active " if i == 0 else " "
# Add to HTML with the annotation interface
2025-04-01 18:25:40 +00:00
html_content + = f """
2025-04-04 16:19:04 +00:00
< div class = " page-container " data - index = " {i} " >
2025-04-01 18:25:40 +00:00
< div class = " page-info " >
2025-04-08 22:30:59 +00:00
< p > { f ' <a href= " { presigned_url } #page= { page_num } " target= " _blank " >View Cached PDF (page { page_num } )</a> ' if presigned_url else pdf_path } < / p >
2025-04-04 16:19:04 +00:00
< p >
Status : < span class = " annotation-status status-pending " id = " status- {i} " > Pending < / span >
< / p >
2025-03-26 18:26:06 +00:00
< / div >
2025-04-01 18:25:40 +00:00
< div class = " page-image-wrapper " >
< img class = " page-image " src = " data:image/webp;base64, {base64_image} " alt = " PDF Page {page_num} " loading = " lazy " / >
2025-03-26 18:26:06 +00:00
< / div >
2025-04-15 22:27:07 +00:00
< div class = " annotation-interface {active_class} " data - id = " page- {i} " data - pdf - path = " {pdf_path} " data - pdf - page = " {page_num} " >
2025-04-08 21:04:56 +00:00
< div class = " question-container " id = " question1- {i} " >
2025-04-14 20:27:06 +00:00
< p class = " question-text " > Is this document meant for public dissemination ? ( ex . news article , research paper , etc . ) < / p >
2025-04-08 21:04:56 +00:00
< span class = " btn-group " >
< button type = " button " class = " toggle-button primary-option " data - value = " yes-public " onclick = " togglePrimaryOption(this, {i} ) " > Yes < / button >
< button type = " button " class = " toggle-button primary-option " data - value = " no-public " onclick = " togglePrimaryOption(this, {i} ) " > No < / button >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " cannot-read " onclick = " togglePrimaryOption(this, {i} ) " > Cannot Read < / button >
2025-04-08 21:04:56 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " report-content " onclick = " togglePrimaryOption(this, {i} ) " > Report Content < / button >
< / span >
< / div >
< div class = " question-container " id = " public-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this public document : < / p >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " ssn " onchange = " saveCheckboxes(this) " > SSN < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " bank-info " onchange = " saveCheckboxes(this) " > Bank Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " credit-card " onchange = " saveCheckboxes(this) " > Credit Card Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " usernames-passwords " onchange = " saveCheckboxes(this) " > Usernames / Passwords < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
< textarea id = " other-pii-public- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
< div class = " question-container " id = " private-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this private document : < / p >
2025-04-14 20:27:06 +00:00
2025-04-14 23:21:28 +00:00
< h4 style = " margin-top: 1rem; font-size: 0.9rem; color: #3b82f6; " > Identifiers for PII ( Select these if found ) < / h4 >
2025-04-14 20:27:06 +00:00
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " names " onchange = " saveCheckboxes(this) " > Names ( full , first , last , nicknames ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " email " onchange = " saveCheckboxes(this) " > Email Addresses < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " phone " onchange = " saveCheckboxes(this) " > Phone Numbers < / label >
< / div >
2025-04-14 23:21:28 +00:00
< h4 style = " margin-top: 1rem; font-size: 0.9rem; color: #10b981; " > PII that must co - occur with an Identifier < / h4 >
2025-04-08 21:04:56 +00:00
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " addresses " onchange = " saveCheckboxes(this) " > Addresses < / label >
2025-04-14 20:27:06 +00:00
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " biographical " onchange = " saveCheckboxes(this) " > Biographical Info ( DOB , gender , etc . ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " location " onchange = " saveCheckboxes(this) " > Location Information < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " employment " onchange = " saveCheckboxes(this) " > Employment Information < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " education " onchange = " saveCheckboxes(this) " > Education Information < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " medical " onchange = " saveCheckboxes(this) " > Medical Information < / label >
< / div >
2025-04-14 23:21:28 +00:00
< h4 style = " margin-top: 1rem; font-size: 0.9rem; color: #f59e0b; " > PII that occurs even without an Identifier < / h4 >
2025-04-14 20:27:06 +00:00
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " government-id " onchange = " saveCheckboxes(this) " > Government IDs ( SSN , passport , etc . ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " financial " onchange = " saveCheckboxes(this) " > Financial Information ( credit card , bank ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " biometric " onchange = " saveCheckboxes(this) " > Biometric Data < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " login-info " onchange = " saveCheckboxes(this) " > Login Information ( username + password ) < / label >
2025-04-08 21:04:56 +00:00
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
2025-04-14 20:27:06 +00:00
2025-04-08 21:04:56 +00:00
< textarea id = " other-pii-private- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
2025-04-04 16:19:04 +00:00
< / div >
2025-04-01 18:25:40 +00:00
< / div >
"""
2025-03-26 18:26:06 +00:00
# Clean up temp file
os . unlink ( temp_file_path )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
except Exception as e :
2025-04-04 16:19:04 +00:00
# Add CSS class for the first annotation interface to be active by default
active_class = " active " if i == 0 else " "
2025-04-04 17:18:19 +00:00
2025-03-26 18:26:06 +00:00
html_content + = f """
2025-04-04 16:19:04 +00:00
< div class = " page-container " data - index = " {i} " >
2025-03-26 18:26:06 +00:00
< div class = " page-info " >
2025-04-08 22:30:59 +00:00
< p > { f ' <a href= " { presigned_url } #page= { page_num } " target= " _blank " >View Cached PDF (page { page_num } )</a> ' if presigned_url else pdf_path } < / p >
2025-04-04 16:19:04 +00:00
< p >
Status : < span class = " annotation-status status-pending " id = " status- {i} " > Pending < / span >
< / p >
2025-03-26 18:26:06 +00:00
< / div >
< div class = " error " > Error : { str ( e ) } < / div >
2025-04-15 22:27:07 +00:00
< div class = " annotation-interface {active_class} " data - id = " page- {i} " data - pdf - path = " {pdf_path} " data - pdf - page = " {page_num} " >
2025-04-08 21:04:56 +00:00
< div class = " question-container " id = " question1- {i} " >
2025-04-22 16:16:21 -07:00
< p class = " question-text " > Is this document intended for public release or dissemination ? < / p >
2025-04-08 21:04:56 +00:00
< span class = " btn-group " >
< button type = " button " class = " toggle-button primary-option " data - value = " yes-public " onclick = " togglePrimaryOption(this, {i} ) " > Yes < / button >
< button type = " button " class = " toggle-button primary-option " data - value = " no-public " onclick = " togglePrimaryOption(this, {i} ) " > No < / button >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " cannot-read " onclick = " togglePrimaryOption(this, {i} ) " > Cannot Read < / button >
2025-04-08 21:04:56 +00:00
< button type = " button " class = " toggle-button primary-option " data - value = " report-content " onclick = " togglePrimaryOption(this, {i} ) " > Report Content < / button >
< / span >
< / div >
< div class = " question-container " id = " public-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this public document : < / p >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " ssn " onchange = " saveCheckboxes(this) " > SSN < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " bank-info " onchange = " saveCheckboxes(this) " > Bank Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " credit-card " onchange = " saveCheckboxes(this) " > Credit Card Info < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " usernames-passwords " onchange = " saveCheckboxes(this) " > Usernames / Passwords < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
< textarea id = " other-pii-public- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
< div class = " question-container " id = " private-pii-options- {i} " style = " display: none; margin-top: 1rem; " >
< p class = " question-text " > Select any PII found in this private document : < / p >
2025-04-14 20:27:06 +00:00
< h4 style = " margin-top: 1rem; font-size: 0.9rem; color: #3b82f6; " > Identifiers ( Select these if found ) < / h4 >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " names " onchange = " saveCheckboxes(this) " > Names ( full , first , last , nicknames ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " email " onchange = " saveCheckboxes(this) " > Email Addresses < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " phone " onchange = " saveCheckboxes(this) " > Phone Numbers < / label >
< / div >
< h4 style = " margin-top: 1rem; font-size: 0.9rem; color: #10b981; " > PII that requires an identifier above < / h4 >
2025-04-08 21:04:56 +00:00
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " addresses " onchange = " saveCheckboxes(this) " > Addresses < / label >
2025-04-14 20:27:06 +00:00
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " biographical " onchange = " saveCheckboxes(this) " > Biographical Info ( DOB , gender , etc . ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " location " onchange = " saveCheckboxes(this) " > Location Information < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " employment " onchange = " saveCheckboxes(this) " > Employment Information < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " education " onchange = " saveCheckboxes(this) " > Education Information < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " medical " onchange = " saveCheckboxes(this) " > Medical Information < / label >
< / div >
< h4 style = " margin-top: 1rem; font-size: 0.9rem; color: #f59e0b; " > PII that is always sensitive ( even without an identifier ) < / h4 >
< div class = " checkbox-group " >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " government-id " onchange = " saveCheckboxes(this) " > Government IDs ( SSN , passport , etc . ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " financial " onchange = " saveCheckboxes(this) " > Financial Information ( credit card , bank ) < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " biometric " onchange = " saveCheckboxes(this) " > Biometric Data < / label >
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " login-info " onchange = " saveCheckboxes(this) " > Login Information ( username + password ) < / label >
2025-04-08 21:04:56 +00:00
< label > < input type = " checkbox " class = " pii-checkbox " data - value = " other " onchange = " toggleOtherTextarea(this) " > Other < / label >
< / div >
2025-04-14 20:27:06 +00:00
2025-04-08 21:04:56 +00:00
< textarea id = " other-pii-private- {i} " placeholder = " Describe other PII found in the document " style = " display: none; " onchange = " saveFeedback(this) " onkeydown = " handleTextareaKeydown(event, this) " > < / textarea >
2025-04-08 22:30:59 +00:00
< button type = " button " class = " continue-button " onclick = " saveThenNext(this) " > Continue < / button >
2025-04-08 21:04:56 +00:00
< / div >
2025-04-04 16:19:04 +00:00
< / div >
2025-03-26 18:26:06 +00:00
< / div >
"""
2025-03-26 18:49:48 +00:00
2025-04-04 17:18:19 +00:00
html_content + = (
"""
2025-03-26 18:26:06 +00:00
< / div >
2025-04-04 16:19:04 +00:00
< div class = " completion-message " id = " completion-message " >
2025-04-04 17:12:46 +00:00
Thank you ! All annotations are complete . < br >
Your Prolific completion code is : < strong id = " prolific-code " > Loading . . . < / strong >
2025-04-04 16:19:04 +00:00
< / div >
2025-04-04 17:12:46 +00:00
< ! - - Store the obfuscated code in a hidden element - - >
2025-04-04 17:18:19 +00:00
< div id = " obfuscated-code " style = " display:none; " > """
+ obfuscated_code
+ """ </div>
2025-04-04 16:19:04 +00:00
2025-03-26 18:26:06 +00:00
< / div >
2025-04-01 18:35:04 +00:00
< script >
/ / Using externally injected async functions : fetchDatastore ( ) and putDatastore ( )
2025-04-04 16:19:04 +00:00
/ / Track annotation progress
let currentIndex = 0 ;
const totalPages = document . querySelectorAll ( ' .page-container ' ) . length ;
2025-04-04 21:41:36 +00:00
2025-04-04 16:19:04 +00:00
/ / Update progress bar
/**
 * Reveal the completion banner once every page has been annotated.
 *
 * Despite the name there is no bar to redraw: the only visible effect is
 * showing #completion-message when currentIndex has moved past the last page.
 */
function updateProgressBar() {
    const allDone = currentIndex >= totalPages;
    if (!allDone) {
        return;
    }
    const banner = document.getElementById('completion-message');
    banner.style.display = 'block';
}
/ / Update status indicators
/**
 * Repaint every status badge so it reflects the current position.
 *
 * All badges are first reset to "Pending" (and lose their click handler),
 * the badge for currentIndex becomes "Current", and every badge before it
 * becomes a clickable "Edit" link that re-opens that annotation.
 */
function updateStatusIndicators() {
    const paint = (badge, state, label) => {
        badge.className = 'annotation-status status-' + state;
        badge.textContent = label;
    };

    // Start from a clean slate: everything pending, nothing clickable.
    for (const badge of document.querySelectorAll('.annotation-status')) {
        paint(badge, 'pending', 'Pending');
        badge.onclick = null;
    }

    // Highlight the page being worked on (absent once all pages are done).
    const currentBadge = document.getElementById(`status-${currentIndex}`);
    if (currentBadge) {
        paint(currentBadge, 'current', 'Current');
    }

    // Everything before the current page is finished and can be edited.
    for (let done = 0; done < currentIndex; done += 1) {
        const doneBadge = document.getElementById(`status-${done}`);
        if (!doneBadge) {
            continue;
        }
        paint(doneBadge, 'complete', 'Edit ✎');
        doneBadge.onclick = function() { editAnnotation(done); };
    }
}
2025-04-04 19:38:59 +00:00
/ / Function to enable editing a previously completed annotation
/**
 * Re-open a previously completed annotation for editing.
 *
 * Hides the interface that is currently active (if any), shows the one for
 * `index`, marks its page container as being edited, scrolls it into view
 * and makes it the current page.
 *
 * @param {number} index Zero-based page index of the annotation to edit.
 */
function editAnnotation(index) {
    const interfaceSelector = (n) => `.annotation-interface[data-id="page-${n}"]`;

    // Nothing to edit if the requested page does not exist.
    const targetInterface = document.querySelector(interfaceSelector(index));
    if (!targetInterface) {
        return;
    }

    // Once the last page is finished currentIndex equals totalPages, so no
    // interface matches it. Without the null-safe access the lookup would
    // throw and "Edit" would be unusable after completing the task.
    document.querySelector(interfaceSelector(currentIndex))?.classList.remove('active');

    // Only one container may carry the editing highlight at a time.
    document.querySelectorAll('.page-container').forEach(container => {
        container.classList.remove('editing');
    });

    // Show the selected annotation interface.
    targetInterface.classList.add('active');

    // Highlight the page being edited and bring it into view.
    const activeContainer = document.querySelector(`.page-container[data-index="${index}"]`);
    if (activeContainer) {
        activeContainer.classList.add('editing');
        activeContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
    }

    // Make the edited page the current one and refresh the indicators.
    currentIndex = index;
    updateProgressBar();
    updateStatusIndicators();
}
2025-04-04 16:19:04 +00:00
/ / Navigate to the next document
/**
 * Advance from the current page to the next one.
 *
 * The current interface is hidden and all editing highlights are cleared.
 * If another page follows, it becomes active and is scrolled into view;
 * otherwise the task is marked complete and the completion banner is shown.
 */
function goToNextDocument() {
    const interfaceFor = (n) =>
        document.querySelector(`.annotation-interface[data-id="page-${n}"]`);

    // Leave the current page.
    interfaceFor(currentIndex).classList.remove('active');
    for (const container of document.querySelectorAll('.page-container')) {
        container.classList.remove('editing');
    }

    const wasLastDocument = !(currentIndex < totalPages - 1);
    if (wasLastDocument) {
        // Moving the index past the end is what signals completion.
        currentIndex = totalPages;
        updateProgressBar();
        updateStatusIndicators();

        const banner = document.getElementById('completion-message');
        banner.style.display = 'block';
        banner.scrollIntoView({ behavior: 'smooth', block: 'center' });
        return;
    }

    // Enter the next page.
    currentIndex++;
    interfaceFor(currentIndex).classList.add('active');

    const nextContainer = document.querySelector(`.page-container[data-index="${currentIndex}"]`);
    if (nextContainer) {
        nextContainer.classList.add('editing');
        nextContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
    }

    updateProgressBar();
    updateStatusIndicators();
}
/ / Handle text area keydown for Enter key
/**
 * Keydown handler for the "Other PII" textareas.
 *
 * A plain Enter saves the annotation and then moves to the next document;
 * Shift+Enter and every other key keep their default behaviour.
 *
 * @param {KeyboardEvent} event The keydown event.
 * @param {HTMLTextAreaElement} textarea The textarea that received it.
 */
function handleTextareaKeydown(event, textarea) {
    const isPlainEnter = event.key === 'Enter' && !event.shiftKey;
    if (!isPlainEnter) {
        return;
    }

    // Suppress the newline; Enter acts as "submit" here.
    event.preventDefault();
    saveFeedback(textarea).then(() => {
        goToNextDocument();
    });
}
2025-04-01 20:03:15 +00:00
/**
 * Persist the annotation state of one page to the datastore.
 *
 * The state is read from the DOM synchronously (before the first await), so
 * it reflects the form exactly as it was when the function was called. The
 * whole datastore is then fetched, the entry for this page is replaced, and
 * the datastore is written back.
 *
 * @param {Element} source Any element inside (or equal to) the page's
 *     .annotation-interface element.
 * @returns {Promise<void>} Resolves once putDatastore has completed.
 */
async function saveFeedback(source) {
    const panel = source.closest('.annotation-interface');
    const id = panel.getAttribute('data-id');
    // data-id has the form "page-<n>"; the per-page element ids use <n>.
    const pageIndex = id.split('-')[1];

    // data-value of every ticked checkbox inside the given container.
    const checkedValues = (containerId) => Array.from(
        panel.querySelectorAll('#' + containerId + ' input[type="checkbox"]:checked'),
        (box) => box.getAttribute('data-value')
    );

    const selectedButton = panel.querySelector('button.primary-option.active');

    const record = {
        primaryOption: selectedButton ? selectedButton.getAttribute('data-value') : null,
        publicPiiOptions: checkedValues('public-pii-options-' + pageIndex),
        privatePiiOptions: checkedValues('private-pii-options-' + pageIndex),
        otherPublicDesc: panel.querySelector('#other-pii-public-' + pageIndex)?.value || '',
        otherPrivateDesc: panel.querySelector('#other-pii-private-' + pageIndex)?.value || '',
        pdfPath: panel.getAttribute('data-pdf-path'),
        pdfPage: panel.getAttribute('data-pdf-page')
    };

    // Read-modify-write of the whole datastore object.
    const datastore = await fetchDatastore() || {};
    datastore[id] = record;
    await putDatastore(datastore);
}
2025-04-08 22:30:59 +00:00
/**
 * Click handler for the "Continue" buttons: save the page the button
 * belongs to and, once the save has finished, advance to the next document.
 *
 * @param {Element} btn The clicked button.
 */
function saveThenNext(btn) {
    const panel = btn.closest('.annotation-interface');
    const saved = saveFeedback(panel);
    saved.then(() => {
        goToNextDocument();
    });
}
2025-04-08 21:04:56 +00:00
/**
 * Handle a click on one of the primary answer buttons
 * (Yes / No / Cannot Read / Report Content).
 *
 * The clicked button becomes the only active one and the choice is saved
 * immediately. "No" reveals the private-document PII questions; every other
 * answer clears any PII input on the page and advances to the next document.
 *
 * @param {Element} btn The clicked button.
 * @param {number} index Zero-based index of the page the button belongs to.
 */
function togglePrimaryOption(btn, index) {
    const panel = btn.closest('.annotation-interface');

    // Radio-button behaviour: exactly one primary option is active.
    for (const other of panel.querySelectorAll('button.primary-option')) {
        other.classList.remove('active');
    }
    btn.classList.add('active');

    const option = btn.getAttribute('data-value');

    // These answers carry no PII details, so drop anything left over from an
    // earlier "No" selection before the state is saved.
    const answersWithoutPii = ['yes-public', 'cannot-read', 'report-content'];
    if (answersWithoutPii.includes(option)) {
        for (const box of panel.querySelectorAll('.pii-checkbox')) {
            box.checked = false;
        }
        for (const area of panel.querySelectorAll('textarea')) {
            area.value = '';
            area.style.display = 'none';
        }
    }

    // Collapse both follow-up sections; the relevant one is reopened below.
    const publicSection = document.querySelector(`#public-pii-options-${index}`);
    const privateSection = document.querySelector(`#private-pii-options-${index}`);
    publicSection.style.display = 'none';
    privateSection.style.display = 'none';

    // Save right away (not awaited): the form state is captured synchronously.
    saveFeedback(panel);

    if (option === 'no-public') {
        // Private document: ask which PII it contains.
        document.querySelector(`#private-pii-options-${index}`).style.display = 'block';
        return;
    }

    // "Yes", "Cannot Read" and "Report Content" need no follow-up questions.
    goToNextDocument();
}
/**
 * Change handler for the "Other" checkbox: show (and focus) the free-text
 * description box while the checkbox is ticked, hide it otherwise, then
 * save the page.
 *
 * @param {HTMLInputElement} checkbox The "Other" checkbox that changed.
 */
function toggleOtherTextarea(checkbox) {
    const section = checkbox.closest('.question-container');
    const descriptionId = section.querySelector('textarea').id;
    const description = document.getElementById(descriptionId);

    description.style.display = checkbox.checked ? 'block' : 'none';
    if (checkbox.checked) {
        description.focus();
    }

    saveCheckboxes(checkbox);
}
/**
 * Change handler for PII checkboxes: save the page the input belongs to.
 *
 * @param {Element} input The checkbox that changed.
 * @returns {Promise<void>} The promise returned by saveFeedback.
 */
function saveCheckboxes(input) {
    return saveFeedback(input.closest('.annotation-interface'));
}
2025-04-04 17:12:46 +00:00
/ / Function to deobfuscate the Prolific code
/**
 * Recover the Prolific completion code from its obfuscated form.
 *
 * The obfuscated value is the base64 encoding of the code with its
 * characters in reverse order, so it is reversed and then base64-decoded.
 *
 * @param {string} obfuscatedCode Reversed base64 text.
 * @returns {string} The decoded code, or "ERROR_DECODING" if the reversed
 *     text is not valid base64.
 */
function deobfuscateCode(obfuscatedCode) {
    // Undo the reversal one UTF-16 code unit at a time.
    let base64Text = '';
    for (let pos = obfuscatedCode.length - 1; pos >= 0; pos -= 1) {
        base64Text += obfuscatedCode[pos];
    }

    try {
        return atob(base64Text);
    } catch (e) {
        return "ERROR_DECODING";
    }
}
2025-04-22 21:16:41 +00:00
/**
 * Look up a parameter in the page URL's query string.
 *
 * @param {string} name Parameter name.
 * @returns {string|null} The first value for `name`, or null if absent.
 */
function getQueryParam(name) {
    return new URLSearchParams(window.location.search).get(name);
}
2025-04-01 18:35:04 +00:00
// Page start-up: load saved annotations from the datastore, record the
// Prolific participant id, reveal the completion code, restore every page's
// form state and resume at the page after the last annotated one.
document.addEventListener("DOMContentLoaded", async function() {
    const datastore = await fetchDatastore() || {};

    // Remember which Prolific participant this session belongs to.
    const prolificPid = getQueryParam('PROLIFIC_PID');
    if (prolificPid) {
        datastore.prolific_pid = prolificPid;
        await putDatastore(datastore);
    }

    // Make sure the "instructions seen" flag exists in the datastore.
    if (!datastore.hasOwnProperty('instructions_seen')) {
        datastore.instructions_seen = false;
        await putDatastore(datastore);
    }

    // The first page is the one being edited until proven otherwise.
    document.querySelector(`.page-container[data-index="0"]`)?.classList.add('editing');

    updateProgressBar();
    updateStatusIndicators();

    // Decode the completion code into the (still hidden) completion banner.
    const obfuscatedCode = document.getElementById('obfuscated-code').textContent;
    const prolificCode = deobfuscateCode(obfuscatedCode);
    document.getElementById('prolific-code').textContent = prolificCode;

    // Re-tick the saved checkboxes of one PII section ("public"/"private")
    // and reveal its "Other" description box when "other" was selected.
    const restoreCheckboxes = (kind, pageIndex, savedValues) => {
        if (!(savedValues && savedValues.length > 0)) {
            return;
        }
        const section = document.querySelector(`#${kind}-pii-options-${pageIndex}`);
        savedValues.forEach(value => {
            const box = section.querySelector(`input[data-value="${value}"]`);
            if (!box) {
                return;
            }
            box.checked = true;
            if (value === 'other') {
                document.getElementById(`other-pii-${kind}-${pageIndex}`).style.display = 'block';
            }
        });
    };

    // Restore the form state of every page that has a saved entry.
    document.querySelectorAll('.annotation-interface').forEach(function(panel) {
        const id = panel.getAttribute('data-id');
        const pageIndex = id.split('-')[1];
        const saved = datastore[id];
        if (!saved) {
            return;
        }

        // Primary answer: activate the saved button, deactivate the rest.
        panel.querySelectorAll('button.primary-option').forEach(function(button) {
            const value = button.getAttribute('data-value');
            const isSaved = value === saved.primaryOption;
            button.classList.toggle('active', isSaved);

            // Only "No" has follow-up questions to reopen; for "Yes" the PII
            // options stay hidden.
            if (isSaved && value === 'no-public') {
                document.querySelector(`#private-pii-options-${pageIndex}`).style.display = 'block';
            }
        });

        restoreCheckboxes('public', pageIndex, saved.publicPiiOptions);
        restoreCheckboxes('private', pageIndex, saved.privatePiiOptions);

        // Free-text descriptions.
        if (saved.otherPublicDesc) {
            document.getElementById(`other-pii-public-${pageIndex}`).value = saved.otherPublicDesc;
        }
        if (saved.otherPrivateDesc) {
            document.getElementById(`other-pii-private-${pageIndex}`).value = saved.otherPrivateDesc;
        }
    });

    // Find the highest page index that already has a primary answer.
    let lastAnnotatedIndex = totalPages - 1;
    while (lastAnnotatedIndex >= 0) {
        const entry = datastore[`page-${lastAnnotatedIndex}`];
        if (entry && entry.primaryOption) {
            break;
        }
        lastAnnotatedIndex -= 1;
    }

    // Nothing annotated yet: stay on the first page.
    if (lastAnnotatedIndex < 0) {
        return;
    }

    document.querySelector(`.annotation-interface.active`).classList.remove('active');

    if (lastAnnotatedIndex === totalPages - 1) {
        // Every page is annotated: an index past the end signals completion.
        currentIndex = totalPages;

        const banner = document.getElementById('completion-message');
        banner.style.display = 'block';
        banner.scrollIntoView({ behavior: 'smooth', block: 'center' });
    } else {
        // Resume on the page right after the last annotated one.
        currentIndex = lastAnnotatedIndex + 1;
        document.querySelector(`.annotation-interface[data-id="page-${currentIndex}"]`).classList.add('active');

        const resumeContainer = document.querySelector(`.page-container[data-index="${currentIndex}"]`);
        if (resumeContainer) {
            // Move the editing highlight from the first page to this one.
            document.querySelectorAll('.page-container').forEach(container => {
                container.classList.remove('editing');
            });
            resumeContainer.classList.add('editing');
            resumeContainer.scrollIntoView({ behavior: 'smooth', block: 'center' });
        }
    }

    updateProgressBar();
    updateStatusIndicators();
});
2025-04-22 21:33:39 +00:00
/ / Instructions modal functionality
/ / Create modal container
const instructionsModal = document . createElement ( ' div ' ) ;
instructionsModal . className = ' instructions-modal-overlay ' ;
instructionsModal . id = ' instructions-modal ' ;
/ / Create modal content container
const modalContent = document . createElement ( ' div ' ) ;
modalContent . className = ' instructions-modal ' ;
/ / Create header
const modalHeader = document . createElement ( ' div ' ) ;
modalHeader . className = ' instructions-modal-header ' ;
modalHeader . innerHTML = `
< h2 > Welcome to the OLMO OCR Annotation Task < / h2 >
< p > Please read these instructions carefully before you begin . < / p >
` ;
/ / Create content section - will be populated with sidebar content
const modalContentSection = document . createElement ( ' div ' ) ;
modalContentSection . className = ' instructions-modal-content ' ;
/ / Clone the sidebar content to reuse in the modal
const sidebarContent = document . querySelector ( ' header ' ) . cloneNode ( true ) ;
/ / Remove the " View Instructions Popup " button from the cloned content
const viewInstructionsButton = sidebarContent . querySelector ( ' #view-instructions-button ' ) ;
if ( viewInstructionsButton ) {
viewInstructionsButton . remove ( ) ;
}
/ / Style the sidebar content for use in the modal
sidebarContent . style . fontSize = ' 14px ' ;
sidebarContent . style . lineHeight = ' 1.5 ' ;
/ / Append the cloned sidebar content to the modal content section
modalContentSection . appendChild ( sidebarContent ) ;
/ / Create footer with start button ( initially disabled )
const modalFooter = document . createElement ( ' div ' ) ;
modalFooter . className = ' instructions-modal-footer ' ;
modalFooter . innerHTML = ` < button id = " start-button " class = " instructions-modal-button " disabled > I Understand , Begin Task < / button >
< p id = " scroll-notice " style = " margin-top: 10px; font-size: 0.85rem; color: #6b7280; " > Please scroll to the bottom to continue < / p > ` ;
/ / Assemble the modal
modalContent . appendChild ( modalHeader ) ;
modalContent . appendChild ( modalContentSection ) ;
modalContent . appendChild ( modalFooter ) ;
instructionsModal . appendChild ( modalContent ) ;
/ / Track scroll position in instructions and enable button when scrolled to bottom
let hasReachedBottom = false ;
/ / Function to check if user has scrolled to the bottom of instructions
function checkScrollPosition ( ) {
const contentSection = modalContentSection ;
const scrollableContent = contentSection ;
/ / Calculate if the user is at the bottom ( allowing for small differences )
/ / We consider " bottom " when user has scrolled to at least 90 % of the content
const scrollPosition = scrollableContent . scrollTop + scrollableContent . clientHeight ;
const scrollHeight = scrollableContent . scrollHeight ;
const scrollPercentage = ( scrollPosition / scrollHeight ) * 100 ;
if ( scrollPercentage > = 90 & & ! hasReachedBottom ) {
/ / User has scrolled to the bottom , enable the button
hasReachedBottom = true ;
const startButton = document . getElementById ( ' start-button ' ) ;
if ( startButton ) {
startButton . disabled = false ;
/ / Change the notice text
const scrollNotice = document . getElementById ( ' scroll-notice ' ) ;
if ( scrollNotice ) {
scrollNotice . textContent = ' You may now proceed ' ;
scrollNotice . style . color = ' #10b981 ' ; / / Success color
}
}
}
}
/ / Add scroll event listener to the modal content
modalContentSection . addEventListener ( ' scroll ' , checkScrollPosition ) ;
document . body . appendChild ( instructionsModal ) ;
/ / Show the instructions modal when the page loads
async function showInstructionsModal ( ) {
const datastore = await fetchDatastore ( ) | | { } ;
/ / Check if the task is already completed or instructions have been seen
const isTaskCompleted = currentIndex > = totalPages ;
const instructionsSeen = datastore . instructions_seen == = true ;
/ / Only show instructions if task is not completed and instructions haven ' t been seen
if ( ! isTaskCompleted & & ! instructionsSeen ) {
instructionsModal . classList . add ( ' visible ' ) ;
}
}
/ / Handle button clicks for instructions modal
document . addEventListener ( ' click ' , async function ( event ) {
/ / Start button closes the modal and marks instructions as seen
if ( event . target & & event . target . id == = ' start-button ' ) {
/ / Hide the modal
instructionsModal . classList . remove ( ' visible ' ) ;
/ / Update datastore to remember that instructions have been seen
const datastore = await fetchDatastore ( ) | | { } ;
datastore . instructions_seen = true ;
await putDatastore ( datastore ) ;
}
/ / View instructions button shows the modal
if ( event . target & & event . target . id == = ' view-instructions-button ' ) {
instructionsModal . classList . add ( ' visible ' ) ;
}
} ) ;
/ / Show the instructions modal when page loads ( after a slight delay )
setTimeout ( showInstructionsModal , 500 ) ;
2025-04-01 18:35:04 +00:00
< / script >
2025-03-26 18:26:06 +00:00
< / body >
< / html >
"""
2025-04-04 17:18:19 +00:00
)
2025-03-26 18:49:48 +00:00
with open ( output_path , " w " ) as f :
2025-03-26 18:26:06 +00:00
f . write ( html_content )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
print ( f " Created HTML output at { output_path } " )
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
def generate_sample_set(args, i, s3_client, pdf_s3_client, result_files):
    """Build one HTML sample file and return its path.

    Draws ``args.pages_per_output`` random pages from ``result_files`` with
    ``get_random_pages`` and renders them with ``create_html_output``, passing
    along the workspace, the URL database path and the fixed Prolific
    completion code given on the command line.

    Args:
        args: Parsed command-line namespace; reads ``output_dir``, ``repeats``,
            ``pages_per_output``, ``workspace``, ``db_path`` and
            ``prolific_code``.
        i: Zero-based index of this sample set (the file name is one-based).
        s3_client: S3 client handed to ``get_random_pages``.
        pdf_s3_client: S3 client handed to ``create_html_output``.
        result_files: Result files to sample pages from.

    Returns:
        Path of the written ``dolma_samples_<i + 1>.html`` file.
    """
    set_number = i + 1
    target_path = Path(args.output_dir) / f"dolma_samples_{set_number}.html"

    print(f"\nGenerating sample set {set_number} of {args.repeats}")

    sampled_pages = get_random_pages(s3_client, result_files, args.pages_per_output)

    # Every output shares the same completion code supplied via --prolific_code.
    create_html_output(
        sampled_pages,
        pdf_s3_client,
        target_path,
        args.workspace,
        args.db_path,
        args.prolific_code,
    )

    return target_path
2025-03-26 18:26:06 +00:00
2025-03-26 18:49:48 +00:00
2025-04-04 19:44:54 +00:00
def extract_datastore_url(html_content: str) -> Optional[str]:
    """Return the presigned datastore URL embedded in an annotation page.

    Looks for the first script declaration of the form
    ``const presignedGetUrl = "...";`` and returns the text between the
    double quotes.

    Args:
        html_content: Full HTML source of the annotation page.

    Returns:
        The quoted URL, or ``None`` when no such declaration (with a non-empty
        value) is present.
    """
    declaration = re.search(r'const\s+presignedGetUrl\s*=\s*"([^"]+)"', html_content)
    if declaration is None:
        return None
    return declaration.group(1)
2025-04-15 22:27:07 +00:00
def extract_page_number_from_html(html_content: str, page_id: str) -> Optional[int]:
    """Extract the PDF page number for ``page_id`` from annotation-page HTML.

    This is a fallback for older versions of the annotation page that didn't
    store the page number with the annotation. The number is read from the
    "View Cached PDF (page X)" link inside the ``page-container`` div whose
    ``data-index`` matches ``page_id`` (``page-<data-index>``).

    The link is searched only between the opening tag of the matching
    container and the opening tag of the next container. A single lazy
    ``.*?`` pattern spanning the whole document would otherwise run past a
    container that has no cached-PDF link and report the *next* container's
    page number for it (and nothing for that next container).

    Args:
        html_content: Full HTML source of the annotation page.
        page_id: Datastore key of the page, e.g. ``"page-3"``.

    Returns:
        The page number taken from the link's ``#page=`` URL fragment, or
        ``None`` when the container or its cached-PDF link cannot be found.
    """
    container_open = re.compile(r'<div class="page-container"[^>]*data-index="([^"]*)"[^>]*>')
    cached_pdf_link = re.compile(
        r'<div class="page-info">.*?<a href="[^"]*#page=([0-9]+)"[^>]*>View Cached PDF \(page ([0-9]+)\)</a>',
        re.DOTALL,
    )

    containers = list(container_open.finditer(html_content))
    for position, container in enumerate(containers):
        if f"page-{container.group(1)}" != page_id:
            continue

        # The container's content ends where the next container opens
        # (or at the end of the document for the last one).
        if position + 1 < len(containers):
            segment_end = containers[position + 1].start()
        else:
            segment_end = len(html_content)

        link = cached_pdf_link.search(html_content, container.end(), segment_end)
        if link is not None:
            # The URL fragment and the link text carry the same number; the
            # fragment is preferred. Both groups are ASCII digits, so int()
            # cannot fail here.
            return int(link.group(1))

    return None
def fetch_annotations(tinyhost_link: str, timeout: float = 30.0) -> Tuple[Dict[str, Any], str, str]:
    """Fetch and parse annotations from a tinyhost link.

    Downloads the annotation page, extracts the presigned datastore URL from
    it with ``extract_datastore_url`` and downloads the datastore JSON.

    Args:
        tinyhost_link: URL of the hosted annotation page.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        ``(annotations, tinyhost_link, html_content)``. ``annotations`` is an
        empty dict when the page has no datastore URL, when the datastore
        cannot be fetched or decoded, or when it is not a JSON object.

    Raises:
        requests.RequestException: If the annotation page itself cannot be
            fetched (connection error, timeout or HTTP error status).
    """
    # Request the HTML content
    print(f"Fetching annotations from {tinyhost_link}")
    response = requests.get(tinyhost_link, timeout=timeout)
    response.raise_for_status()
    html_content = response.text

    # Extract the datastore URL
    datastore_url = extract_datastore_url(html_content)
    if not datastore_url:
        print(f"Could not find datastore URL in {tinyhost_link}")
        return {}, tinyhost_link, html_content

    # Fetch the datastore content
    print(f"Found datastore URL: {datastore_url}")
    try:
        datastore_response = requests.get(datastore_url, timeout=timeout)
        datastore_response.raise_for_status()
        annotations = datastore_response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on every requests version.
        print(f"Error fetching datastore from {datastore_url}: {e}")
        return {}, tinyhost_link, html_content

    # Callers treat the datastore as a mapping; a JSON list/null/scalar would
    # crash them later, so report it here and fall back to "no annotations".
    if not isinstance(annotations, dict):
        print(f"Error fetching datastore from {datastore_url}: expected a JSON object, got {type(annotations).__name__}")
        return {}, tinyhost_link, html_content

    return annotations, tinyhost_link, html_content
2025-04-04 19:44:54 +00:00
2025-04-15 22:27:07 +00:00
def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str, str]]) -> Dict[str, List[Dict[str, Any]]]:
    """Sort every page annotation into a bucket keyed by feedback type.

    Args:
        annotations_by_link: One ``(annotations, link, html_content)`` tuple
            per annotation page. ``annotations`` is the datastore dict,
            ``link`` the page URL and ``html_content`` the page source (used
            only to recover a page number the annotation itself lacks).

    Returns:
        A dict with the keys ``public_document``, ``private_document``,
        ``cannot_read``, ``report_content`` and ``no_annotation``. Each value
        is a list of records holding ``page_id``, ``link``, ``pdf_path``,
        ``pdf_page`` and ``prolific_pid``; public and private records also
        hold ``pii_types``, ``has_pii`` and ``description``.
    """
    buckets: Dict[str, List[Dict[str, Any]]] = {
        name: [] for name in ("public_document", "private_document", "cannot_read", "report_content", "no_annotation")
    }

    for datastore, link, html_content in annotations_by_link:
        # The participant id is stored once per datastore, next to the pages.
        prolific_pid = datastore.get("prolific_pid", None)

        for page_id, entry in datastore.items():
            if page_id == "prolific_pid":
                continue
            # Anything that is not a dict with a primary choice (e.g. a
            # boolean flag stored in the datastore) is not a page annotation.
            if not (isinstance(entry, dict) and "primaryOption" in entry):
                continue

            choice = entry["primaryOption"]

            # Page number: stored value first (new format) ...
            pdf_page = None
            if entry.get("pdfPage"):
                try:
                    pdf_page = int(entry.get("pdfPage"))
                except (ValueError, TypeError):
                    pdf_page = None
            # ... then the number printed in the page HTML (older format).
            if pdf_page is None:
                pdf_page = extract_page_number_from_html(html_content, page_id)

            # Fields shared by every record; category-specific ones follow.
            record: Dict[str, Any] = {
                "page_id": page_id,
                "link": link,
                "pdf_path": entry.get("pdfPath", "Unknown"),
                "pdf_page": pdf_page,
            }

            if choice == "yes-public":
                # Public documents carry no PII details in the current flow.
                bucket = "public_document"
                record.update(pii_types=[], has_pii=False, description="")
            elif choice == "no-public":
                bucket = "private_document"
                selected_pii = entry.get("privatePiiOptions", [])
                free_text = entry.get("otherPrivateDesc", "")
                if selected_pii:
                    record.update(
                        pii_types=selected_pii,
                        has_pii=True,
                        description=free_text if "other" in selected_pii else "",
                    )
                else:
                    record.update(pii_types=[], has_pii=False, description="")
            elif choice == "cannot-read":
                bucket = "cannot_read"
            elif choice == "report-content":
                bucket = "report_content"
            else:
                bucket = "no_annotation"

            record["prolific_pid"] = prolific_pid
            buckets[bucket].append(record)

    return buckets
2025-04-15 22:27:07 +00:00
def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]], pdf_s3_client=None):
    """Print a human-readable summary of categorized annotations to stdout.

    Args:
        annotation_results: Mapping with the keys ``public_document``,
            ``private_document``, ``cannot_read``, ``report_content`` and
            ``no_annotation``; each value is a list of per-page records.
        pdf_s3_client: Optional S3 client. When given, a presigned URL is
            printed for every private page with PII whose ``pdf_path`` starts
            with ``s3://``.

    Every percentage is reported as 0.0 when its denominator is zero, so an
    empty result set prints an all-zero report instead of raising
    ``ZeroDivisionError``.
    """

    def percent(part: int, whole: int) -> float:
        """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
        return part / whole * 100 if whole else 0.0

    public_docs = annotation_results["public_document"]
    private_docs = annotation_results["private_document"]
    unreadable = annotation_results["cannot_read"]
    reported = annotation_results["report_content"]
    unannotated = annotation_results["no_annotation"]

    total_pages = sum(len(items) for items in annotation_results.values())

    print("\n" + "=" * 80)
    print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
    print("=" * 80)

    # Split public and private documents by whether PII was flagged
    public_with_pii = [page for page in public_docs if page.get("has_pii", False)]
    public_without_pii = [page for page in public_docs if not page.get("has_pii", False)]
    private_with_pii = [page for page in private_docs if page.get("has_pii", False)]
    private_without_pii = [page for page in private_docs if not page.get("has_pii", False)]

    # Print summary statistics
    print("\nSummary:")
    print(f"  Public documents (total): {len(public_docs)} ({percent(len(public_docs), total_pages):.1f}% of all pages)")
    print(f"    - With PII: {len(public_with_pii)} ({percent(len(public_with_pii), len(public_docs)):.1f}% of public docs)")
    print(f"    - Without PII: {len(public_without_pii)} ({percent(len(public_without_pii), len(public_docs)):.1f}% of public docs)")
    print(f"  Private documents (total): {len(private_docs)} ({percent(len(private_docs), total_pages):.1f}% of all pages)")
    print(f"    - With PII: {len(private_with_pii)} ({percent(len(private_with_pii), len(private_docs)):.1f}% of private docs)")
    print(f"    - Without PII: {len(private_without_pii)} ({percent(len(private_without_pii), len(private_docs)):.1f}% of private docs)")
    print(f"  Unreadable pages: {len(unreadable)} ({percent(len(unreadable), total_pages):.1f}%)")
    print(f"  Pages with reported content: {len(reported)} ({percent(len(reported), total_pages):.1f}%)")
    print(f"  Pages without annotation: {len(unannotated)} ({percent(len(unannotated), total_pages):.1f}%)")

    # With the updated flow, there should be no public documents with PII flags
    # as we don't collect PII information for public documents anymore
    if public_with_pii:
        print("\nNote: With the current annotation flow, public documents should not have PII flags.")
        print(f"Found {len(public_with_pii)} public documents incorrectly marked with PII.")

    # Analyze PII types in private documents
    if private_with_pii:
        # Categorize the PII types for clearer reporting
        pii_categories = {
            "Identifiers": ["names", "email", "phone"],
            "PII requiring identifiers": ["addresses", "biographical", "location", "employment", "education", "medical"],
            "Always sensitive PII": ["government-id", "financial", "biometric", "login-info"],
        }

        # Count how many PII-bearing private pages mention each PII type
        pii_counts_private: Dict[str, int] = {}
        for page in private_with_pii:
            for pii_type in page.get("pii_types", []):
                pii_counts_private[pii_type] = pii_counts_private.get(pii_type, 0) + 1

        print("\nPII Types in Private Documents:")

        for category, pii_types in pii_categories.items():
            print(f"\n  {category}:")
            for pii_type in pii_types:
                count = pii_counts_private.get(pii_type, 0)
                if count > 0:
                    print(f"    - {pii_type}: {count} ({percent(count, len(private_with_pii)):.1f}%)")

        # Print any other PII types not in our categories (like "other")
        categorized = {pii_type for pii_types in pii_categories.values() for pii_type in pii_types}
        other_pii = [pii_type for pii_type in pii_counts_private if pii_type not in categorized]
        if other_pii:
            print("\n  Other PII types:")
            for pii_type in other_pii:
                count = pii_counts_private.get(pii_type, 0)
                print(f"    - {pii_type}: {count} ({percent(count, len(private_with_pii)):.1f}%)")

    # Public records flagged with PII can only come from sessions recorded
    # before the workflow change; tell the reader to ignore them.
    if public_with_pii:
        print("\nNote: Public documents with PII flags found in old annotation results.")
        print("These are from annotation sessions before the workflow change and should be disregarded.")

    # Print detailed report for private documents with PII
    if private_with_pii:
        print("\nDetailed Report - Private Documents with PII:")
        print("-" * 80)
        for i, item in enumerate(private_with_pii, 1):
            pdf_path = item["pdf_path"]
            page_id = item["page_id"]
            # Page number inside the PDF, used as the URL fragment below
            pdf_page = item.get("pdf_page")

            # Generate presigned URL with PDF page number if client is available
            presigned_url = None
            if pdf_s3_client and pdf_path.startswith("s3://"):
                presigned_url = create_presigned_url(pdf_s3_client, pdf_path)
                if presigned_url and pdf_page is not None:
                    presigned_url += f"#page={pdf_page}"

            print(f"{i}. PDF: {pdf_path}")
            print(f"   Page ID: {page_id}")
            print(f"   Link: {item['link']}#{page_id}")
            if presigned_url:
                print(f"   Presigned URL: {presigned_url}")
            print(f"   PII Types: {', '.join(item['pii_types'])}")
            if item.get("description"):
                print(f"   Description: {item['description']}")
            if item.get("prolific_pid"):
                print(f"   Prolific PID: {item['prolific_pid']}")
            print("-" * 80)

    print("\nReport complete.")
def read_and_process_results(args):
    """Read tinyhost links from a results file, then report on their annotations.

    Reads one link per non-blank line from ``args.read_results``, fetches the
    annotations behind every link, prints a summary report and writes a
    detailed ``annotation_report.csv`` into ``args.output_dir``.

    Args:
        args: Parsed command-line namespace; reads ``read_results``,
            ``pdf_profile`` and ``output_dir``.

    Raises:
        Exception: Any error outside the per-link fetch is printed and
            re-raised. A failure to fetch a single link is printed and that
            link is skipped.
    """
    try:
        # Read the links file (one tinyhost link per line, blank lines ignored)
        with open(args.read_results, "r", encoding="utf-8") as f:
            links = [line.strip() for line in f if line.strip()]

        if not links:
            print(f"No tinyhost links found in {args.read_results}")
            return

        print(f"Found {len(links)} tinyhost links in {args.read_results}")

        # Set up PDF S3 client with profile if specified
        if args.pdf_profile:
            pdf_session = boto3.Session(profile_name=args.pdf_profile)
            pdf_s3_client = pdf_session.client("s3")
        else:
            pdf_s3_client = boto3.client("s3")

        # Fetch and process annotations
        annotations_by_link = []
        for link in tqdm(links, desc="Fetching annotations"):
            try:
                annotations, link_url, html_content = fetch_annotations(link)
                annotations_by_link.append((annotations, link_url, html_content))
            except Exception as e:
                # Best effort: one unreachable page must not abort the report.
                print(f"Error processing {link}: {e}")

        # Process and categorize annotations
        annotation_results = process_annotations(annotations_by_link)

        # Print report with presigned URLs
        print_annotation_report(annotation_results, pdf_s3_client)

        # Save detailed report to file
        output_file = Path(args.output_dir) / "annotation_report.csv"
        # main() returns before creating the output directory on this code
        # path, so make sure it exists before opening the report for writing.
        output_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"\nSaving detailed report to {output_file}")

        # Descriptions are free text typed by annotators; write UTF-8 so the
        # report does not depend on the platform's default encoding.
        with open(output_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Presigned URL", "Document Type", "PII Types", "Description", "Prolific PID"])

            for category, items in annotation_results.items():
                for item in items:
                    pdf_path = item["pdf_path"]

                    # Page number inside the PDF, used as the URL fragment
                    pdf_page = item.get("pdf_page")

                    # Generate presigned URL with the PDF page number
                    presigned_url = ""
                    if pdf_path.startswith("s3://"):
                        url = create_presigned_url(pdf_s3_client, pdf_path)
                        if url and pdf_page is not None:
                            presigned_url = f"{url}#page={pdf_page}"
                        elif url:
                            presigned_url = url

                    # Only public/private records carry PII columns
                    if category == "public_document":
                        doc_type = "Public"
                        pii_types = ", ".join(item.get("pii_types", []))
                        description = item.get("description", "")
                    elif category == "private_document":
                        doc_type = "Private"
                        pii_types = ", ".join(item.get("pii_types", []))
                        description = item.get("description", "")
                    else:
                        doc_type = ""
                        pii_types = ""
                        description = ""

                    # Extract Prolific PID from the item if available
                    prolific_pid = item.get("prolific_pid", "")

                    writer.writerow(
                        [
                            category,
                            pdf_path,
                            item["page_id"],
                            f"{item['link']}#{item['page_id']}",
                            presigned_url,
                            doc_type,
                            pii_types,
                            description,
                            prolific_pid,
                        ]
                    )

        print(f"Report saved to {output_file}")

    except Exception as e:
        print(f"Error processing results: {e}")
        raise
2025-04-04 19:44:54 +00:00
2025-04-22 21:33:52 +00:00
2025-03-26 18:26:06 +00:00
def main():
    """Command-line entry point.

    With ``--read_results`` the script only reports on previously collected
    annotations. Otherwise it generates ``args.repeats`` HTML sample files
    (in a thread pool when more than one is requested), uploads each file
    with tinyhost and writes the resulting links, one per line, to
    ``args.prolific_csv``.
    """
    args = parse_args()

    # Reporting mode: analyse earlier results and stop.
    if args.read_results:
        read_and_process_results(args)
        return

    # S3 clients; PDFs may live behind a different AWS profile.
    s3_client = boto3.client("s3")
    if args.pdf_profile:
        pdf_s3_client = boto3.Session(profile_name=args.pdf_profile).client("s3")
    else:
        pdf_s3_client = s3_client

    Path(args.output_dir).mkdir(exist_ok=True, parents=True)

    print(f"Listing result files in {args.workspace}/results...")
    result_files = list_result_files(s3_client, args.workspace)
    print(f"Found {len(result_files)} result files")

    generated = []
    if args.repeats <= 1:
        # A single sample set does not need a thread pool.
        generated.append(generate_sample_set(args, 0, s3_client, pdf_s3_client, result_files))
    else:
        worker_count = min(args.max_workers, args.repeats)
        print(f"Using ThreadPoolExecutor with {worker_count} workers")
        with ThreadPoolExecutor(max_workers=worker_count) as pool:
            pending = [pool.submit(generate_sample_set, args, index, s3_client, pdf_s3_client, result_files) for index in range(args.repeats)]

            # Collect in submission order; a failed set is reported and skipped.
            for job in pending:
                try:
                    html_path = job.result()
                    generated.append(html_path)
                    print(f"Completed generation of {html_path}")
                except Exception as err:
                    print(f"Error generating sample set: {err}")

    # Upload every generated file and remember its public link.
    print("Generated all files, uploading tinyhost links now")
    links = []
    for html_path in generated:
        link = tinyhost.tinyhost([str(html_path)])[0]
        links.append(link)
        print(link)

    # The "CSV" is just the tinyhost links, one per line.
    csv_path = args.prolific_csv
    print(f"Writing tinyhost links to {csv_path}")
    with open(csv_path, "w", newline="") as csvfile:
        csvfile.writelines(f"{link}\n" for link in links)

    print(f"Tinyhost links written to {csv_path}")
2025-03-26 18:26:06 +00:00
2025-03-26 18:49:48 +00:00
2025-03-26 18:26:06 +00:00
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()