import argparse
import json
import os
import random
import tempfile
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

import boto3
import pydantic
from openai import OpenAI
from tqdm import tqdm

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.s3_utils import get_s3_bytes, parse_s3_path

LanguageCode = Enum(
    "LanguageCode",
    {
        "en": "English",
        "zh": "Chinese",
        "hi": "Hindi",
        "es": "Spanish",
        "fr": "French",
        "ar": "Arabic",
        "bn": "Bengali",
        "ru": "Russian",
        "pt": "Portuguese",
        "ur": "Urdu",
        "id": "Indonesian",
        "de": "German",
        "ja": "Japanese",
        "sw": "Swahili",
        "mr": "Marathi",
        "te": "Telugu",
        "tr": "Turkish",
        "vi": "Vietnamese",
        "ta": "Tamil",
        "ko": "Korean",
        "other": "Other",
    },
)
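
# A quick reference for the functional Enum API used above: member names come
# from the dict keys and display values from the dict values, e.g.
#
#     >>> LanguageCode.en.value
#     'English'
#     >>> LanguageCode["zh"] is LanguageCode.zh
#     True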


class PIIAnnotation(pydantic.BaseModel):
    """Structured model for PII annotations returned by ChatGPT."""

    document_description: str
    language_code: LanguageCode
    cannot_read: bool
    inappropriate_content: bool
    is_public_document: bool

    # PII identifiers
    contains_names: bool
    contains_email_addresses: bool
    contains_phone_numbers: bool

    # PII that must co-occur with identifiers
    contains_addresses: bool
    contains_biographical_info: bool  # DOB, gender, etc.
    contains_location_info: bool
    contains_employment_info: bool
    contains_education_info: bool
    contains_medical_info: bool

    # Always sensitive PII
    contains_government_ids: bool
    contains_financial_info: bool
    contains_biometric_data: bool
    contains_login_info: bool
    other_pii: str

    @property
    def has_pii(self) -> bool:
        """Check if the document contains any PII."""
        pii_fields = [
            self.contains_names,
            self.contains_email_addresses,
            self.contains_phone_numbers,
            self.contains_addresses,
            self.contains_biographical_info,
            self.contains_location_info,
            self.contains_employment_info,
            self.contains_education_info,
            self.contains_medical_info,
            self.contains_government_ids,
            self.contains_financial_info,
            self.contains_biometric_data,
            self.contains_login_info,
        ]
        return any(pii_fields) or bool(self.other_pii.strip())

    def get_pii_types(self) -> List[str]:
        """Get a list of all PII types found in the document."""
        pii_types = []

        if self.contains_names:
            pii_types.append("names")
        if self.contains_email_addresses:
            pii_types.append("email")
        if self.contains_phone_numbers:
            pii_types.append("phone")
        if self.contains_addresses:
            pii_types.append("addresses")
        if self.contains_biographical_info:
            pii_types.append("biographical")
        if self.contains_location_info:
            pii_types.append("location")
        if self.contains_employment_info:
            pii_types.append("employment")
        if self.contains_education_info:
            pii_types.append("education")
        if self.contains_medical_info:
            pii_types.append("medical")
        if self.contains_government_ids:
            pii_types.append("government-id")
        if self.contains_financial_info:
            pii_types.append("financial")
        if self.contains_biometric_data:
            pii_types.append("biometric")
        if self.contains_login_info:
            pii_types.append("login-info")
        if self.other_pii.strip():
            pii_types.append("other")

        return pii_types
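
# Illustrative use of the helpers above (all field values here are made up):
#
#     >>> ann = PIIAnnotation(
#     ...     document_description="sample invoice",
#     ...     language_code=LanguageCode.en,
#     ...     cannot_read=False, inappropriate_content=False,
#     ...     is_public_document=False,
#     ...     contains_names=True, contains_email_addresses=False,
#     ...     contains_phone_numbers=False, contains_addresses=True,
#     ...     contains_biographical_info=False, contains_location_info=False,
#     ...     contains_employment_info=False, contains_education_info=False,
#     ...     contains_medical_info=False, contains_government_ids=False,
#     ...     contains_financial_info=False, contains_biometric_data=False,
#     ...     contains_login_info=False, other_pii="",
#     ... )
#     >>> ann.has_pii
#     True
#     >>> ann.get_pii_types()
#     ['names', 'addresses']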


def parse_args():
    parser = argparse.ArgumentParser(description="Automatically scan OLMO OCR workspace results using ChatGPT")
    parser.add_argument("workspace", help="OLMO OCR workspace path (s3://bucket/workspace)")
    parser.add_argument("--pages_per_run", type=int, default=30, help="Number of pages per run")
    parser.add_argument("--pdf_profile", help="AWS profile for accessing PDFs")
    parser.add_argument("--output_dir", default="dolma_samples", help="Directory to save output files")
    parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of worker threads")
    parser.add_argument("--openai_api_key", help="OpenAI API key (or set OPENAI_API_KEY env var)")
    parser.add_argument("--openai_model", default="gpt-4.1", help="OpenAI model to use")
    return parser.parse_args()
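
# Example invocation (script, bucket, and workspace names are hypothetical):
#
#     python autoscan_pii.py s3://my-bucket/olmocr-workspace \
#         --pages_per_run 30 --max_workers 4 --openai_model gpt-4.1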


def list_result_files(s3_client, workspace_path):
    """List all JSON result files in the workspace results directory."""
    bucket, prefix = parse_s3_path(workspace_path)
    results_prefix = os.path.join(prefix, "results").rstrip("/") + "/"
    all_files = []
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=results_prefix):
        if "Contents" in page:
            all_files.extend([f"s3://{bucket}/{obj['Key']}" for obj in page["Contents"] if obj["Key"].endswith(".jsonl") or obj["Key"].endswith(".json")])

        # if len(all_files) > 1000:
        #     break

    return all_files


def get_random_pages(s3_client, result_files, count=30):
    """Get random pages from the result files."""
    random_pages = []

    # Try to collect the requested number of pages
    attempts = 0
    max_attempts = count * 3  # Allow extra attempts to handle potential failures

    while len(random_pages) < count and attempts < max_attempts:
        attempts += 1

        # Pick a random result file
        if not result_files:
            print("No result files found!")
            break

        result_file = random.choice(result_files)

        try:
            # Get the content of the file
            content = get_s3_bytes(s3_client, result_file)
            lines = content.decode("utf-8").strip().split("\n")
            if not lines:
                continue

            # Pick a random line (which contains a complete document)
            line = random.choice(lines)
            doc = json.loads(line)

            # A Dolma document has "text", "metadata", and "attributes" fields
            if "text" not in doc or "metadata" not in doc or "attributes" not in doc:
                print(f"Document in {result_file} is not a valid Dolma document")
                continue

            # Get the original PDF path from metadata
            pdf_path = doc["metadata"].get("Source-File")
            if not pdf_path:
                continue

            # Get page spans from attributes
            page_spans = doc["attributes"].get("pdf_page_numbers", [])
            if not page_spans:
                continue

            # Pick a random page span
            page_span = random.choice(page_spans)
            if len(page_span) >= 3:
                # Page spans are [start_pos, end_pos, page_num]
                page_num = page_span[2]

                # Extract text for this page
                start_pos, end_pos = page_span[0], page_span[1]
                page_text = doc["text"][start_pos:end_pos].strip()

                # Include the text snippet with the page info
                random_pages.append((pdf_path, page_num, page_text, result_file))

                if len(random_pages) >= count:
                    break
        except Exception as e:
            print(f"Error processing {result_file}: {e}")
            continue

    print(f"Found {len(random_pages)} random pages from Dolma documents")
    return random_pages
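
# For reference, a minimal Dolma-style record of the shape this function
# expects (all values are illustrative only):
#
#     {
#       "text": "Page one text...Page two text...",
#       "metadata": {"Source-File": "s3://bucket/docs/example.pdf"},
#       "attributes": {"pdf_page_numbers": [[0, 16, 1], [16, 33, 2]]}
#     }
#
# where each span in "pdf_page_numbers" is [start_pos, end_pos, page_num]
# indexing into "text".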


def chatgpt_analyze_page(pdf_path: str, page_num: int, pdf_s3_client, openai_api_key: str, openai_model: str) -> Optional[PIIAnnotation]:
    """Analyze a page using the ChatGPT vision model with structured outputs."""
    try:
        # Download PDF to temp file and render to image
        bucket, key = parse_s3_path(pdf_path)
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
            pdf_data = pdf_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
            temp_file.write(pdf_data)
            temp_file_path = temp_file.name

        # Render PDF to base64 image
        base64_image = render_pdf_to_base64png(temp_file_path, page_num, target_longest_image_dim=2048)

        # Clean up temp file
        os.unlink(temp_file_path)

        # Create OpenAI client
        client = OpenAI(api_key=openai_api_key)

        # Prepare the user message with all instructions
        user_message = """
You are a document analyzer that identifies Personally Identifiable Information (PII) in documents.
Your task is to analyze the provided document image and determine:
1. Whether the document is intended for public release or dissemination (e.g., research paper, public report, etc.)
2. If the document contains any PII

For PII identification, follow these specific guidelines:

IDENTIFIERS FOR PII:
The following are considered identifiers that can make information PII:
- Names (full names, first names, last names, nicknames)
- Email addresses
- Phone numbers

PII THAT MUST CO-OCCUR WITH AN IDENTIFIER:
The following types of information should ONLY be marked as PII if they occur ALONGSIDE an identifier (commonly, a person's name):
- Addresses (street address, postal code, etc.)
- Biographical Information (date of birth, place of birth, gender, sexual orientation, race, ethnicity, citizenship/immigration status, religion)
- Location Information (geolocations, specific coordinates)
- Employment Information (job titles, workplace names, employment history)
- Education Information (school names, degrees, transcripts)
- Medical Information (health records, diagnoses, genetic or neural data)

PII THAT OCCURS EVEN WITHOUT AN IDENTIFIER:
The following should ALWAYS be marked as PII even if they do not occur alongside an identifier:
- Government IDs (Social Security Numbers, passport numbers, driver's license numbers, tax IDs)
- Financial Information (credit card numbers, bank account/routing numbers)
- Biometric Data (fingerprints, retina scans, facial recognition data, voice signatures)
- Login information (ONLY mark as PII when a username, password, and login location are present together)

If the document is a form, then only consider fields which are filled out with specific values as potential PII.
If this page does not itself contain PII, but references documents (such as curriculum vitae, personal statements) that typically contain PII, then do not mark it as PII.
Only consider actual occurrences of the PII within the document shown.
"""

        # Use the chat completions API with the custom schema; the SDK converts
        # the pydantic model into a JSON schema and validates the response
        completion = client.beta.chat.completions.parse(
            model=openai_model,
            messages=[
                {
                    "role": "user",
                    # render_pdf_to_base64png returns PNG data, so declare image/png
                    "content": [{"type": "text", "text": user_message}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}],
                }
            ],
            response_format=PIIAnnotation,
            max_tokens=1000,
        )

        return completion.choices[0].message.parsed
    except Exception as e:
        print(f"Error analyzing page {pdf_path} (page {page_num}): {e}")
        return None


def create_presigned_url(s3_client, pdf_path, expiration=3600 * 24 * 7):
    """Create a presigned URL for the given S3 path."""
    try:
        bucket, key = parse_s3_path(pdf_path)
        url = s3_client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": key}, ExpiresIn=expiration)
        return url
    except Exception as e:
        print(f"Error creating presigned URL for {pdf_path}: {e}")
        return None
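
# The URLs produced above are ordinary S3 GET links; process_pages later
# appends a "#page=N" fragment so PDF viewers jump straight to the sampled
# page, e.g. (hypothetical values):
#
#     https://my-bucket.s3.amazonaws.com/docs/example.pdf?X-Amz-Signature=...#page=3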


def process_pages(random_pages, pdf_s3_client, openai_api_key, openai_model, max_workers):
    """Process multiple pages in parallel using ThreadPoolExecutor."""
    results = []

    # First generate presigned URLs for all PDFs
    print("Generating presigned URLs for PDFs...")
    presigned_urls = {}
    for pdf_path, page_num, _, _ in random_pages:
        if pdf_path not in presigned_urls and pdf_path.startswith("s3://"):
            url = create_presigned_url(pdf_s3_client, pdf_path)
            if url:
                presigned_urls[pdf_path] = url

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}

        # Submit all tasks
        for pdf_path, page_num, page_text, result_file in tqdm(random_pages, desc="Submitting pages for analysis"):
            future = executor.submit(chatgpt_analyze_page, pdf_path, page_num, pdf_s3_client, openai_api_key, openai_model)
            futures[future] = (pdf_path, page_num, page_text, result_file)

        # Process results as they complete
        for future in tqdm(futures, desc="Processing results"):
            pdf_path, page_num, page_text, result_file = futures[future]
            try:
                annotation = future.result()
                if annotation:
                    # Get presigned URL with page number
                    presigned_url = None
                    if pdf_path in presigned_urls:
                        presigned_url = f"{presigned_urls[pdf_path]}#page={page_num}"

                    results.append((pdf_path, page_num, page_text, result_file, annotation, presigned_url))
                else:
                    print(f"Failed to get annotation for {pdf_path} (page {page_num})")
            except Exception as e:
                print(f"Error processing {pdf_path} (page {page_num}): {e}")

    return results


def categorize_results(all_results):
    """Categorize results for reporting."""
    categories = {
        "public_document": [],
        "private_document": [],
        "cannot_read": [],
        "report_content": [],
        "no_annotation": [],
    }

    for pdf_path, page_num, page_text, result_file, annotation, presigned_url in all_results:
        if annotation.cannot_read or annotation.language_code != LanguageCode.en:
            categories["cannot_read"].append({"pdf_path": pdf_path, "pdf_page": page_num, "result_file": result_file, "presigned_url": presigned_url})
        elif annotation.inappropriate_content:
            categories["report_content"].append({"pdf_path": pdf_path, "pdf_page": page_num, "result_file": result_file, "presigned_url": presigned_url})
        elif annotation.is_public_document:
            categories["public_document"].append(
                {
                    "pdf_path": pdf_path,
                    "pdf_page": page_num,
                    "result_file": result_file,
                    "pii_types": annotation.get_pii_types(),
                    "has_pii": annotation.has_pii,
                    "description": annotation.other_pii,
                    "presigned_url": presigned_url,
                }
            )
        else:
            # Private document
            categories["private_document"].append(
                {
                    "pdf_path": pdf_path,
                    "pdf_page": page_num,
                    "result_file": result_file,
                    "pii_types": annotation.get_pii_types(),
                    "has_pii": annotation.has_pii,
                    "description": annotation.other_pii,
                    "presigned_url": presigned_url,
                }
            )

    return categories


def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]]):
    """Print a summary report of annotations."""
    total_pages = sum(len(items) for items in annotation_results.values())
    if total_pages == 0:
        # Guard against division by zero when no pages were annotated
        print("\nNo annotated pages to report.")
        return

    print("\n" + "=" * 80)
    print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
    print("=" * 80)

    # Count pages with PII in public documents
    public_with_pii = [page for page in annotation_results["public_document"] if page.get("has_pii", False)]
    public_without_pii = [page for page in annotation_results["public_document"] if not page.get("has_pii", False)]

    # Count pages with PII in private documents
    private_with_pii = [page for page in annotation_results["private_document"] if page.get("has_pii", False)]
    private_without_pii = [page for page in annotation_results["private_document"] if not page.get("has_pii", False)]

    # Print summary statistics
    print("\nSummary:")
    print(
        f"Public documents (total): {len(annotation_results['public_document'])} ({len(annotation_results['public_document']) / total_pages * 100:.1f}% of all pages)"
    )
    print(f" - With PII: {len(public_with_pii)} ({len(public_with_pii) / max(1, len(annotation_results['public_document'])) * 100:.1f}% of public docs)")
    print(
        f" - Without PII: {len(public_without_pii)} ({len(public_without_pii) / max(1, len(annotation_results['public_document'])) * 100:.1f}% of public docs)"
    )
    print(
        f"Private documents (total): {len(annotation_results['private_document'])} ({len(annotation_results['private_document']) / total_pages * 100:.1f}% of all pages)"
    )
    print(f" - With PII: {len(private_with_pii)} ({len(private_with_pii) / max(1, len(annotation_results['private_document'])) * 100:.1f}% of private docs)")
    print(
        f" - Without PII: {len(private_without_pii)} ({len(private_without_pii) / max(1, len(annotation_results['private_document'])) * 100:.1f}% of private docs)"
    )
    print(f"Unreadable pages: {len(annotation_results['cannot_read'])} ({len(annotation_results['cannot_read']) / total_pages * 100:.1f}%)")
    print(f"Pages with reported content: {len(annotation_results['report_content'])} ({len(annotation_results['report_content']) / total_pages * 100:.1f}%)")
    print(f"Pages without annotation: {len(annotation_results['no_annotation'])} ({len(annotation_results['no_annotation']) / total_pages * 100:.1f}%)")

    # Analyze PII types in private documents
    if private_with_pii:
        # Categorize the PII types for clearer reporting
        pii_categories = {
            "Identifiers": ["names", "email", "phone"],
            "PII requiring identifiers": ["addresses", "biographical", "location", "employment", "education", "medical"],
            "Always sensitive PII": ["government-id", "financial", "biometric", "login-info"],
        }

        # Dictionary to track all PII counts
        pii_counts_private = {}
        for page in private_with_pii:
            for pii_type in page.get("pii_types", []):
                pii_counts_private[pii_type] = pii_counts_private.get(pii_type, 0) + 1

        # Print categorized PII counts
        print("\nPII Types in Private Documents:")
        for category, pii_types in pii_categories.items():
            print(f"\n{category}:")
            for pii_type in pii_types:
                count = pii_counts_private.get(pii_type, 0)
                if count > 0:
                    print(f" - {pii_type}: {count} ({count / len(private_with_pii) * 100:.1f}%)")

        # Print any other PII types not in our categories (like "other")
        other_pii = [pii_type for pii_type in pii_counts_private.keys() if not any(pii_type in types for types in pii_categories.values())]
        if other_pii:
            print("\nOther PII types:")
            for pii_type in other_pii:
                count = pii_counts_private.get(pii_type, 0)
                print(f" - {pii_type}: {count} ({count / len(private_with_pii) * 100:.1f}%)")

    # Print detailed report for private documents with PII
    if private_with_pii:
        print("\nDetailed Report - Private Documents with PII:")
        print("-" * 80)
        for i, item in enumerate(private_with_pii, 1):
            pdf_path = item["pdf_path"]
            pdf_page = item["pdf_page"]
            presigned_url = item.get("presigned_url")
            print(f"{i}. PDF: {pdf_path}")
            print(f"   Page: {pdf_page}")
            if presigned_url:
                print(f"   Presigned URL: {presigned_url}")
            print(f"   PII Types: {', '.join(item['pii_types'])}")
            if item.get("description"):
                print(f"   Description: {item['description']}")
            print("-" * 80)

    # Print links to unreadable pages
    # if annotation_results["cannot_read"]:
    #     print("\nUnreadable Pages:")
    #     print("-" * 80)
    #     for i, item in enumerate(annotation_results["cannot_read"], 1):
    #         pdf_path = item["pdf_path"]
    #         pdf_page = item["pdf_page"]
    #         presigned_url = item.get("presigned_url")
    #         print(f"{i}. PDF: {pdf_path}")
    #         print(f"   Page: {pdf_page}")
    #         if presigned_url:
    #             print(f"   Presigned URL: {presigned_url}")
    #         print("-" * 80)

    # Print links to inappropriate content
    if annotation_results["report_content"]:
        print("\nReported Content:")
        print("-" * 80)
        for i, item in enumerate(annotation_results["report_content"], 1):
            pdf_path = item["pdf_path"]
            pdf_page = item["pdf_page"]
            presigned_url = item.get("presigned_url")
            print(f"{i}. PDF: {pdf_path}")
            print(f"   Page: {pdf_page}")
            if presigned_url:
                print(f"   Presigned URL: {presigned_url}")
            print("-" * 80)

    print("\nReport complete.")


def save_results(results, output_dir):
    """Save the results to a JSON file."""
    output_path = Path(output_dir) / "autoscan_results.json"

    # Convert results to serializable format
    serializable_results = []
    for pdf_path, page_num, page_text, result_file, annotation, presigned_url in results:
        serializable_results.append(
            {
                "pdf_path": pdf_path,
                "page_num": page_num,
                "page_text": page_text,
                "result_file": result_file,
                # .dict() is the pydantic v1-style export; on pydantic v2 it
                # still works but model_dump() is the preferred spelling
                "annotation": annotation.dict(),
                "presigned_url": presigned_url,
            }
        )

    with open(output_path, "w") as f:
        # The default hook serializes LanguageCode enum members by their value
        json.dump(serializable_results, f, indent=2, default=lambda o: o.value if isinstance(o, Enum) else o)

    print(f"Results saved to {output_path}")


def main():
    args = parse_args()

    # Get OpenAI API key from args or environment
    openai_api_key = args.openai_api_key or os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise ValueError("OpenAI API key must be provided via --openai_api_key or OPENAI_API_KEY environment variable")

    # Set up S3 clients
    s3_client = boto3.client("s3")

    # Set up PDF S3 client with profile if specified
    if args.pdf_profile:
        pdf_session = boto3.Session(profile_name=args.pdf_profile)
        pdf_s3_client = pdf_session.client("s3")
    else:
        pdf_s3_client = s3_client

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # List all result files
    print(f"Listing result files in {args.workspace}/results...")
    result_files = list_result_files(s3_client, args.workspace)
    print(f"Found {len(result_files)} result files")

    # Get random pages
    random_pages = get_random_pages(s3_client, result_files, args.pages_per_run)

    # Process pages with ChatGPT
    print(f"Processing {len(random_pages)} pages with ChatGPT...")
    all_results = process_pages(random_pages, pdf_s3_client, openai_api_key, args.openai_model, args.max_workers)

    # Save results
    save_results(all_results, args.output_dir)

    # Categorize and report results
    categorized_results = categorize_results(all_results)
    print_annotation_report(categorized_results)


if __name__ == "__main__":
    main()