2025-01-29 15:25:10 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								import  argparse  
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								import  glob  
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:25:10 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								import  json  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  os  
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								import  random  
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:47:57 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  concurrent . futures  import  ProcessPoolExecutor ,  as_completed  
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:25:10 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  typing  import  Generator  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								from  urllib . parse  import  urlparse  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								import  boto3  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								from  pypdf  import  PdfReader  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								from  tqdm  import  tqdm  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-27 18:30:41 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  olmocr . data . renderpdf  import  render_pdf_to_base64png  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								from  olmocr . filter  import  PdfFilter  
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:25:10 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								from  olmocr . prompts  import  (  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    build_openai_silver_data_prompt , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    openai_response_format_schema , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								)  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								from  olmocr . prompts . anchor  import  get_anchor_text  
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Passed as the third argument to render_pdf_to_base64png in build_page_query
								# (presumably the target pixel dimension of the rendered page image -- TODO confirm).
								TARGET_IMAGE_DIM  =  2048  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Module-level PdfFilter instance; process_pdf calls pdf_filter.filter_out_pdf(...)
								# on each local PDF and skips the document when it returns true (unless no_filter is set).
								pdf_filter  =  PdfFilter ( )  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def  build_page_query ( local_pdf_path :  str ,  pretty_pdf_path :  str ,  page :  int )  - >  dict :  
						 
					
						
							
								
									
										
										
										
											2024-10-09 16:57:13 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    image_base64  =  render_pdf_to_base64png ( local_pdf_path ,  page ,  TARGET_IMAGE_DIM ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 17:29:44 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    anchor_text  =  get_anchor_text ( local_pdf_path ,  page ,  pdf_engine = " pdfreport " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 22:17:15 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Construct OpenAI Batch API request format# 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # There are a few tricks to know when doing data processing with OpenAI's apis 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # First off, use the batch query system, it's 1/2 the price and exactly the same performance 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Second off, use structured outputs. If your application is not an actual chatbot, use structured outputs! 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Even if the last 10 queries you ran with the regular chat api returned exactly what you wanted without extra "LLM fluff text", that doesn't mean this will hold across 1000's of queries 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Also, structured outputs let you cheat, because the order in which fields are in your schema, is the order in which the model will answer them, so you can have it answer some "preperatory" or "chain of thought" style questions first before going into the meat of your response, which is going to give better answers 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Check your prompt for typos, it makes a performance difference! 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Ask for logprobs, it's not any more expensive and you can use them later to help identify problematic responses 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    return  { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " custom_id " :  f " { pretty_pdf_path } - { page } " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " method " :  " POST " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " url " :  " /v1/chat/completions " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " body " :  { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            " model " :  " gpt-4o-2024-08-06 " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            " messages " :  [ 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    " role " :  " user " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    " content " :  [ 
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 17:29:44 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                        { " type " :  " text " ,  " text " :  build_openai_silver_data_prompt ( anchor_text ) } , 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                        { " type " :  " image_url " ,  " image_url " :  { " url " :  f " data:image/png;base64, { image_base64 } " } } , 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								                    ] , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                } 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ] , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            " temperature " :  0.1 , 
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 19:54:54 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            " max_tokens " :  6000 , 
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 18:10:47 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            " logprobs " :  True , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            " top_logprobs " :  5 , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            " response_format " :  openai_response_format_schema ( ) , 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        } , 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    } 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def  sample_pdf_pages ( num_pages :  int ,  first_n_pages :  int ,  max_sample_pages :  int )  - >  list :  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    if  num_pages  < =  first_n_pages : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        return  list ( range ( 1 ,  num_pages  +  1 ) )   # Return all pages if fewer than first_n_pages 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    sample_pages  =  list ( range ( 1 ,  first_n_pages  +  1 ) )   # Always get the first_n_pages 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    remaining_pages  =  list ( range ( first_n_pages  +  1 ,  num_pages  +  1 ) ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    if  remaining_pages : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        sample_pages  + =  random . sample ( remaining_pages ,  min ( max_sample_pages  -  first_n_pages ,  len ( remaining_pages ) ) ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    return  sample_pages 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def  fetch_s3_file ( s3_url :  str ,  local_path :  str )  - >  str :  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    parsed  =  urlparse ( s3_url ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    bucket_name  =  parsed . netloc 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    key  =  parsed . path . lstrip ( " / " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    s3  =  boto3 . client ( " s3 " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    s3 . download_file ( bucket_name ,  key ,  local_path ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    return  local_path 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 15:46:12 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def  process_pdf ( pdf_path :  str ,  first_n_pages :  int ,  max_sample_pages :  int ,  no_filter :  bool )  - >  Generator [ dict ,  None ,  None ] :  
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    if  pdf_path . startswith ( " s3:// " ) : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        local_pdf_path  =  os . path . join ( " /tmp " ,  os . path . basename ( pdf_path ) ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        fetch_s3_file ( pdf_path ,  local_pdf_path ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    else : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        local_pdf_path  =  pdf_path 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 15:46:12 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  ( not  no_filter )  and  pdf_filter . filter_out_pdf ( local_pdf_path ) : 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								        print ( f " Skipping  { local_pdf_path }  due to common filter " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        return  [ ] 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    pretty_pdf_path  =  pdf_path 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    pdf  =  PdfReader ( local_pdf_path ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    num_pages  =  len ( pdf . pages ) 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    sample_pages  =  sample_pdf_pages ( num_pages ,  first_n_pages ,  max_sample_pages ) 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    result  =  [ ] 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    for  page  in  sample_pages : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        try : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            query  =  build_page_query ( local_pdf_path ,  pretty_pdf_path ,  page ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            result . append ( query ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        except  Exception  as  e : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            print ( f " Error processing page  { page }  of  { pdf_path } :  { e } " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    return  result 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def  main ( ) :  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    parser  =  argparse . ArgumentParser ( description = " Sample PDFs and create requests for GPT-4o. " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    parser . add_argument ( " --glob_path " ,  type = str ,  help = " Local or S3 path glob (e.g., *.pdf or s3://bucket/pdfs/*.pdf). " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    parser . add_argument ( " --path_list " ,  type = str ,  help = " Path to a file containing paths to PDFs, one per line. " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 15:46:12 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    parser . add_argument ( " --no_filter " ,  action = " store_true " ,  help = " Disables the basic spam/language filtering so that ALL pdfs listed are used " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    parser . add_argument ( " --num_sample_docs " ,  type = int ,  default = 5000 ,  help = " Number of PDF documents to sample. " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 21:19:24 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    parser . add_argument ( " --first_n_pages " ,  type = int ,  default = 0 ,  help = " Always sample the first N pages of each PDF. " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    parser . add_argument ( " --max_sample_pages " ,  type = int ,  default = 15 ,  help = " Max number of pages to sample per PDF. " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    parser . add_argument ( " --output " ,  type = str ,  default = " openai_batch_data " ,  help = " Output destination " ) 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    parser . add_argument ( " --reservoir_size " ,  type = int ,  default = None ,  help = " Size of the reservoir for sampling paths. Defaults to 10x num_sample_docs. " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    args  =  parser . parse_args ( ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Set default reservoir_size if not provided 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    if  args . reservoir_size  is  None : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        args . reservoir_size  =  10  *  args . num_sample_docs 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Initialize reservoir sampling variables 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    pdf_paths  =  [ ] 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    n  =  0   # Total number of items seen 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Load PDF paths from glob or path_list using reservoir sampling 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    if  args . glob_path : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        if  args . glob_path . startswith ( " s3:// " ) : 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            # Handle S3 globbing using boto3 with pagination 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								            parsed  =  urlparse ( args . glob_path ) 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            s3  =  boto3 . client ( " s3 " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								            bucket_name  =  parsed . netloc 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            prefix  =  os . path . dirname ( parsed . path . lstrip ( " / " ) )  +  " / " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            paginator  =  s3 . get_paginator ( " list_objects_v2 " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            page_iterator  =  paginator . paginate ( Bucket = bucket_name ,  Prefix = prefix ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            for  page  in  page_iterator : 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                for  obj  in  page . get ( " Contents " ,  [ ] ) : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    if  obj [ " Key " ] . endswith ( " .pdf " ) : 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                        n  + =  1 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        path  =  f " s3:// { bucket_name } / { obj [ ' Key ' ] } " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        if  len ( pdf_paths )  <  args . reservoir_size : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            pdf_paths . append ( path ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        else : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            s  =  random . randint ( 1 ,  n ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            if  s  < =  args . reservoir_size : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                                pdf_paths [ s  -  1 ]  =  path 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								        else : 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            # Handle local globbing using glob.iglob() 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            for  path  in  glob . iglob ( args . glob_path ,  recursive = True ) : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                n  + =  1 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                if  len ( pdf_paths )  <  args . reservoir_size : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    pdf_paths . append ( path ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                else : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    s  =  random . randint ( 1 ,  n ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    if  s  < =  args . reservoir_size : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        pdf_paths [ s  -  1 ]  =  path 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    elif  args . path_list : 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        with  open ( args . path_list ,  " r " )  as  f : 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            for  line  in  f : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                n  + =  1 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                path  =  line . strip ( ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                if  len ( pdf_paths )  <  args . reservoir_size : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    pdf_paths . append ( path ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                else : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    s  =  random . randint ( 1 ,  n ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    if  s  < =  args . reservoir_size : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        pdf_paths [ s  -  1 ]  =  path 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Shuffle the reservoir 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    random . shuffle ( pdf_paths ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    print ( f " Loaded and shuffled  { len ( pdf_paths ) }  paths to use. " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Set up the numbered, size-capped JSONL output files
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    cur_file_num  =  0 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    output_dir  =  args . output 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    max_file_size  =  99  *  1024  *  1024   # 99MB in bytes 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    cur_file_size  =  0 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    cur_file_path  =  os . path . join ( output_dir ,  f " output_ { cur_file_num } .jsonl " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Ensure output directory exists 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    os . makedirs ( output_dir ,  exist_ok = True ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Open the first file for writing 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    cur_file  =  open ( cur_file_path ,  " w " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Counter to track PDFs that produce at least one output 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    pdfs_with_output  =  0 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-30 18:41:18 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Use a ProcessPoolExecutor to process PDFs concurrently
							 
						 
					
						
							
								
									
										
										
										
											2024-10-05 04:04:45 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    with  ProcessPoolExecutor ( )  as  executor : 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								        futures  =  [ ] 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        with  tqdm ( desc = " Processing PDFs " ,  leave = False ,  total = args . num_sample_docs )  as  pb : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            for  pdf_path  in  pdf_paths : 
							 
						 
					
						
							
								
									
										
										
										
											2024-10-02 15:46:12 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                futures . append ( executor . submit ( process_pdf ,  pdf_path ,  args . first_n_pages ,  args . max_sample_pages ,  args . no_filter ) ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            for  future  in  as_completed ( futures ) : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                has_output  =  False   # Track if the current PDF produces at least one request 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                try : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    request_results  =  future . result ( )   # Get the result from the thread 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    for  request_obj  in  request_results : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        request_json  =  json . dumps ( request_obj ) 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                        request_size  =  len ( request_json . encode ( " utf-8 " ) )   # Calculate size in bytes 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        # Check if the current request can fit in the current file 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        if  cur_file_size  +  request_size  >  max_file_size : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            # Close the current file and create a new one 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            cur_file . close ( ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            cur_file_num  + =  1 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            cur_file_path  =  os . path . join ( output_dir ,  f " output_ { cur_file_num } .jsonl " ) 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                            cur_file  =  open ( cur_file_path ,  " w " ) 
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								                            cur_file_size  =  0   # Reset file size 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        # Write the JSON entry to the file 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        cur_file . write ( request_json ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        cur_file . write ( " \n " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        cur_file_size  + =  request_size 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        has_output  =  True   # At least one request object was generated 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    if  has_output : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        pdfs_with_output  + =  1 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        pb . update ( 1 ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                        if  pdfs_with_output  > =  args . num_sample_docs : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            executor . shutdown ( cancel_futures = True ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                            break 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                except  Exception  as  e : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    print ( f " Error processing  { pdf_path } :  { str ( e ) } " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Close the last open file 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    cur_file . close ( ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Print or log the number of PDFs that resulted in at least one output 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    print ( f " Number of sampled PDFs that produced at least one output:  { pdfs_with_output } " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-09-23 17:20:18 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								# Script entry point: call main() only when this file is executed
								# directly, not when it is imported as a module.
								if __name__ == "__main__":
								    main()