2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								# Sends list of batch files to OpenAI for processing  
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								# However, it also waits and gets the files when they are done, saves its state, and  
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								# allows you to submit more than the 100GB file storage limit that the OpenAI API has
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:25:10 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								import  argparse  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  datetime  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  json  
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								import  os  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  time  
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:25:10 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								from  openai  import  OpenAI  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								from  tqdm  import  tqdm  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Set up OpenAI client (the API key is read from the OPENAI_API_KEY environment variable)
								client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								MAX_OPENAI_DISK_SPACE = 100 * 1024 * 1024 * 1024  # Max is 100GB on OpenAI
								# Name of the JSON state file that is read from / written to the folder being processed
								UPLOAD_STATE_FILENAME = "SENDSILVER_DATA"
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								# Function to upload a file to OpenAI and start batch processing  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								def upload_and_start_batch(file_path):
								    """Upload a batch input file to OpenAI and create a batch job for it.

								    Args:
								        file_path: Path of the local file to upload with purpose ``"batch"``.

								    Returns:
								        The id of the batch job created for the uploaded file.
								    """
								    # Upload the file to OpenAI (binary mode: the bytes are sent as-is)
								    with open(file_path, "rb") as file:
								        print(f"Uploading {file_path} to OpenAI Batch API...")
								        upload_response = client.files.create(file=file, purpose="batch")
								        file_id = upload_response.id
								        print(f"File uploaded successfully: {file_id}")

								    # Create a batch job that runs the uploaded requests against chat completions
								    print(f"Creating batch job for {file_path}...")
								    batch_response = client.batches.create(
								        input_file_id=file_id,
								        endpoint="/v1/chat/completions",
								        completion_window="24h",
								        metadata={"description": "pdf gold/silver data"},
								    )

								    batch_id = batch_response.id
								    print(f"Batch created successfully: {batch_id}")
								    return batch_id
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def download_batch_result(batch_id, output_folder):
								    """Download the output file of a completed batch into ``output_folder``.

								    Args:
								        batch_id: Id of the batch to retrieve.
								        output_folder: Folder in which ``<batch_id>.json`` is written.

								    Returns:
								        ``(batch_id, True)`` when the output was written, ``(batch_id, False)``
								        when the batch is not completed or has no output file.
								    """
								    # Retrieve the batch result from OpenAI API
								    batch_data = client.batches.retrieve(batch_id)

								    if batch_data.status != "completed":
								        print(f"WARNING: {batch_id} is not completed, status: {batch_data.status}")
								        return batch_id, False

								    if batch_data.output_file_id is None:
								        print(f"WARNING: {batch_id} is completed, but no output file was generated")
								        return batch_id, False

								    print(f"Downloading batch data for {batch_id}")

								    file_response = client.files.content(batch_data.output_file_id)

								    # Define output file path
								    output_file = os.path.join(output_folder, f"{batch_id}.json")

								    # Save the result to a file. The encoding is pinned to UTF-8 so that
								    # non-ASCII text in the response cannot fail under a non-UTF-8 locale.
								    with open(output_file, "w", encoding="utf-8") as f:
								        f.write(str(file_response.text))

								    return batch_id, True
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Names used for the per-file "state" field of the upload state
								ALL_STATES = ["init", "processing", "completed", "errored_out", "could_not_upload"]
								# Files in one of these states are no longer returned as work items
								FINISHED_STATES = ["completed", "errored_out"]
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:35:52 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def _json_datetime_decoder(obj):
								    """``json`` object hook that parses the ``last_checked`` field into a datetime.

								    Args:
								        obj: A dict decoded by ``json``; it is modified in place.

								    Returns:
								        The same dict, with ``obj["last_checked"]`` replaced by a
								        ``datetime.datetime`` when it holds a valid ISO-format string.
								    """
								    if "last_checked" in obj:
								        try:
								            obj["last_checked"] = datetime.datetime.fromisoformat(obj["last_checked"])
								        except (TypeError, ValueError):
								            pass  # If it's not a valid ISO format, leave it as is
								    return obj
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 16:10:19 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def _json_datetime_encoder(obj):
								    """``json`` ``default=`` hook that serializes datetimes as ISO-format strings.

								    Args:
								        obj: A value the ``json`` encoder could not serialize on its own.

								    Returns:
								        ``obj.isoformat()`` when ``obj`` is a ``datetime.datetime``.

								    Raises:
								        TypeError: For any other type, mirroring the stock ``json`` error.
								    """
								    if isinstance(obj, datetime.datetime):
								        return obj.isoformat()  # Convert datetime to ISO format string
								    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def get_state(folder_path: str) -> dict:
								    """Load the upload state for ``folder_path``, creating it when needed.

								    The state file is ``UPLOAD_STATE_FILENAME`` inside ``folder_path``. When it
								    is missing or is not valid JSON, a fresh state is built with one entry per
								    ``.jsonl`` file in the folder (state ``"init"``, no batch id), written to
								    the state file, and returned.

								    Args:
								        folder_path: Folder holding the ``.jsonl`` files and the state file.

								    Returns:
								        A dict mapping each filename to its state entry.

								    Raises:
								        Exception: If a fresh state must be built and the folder contains no
								            ``.jsonl`` files.
								    """
								    state_file = os.path.join(folder_path, UPLOAD_STATE_FILENAME)

								    try:
								        with open(state_file, "r") as f:
								            return json.load(f, object_hook=_json_datetime_decoder)
								    except (json.decoder.JSONDecodeError, FileNotFoundError):
								        # List all .jsonl files in the specified folder
								        jsonl_files = [name for name in os.listdir(folder_path) if name.endswith(".jsonl")]

								        if not jsonl_files:
								            raise Exception("No JSONL files found to process")

								        state = {
								            name: {
								                "filename": name,
								                "batch_id": None,
								                "state": "init",
								                "size": os.path.getsize(os.path.join(folder_path, name)),
								                "last_checked": datetime.datetime.now(),
								            }
								            for name in jsonl_files
								        }

								        with open(state_file, "w") as f:
								            json.dump(state, f, default=_json_datetime_encoder)

								        return state
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def update_state(folder_path: str, filename: str, **kwargs):
								    """Update one file's state entry and persist the whole state.

								    Args:
								        folder_path: Folder whose state file is read and rewritten.
								        filename: Key of the entry to update.
								        **kwargs: Fields to set on that entry.

								    Returns:
								        The full state dict after the update.

								    Raises:
								        KeyError: If ``filename`` has no entry in the state.
								    """
								    all_state = get_state(folder_path)
								    entry = all_state[filename]
								    entry.update(kwargs)
								    # Stamped after the kwargs so a caller-supplied "last_checked" is overridden
								    entry["last_checked"] = datetime.datetime.now()

								    state_file = os.path.join(folder_path, UPLOAD_STATE_FILENAME)
								    temp_file = state_file + ".tmp"

								    # Write to temporary file first, forcing the data to disk before the swap
								    with open(temp_file, "w") as f:
								        json.dump(all_state, f, default=_json_datetime_encoder)
								        f.flush()
								        os.fsync(f.fileno())

								    # Atomic rename of temporary file to target file
								    os.replace(temp_file, state_file)

								    return all_state
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def get_total_space_usage():
								    """Return the sum of ``bytes`` over every file yielded by ``client.files.list()``."""
								    total_bytes = 0
								    for remote_file in client.files.list():
								        total_bytes += remote_file.bytes
								    return total_bytes
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def get_estimated_space_usage(folder_path):
								    """Return the summed ``size`` of all state entries currently in ``"processing"``.

								    Args:
								        folder_path: Folder whose state is loaded via ``get_state``.

								    Returns:
								        Total of the recorded sizes, or 0 when no entry is processing.
								    """
								    all_states = get_state(folder_path)
								    return sum(s["size"] for s in all_states.values() if s["state"] == "processing")
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								def get_next_work_item(folder_path):
								    """Return the unfinished state record that was checked least recently.

								    Args:
								        folder_path: Folder whose per-file state records are loaded via ``get_state``.

								    Returns:
								        The state record (not in ``FINISHED_STATES``) with the smallest
								        ``last_checked`` value, or ``None`` when every record is finished.
								    """
								    pending = [s for s in get_state(folder_path).values() if s["state"] not in FINISHED_STATES]
								    # min() returns the first minimal element, matching a stable sort followed by [0];
								    # default=None covers the "nothing left to do" case.
								    return min(pending, key=lambda s: s["last_checked"], default=None)
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 16:05:08 +00:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def get_done_total(folder_path):
								    """Count tracked items by progress.

								    Args:
								        folder_path: Folder whose per-file state records are loaded via ``get_state``.

								    Returns:
								        A ``(processing, done, total)`` tuple: the number of records in the
								        ``"processing"`` state, the number whose state is in ``FINISHED_STATES``,
								        and the number of records overall.
								    """
								    processing, done, total = 0, 0, 0

								    for state in get_state(folder_path).values():
								        if state["state"] in FINISHED_STATES:
								            done += 1
								        if state["state"] == "processing":
								            processing += 1
								        total += 1

								    return processing, done, total
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								# Main function to process all .jsonl files in a folder  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								def process_folder(folder_path: str, max_gb: int):
								    """Loop over the tracked items of ``folder_path`` until all are in a finished state.

								    Each iteration picks the least recently checked unfinished item. Items in
								    the ``init`` state are uploaded and started as a batch (as long as the
								    size of items already processing is below ``max_gb``); items in the
								    ``processing`` state are polled, and on completion their results are
								    downloaded into ``<folder_path>_done`` and the remote input/output files
								    are deleted.

								    Args:
								        folder_path: Folder whose state is tracked via ``get_state`` / ``update_state``.
								        max_gb: Upper bound, in GiB (1024**3 bytes), on the combined size of
								            items allowed to be in the ``processing`` state at once.

								    Raises:
								        ValueError: If the free remote storage (``MAX_OPENAI_DISK_SPACE`` minus
								            current usage) is less than twice ``max_gb``.
								    """
								    output_folder = f"{folder_path.rstrip('/')}_done"
								    os.makedirs(output_folder, exist_ok=True)
								    last_loop_time = datetime.datetime.now()

								    bytes_per_gb = 1024**3
								    max_bytes = max_gb * bytes_per_gb

								    starting_free_space = MAX_OPENAI_DISK_SPACE - get_total_space_usage()

								    if starting_free_space < max_bytes * 2:
								        # starting_free_space is a byte count; convert it so the "GB" label is truthful.
								        raise ValueError(
								            f"Insufficient free space in OpenAI's file storage: Only {starting_free_space / bytes_per_gb:.2f} GB left, "
								            f"but 2x{max_gb} GB are required (1x for your uploads, 1x for your results)."
								        )

								    while not all(state["state"] in FINISHED_STATES for state in get_state(folder_path).values()):
								        processing, done, total = get_done_total(folder_path)
								        print(f"Total items {total}, processing {processing}, done {done}, {done/total*100:.1f}%")

								        work_item = get_next_work_item(folder_path)
								        print(f"Processing {os.path.basename(work_item['filename'])}, cur status = {work_item['state']}")

								        # Throttle: if the previous iteration finished less than a second ago, pause briefly.
								        if last_loop_time > datetime.datetime.now() - datetime.timedelta(seconds=1):
								            time.sleep(0.2)

								        if work_item["state"] == "init":
								            if get_estimated_space_usage(folder_path) < max_bytes:
								                try:
								                    batch_id = upload_and_start_batch(os.path.join(folder_path, work_item["filename"]))
								                    update_state(folder_path, work_item["filename"], state="processing", batch_id=batch_id)
								                except Exception as ex:
								                    # Best effort: report the failure and leave the item in "init" for a later retry.
								                    print(ex)
								                    update_state(folder_path, work_item["filename"], state="init")
								            else:
								                print("waiting for something to finish processing before uploading more")
								                # Update the time you checked so you can move on to the next item
								                update_state(folder_path, work_item["filename"])
								        elif work_item["state"] == "processing":
								            batch_data = client.batches.retrieve(work_item["batch_id"])

								            if batch_data.status == "completed":
								                batch_id, success = download_batch_result(work_item["batch_id"], output_folder)

								                if success:
								                    update_state(folder_path, work_item["filename"], state="completed")
								                else:
								                    update_state(folder_path, work_item["filename"], state="errored_out")

								                # Remote cleanup is best effort; a failed delete must not stop the loop.
								                try:
								                    client.files.delete(batch_data.input_file_id)
								                except Exception as ex:
								                    print(ex)
								                    print("Could not delete old input data")

								                try:
								                    client.files.delete(batch_data.output_file_id)
								                except Exception as ex:
								                    print(ex)
								                    print("Could not delete old output data")
								            elif batch_data.status in ["failed", "expired", "cancelled"]:
								                update_state(folder_path, work_item["filename"], state="errored_out")

								                try:
								                    client.files.delete(batch_data.input_file_id)
								                except Exception as ex:
								                    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
								                    print(ex)
								                    print("Could not delete old file data")
								            else:
								                # Update the time you checked so you can move on to the next item
								                update_state(folder_path, work_item["filename"])

								        last_loop_time = datetime.datetime.now()

								    print("All work has been completed")
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2025-01-29 15:30:39 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2024-10-04 15:27:43 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								if __name__ == "__main__":
								    # Set up argument parsing for folder input
								    parser = argparse.ArgumentParser(description="Upload .jsonl files and process batches in OpenAI API.")
								    parser.add_argument("--max_gb", type=int, default=25, help="Max number of GB of batch processing files to upload at one time")
								    parser.add_argument("--clear_all_files", action="store_true", help="Helper to delete ALL files stored in your openai account")
								    parser.add_argument("folder", type=str, help="Path to the folder containing .jsonl files")

								    args = parser.parse_args()

								    if args.clear_all_files:
								        # Maintenance mode: optionally wipe every remote file, then exit without processing.
								        all_files = list(client.files.list())
								        if input(f"Are you sure you want to delete {len(all_files)} files from your OpenAI account? [y/N]").lower() == "y":
								            for file in tqdm(all_files):
								                client.files.delete(file.id)
								        # quit() is injected by the site module for interactive use; raise SystemExit instead.
								        raise SystemExit(0)

								    # Process the folder and start batches
								    process_folder(args.folder, args.max_gb)