Small fixes

This commit is contained in:
Jake Poznanski 2024-11-11 13:31:14 -08:00
parent a9a94f2950
commit fedda40466

View File

@@ -274,11 +274,7 @@ async def process_pdf(args, pdf_s3_path: str):
start_pos = current_char_pos start_pos = current_char_pos
document_text += content document_text += content
current_char_pos = len(document_text) current_char_pos = len(document_text)
pdf_page_spans.append({ pdf_page_spans.append([start_pos, current_char_pos, page_result.page_num])
'pdf_page_number': page_result.page_num,
'start_char': start_pos,
'end_char': current_char_pos
})
if not document_text: if not document_text:
return None # Return None if the document text is empty return None # Return None if the document text is empty
@@ -310,7 +306,6 @@ async def process_pdf(args, pdf_s3_path: str):
async def worker(args, queue): async def worker(args, queue):
while True: while True:
[work_hash, pdfs] = await queue.get() [work_hash, pdfs] = await queue.get()
try: try:
@@ -344,7 +339,7 @@ async def worker(args, queue):
logger.info(f"Tokens per second (since process start): input {total_input_tokens / total_time:.1f}, output {total_output_tokens / total_time:.1f}, total {(total_input_tokens + total_output_tokens) / total_time:.1f}") logger.info(f"Tokens per second (since process start): input {total_input_tokens / total_time:.1f}, output {total_output_tokens / total_time:.1f}, total {(total_input_tokens + total_output_tokens) / total_time:.1f}")
# Update last batch time # Update last batch time
last_batch_time = current_time last_batch_time = time.perf_counter()
except Exception as e: except Exception as e:
logger.exception(f"Exception occurred while processing work_hash {work_hash}: {e}") logger.exception(f"Exception occurred while processing work_hash {work_hash}: {e}")
finally: finally:
@@ -416,7 +411,7 @@ async def main():
parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None) parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None)
parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None) parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
parser.add_argument('--group_size', type=int, default=20, help='Number of pdfs that will be part of each work item in the work queue.') parser.add_argument('--group_size', type=int, default=20, help='Number of pdfs that will be part of each work item in the work queue.')
parser.add_argument('--workers', type=int, default=1, help='Number of workers to run at a time') parser.add_argument('--workers', type=int, default=2, help='Number of workers to run at a time')
parser.add_argument('--model', help='List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the one which is fastest to access', parser.add_argument('--model', help='List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the one which is fastest to access',
default=["weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/", default=["weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/",