mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-03 13:52:30 +00:00
Small fixes
This commit is contained in:
parent
a9a94f2950
commit
fedda40466
@ -274,11 +274,7 @@ async def process_pdf(args, pdf_s3_path: str):
|
|||||||
start_pos = current_char_pos
|
start_pos = current_char_pos
|
||||||
document_text += content
|
document_text += content
|
||||||
current_char_pos = len(document_text)
|
current_char_pos = len(document_text)
|
||||||
pdf_page_spans.append({
|
pdf_page_spans.append([start_pos, current_char_pos, page_result.page_num])
|
||||||
'pdf_page_number': page_result.page_num,
|
|
||||||
'start_char': start_pos,
|
|
||||||
'end_char': current_char_pos
|
|
||||||
})
|
|
||||||
|
|
||||||
if not document_text:
|
if not document_text:
|
||||||
return None # Return None if the document text is empty
|
return None # Return None if the document text is empty
|
||||||
@ -310,7 +306,6 @@ async def process_pdf(args, pdf_s3_path: str):
|
|||||||
|
|
||||||
async def worker(args, queue):
|
async def worker(args, queue):
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
[work_hash, pdfs] = await queue.get()
|
[work_hash, pdfs] = await queue.get()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -344,7 +339,7 @@ async def worker(args, queue):
|
|||||||
logger.info(f"Tokens per second (since process start): input {total_input_tokens / total_time:.1f}, output {total_output_tokens / total_time:.1f}, total {(total_input_tokens + total_output_tokens) / total_time:.1f}")
|
logger.info(f"Tokens per second (since process start): input {total_input_tokens / total_time:.1f}, output {total_output_tokens / total_time:.1f}, total {(total_input_tokens + total_output_tokens) / total_time:.1f}")
|
||||||
|
|
||||||
# Update last batch time
|
# Update last batch time
|
||||||
last_batch_time = current_time
|
last_batch_time = time.perf_counter()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception(f"Exception occurred while processing work_hash {work_hash}: {e}")
|
logger.exception(f"Exception occurred while processing work_hash {work_hash}: {e}")
|
||||||
finally:
|
finally:
|
||||||
@ -416,7 +411,7 @@ async def main():
|
|||||||
parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None)
|
parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None)
|
||||||
parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
|
parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
|
||||||
parser.add_argument('--group_size', type=int, default=20, help='Number of pdfs that will be part of each work item in the work queue.')
|
parser.add_argument('--group_size', type=int, default=20, help='Number of pdfs that will be part of each work item in the work queue.')
|
||||||
parser.add_argument('--workers', type=int, default=1, help='Number of workers to run at a time')
|
parser.add_argument('--workers', type=int, default=2, help='Number of workers to run at a time')
|
||||||
|
|
||||||
parser.add_argument('--model', help='List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the one which is fastest to access',
|
parser.add_argument('--model', help='List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the one which is fastest to access',
|
||||||
default=["weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/",
|
default=["weka://oe-data-default/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/best_bf16/",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user