mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-10 23:50:43 +00:00
Fixing non async threading
This commit is contained in:
parent
e607b53748
commit
b52ac23073
@ -5,6 +5,7 @@ import glob
|
|||||||
import importlib
|
import importlib
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
@ -48,13 +49,13 @@ def parse_method_arg(method_arg):
|
|||||||
|
|
||||||
|
|
||||||
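# Wrapper to run synchronous functions in an executor so they can be awaited without blocking the event loop
|
# Wrapper to run synchronous functions in an executor so they can be awaited without blocking the event loop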
|
||||||
async def run_sync_in_executor(func, *args, **kwargs):
|
async def run_sync_in_executor(func, executor, *args, **kwargs):
|
||||||
"""Run a synchronous function in the default executor"""
|
"""Run a synchronous function in the provided executor"""
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
return await loop.run_in_executor(None, partial(func, *args, **kwargs))
|
return await loop.run_in_executor(executor, partial(func, *args, **kwargs))
|
||||||
|
|
||||||
|
|
||||||
async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async):
|
async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async, executor=None):
|
||||||
"""Process a single PDF and save the result to output_path"""
|
"""Process a single PDF and save the result to output_path"""
|
||||||
try:
|
try:
|
||||||
if is_async:
|
if is_async:
|
||||||
@ -62,7 +63,7 @@ async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
|
|||||||
markdown = await method(pdf_path, page_num=page_num, **kwargs)
|
markdown = await method(pdf_path, page_num=page_num, **kwargs)
|
||||||
else:
|
else:
|
||||||
# Run synchronous function in the executor
|
# Run synchronous function in the executor
|
||||||
markdown = await run_sync_in_executor(method, pdf_path, page_num=page_num, **kwargs)
|
markdown = await run_sync_in_executor(method, executor, pdf_path, page_num=page_num, **kwargs)
|
||||||
|
|
||||||
if markdown is None:
|
if markdown is None:
|
||||||
print(f"Warning, did not get output for {os.path.basename(output_path)}")
|
print(f"Warning, did not get output for {os.path.basename(output_path)}")
|
||||||
@ -89,6 +90,11 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te
|
|||||||
Process PDFs using asyncio for both sync and async methods,
|
Process PDFs using asyncio for both sync and async methods,
|
||||||
limiting the number of concurrent tasks to max_parallel.
|
limiting the number of concurrent tasks to max_parallel.
|
||||||
"""
|
"""
|
||||||
|
# Create a thread pool executor capped at max_parallel worker threads (only when max_parallel is set; otherwise None is passed and the loop's default executor is used)
|
||||||
|
# This prevents unwanted multithreading for synchronous methods
|
||||||
|
executor = ThreadPoolExecutor(max_workers=max_parallel or 1) if max_parallel else None
|
||||||
|
|
||||||
|
try:
|
||||||
for candidate in config.keys():
|
for candidate in config.keys():
|
||||||
print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
|
print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
|
||||||
folder_name = config[candidate]["folder_name"]
|
folder_name = config[candidate]["folder_name"]
|
||||||
@ -163,7 +169,7 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te
|
|||||||
print("Rerun with --force flag to force regeneration")
|
print("Rerun with --force flag to force regeneration")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
|
task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async, executor)
|
||||||
tasks.append(task)
|
tasks.append(task)
|
||||||
task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
|
task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
|
||||||
|
|
||||||
@ -192,6 +198,10 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te
|
|||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
|
|
||||||
print(f"Completed {completed} out of {len(limited_tasks)} tasks for {candidate}")
|
print(f"Completed {completed} out of {len(limited_tasks)} tasks for {candidate}")
|
||||||
|
finally:
|
||||||
|
# Clean up the executor
|
||||||
|
if executor:
|
||||||
|
executor.shutdown(wait=False)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user