Fixing non-async threading

This commit is contained in:
Jake Poznanski 2025-09-19 17:12:22 +00:00
parent e607b53748
commit b52ac23073

View File

@ -5,6 +5,7 @@ import glob
import importlib import importlib
import os import os
import tempfile import tempfile
from concurrent.futures import ThreadPoolExecutor
from functools import partial from functools import partial
from pypdf import PdfReader from pypdf import PdfReader
@ -48,13 +49,13 @@ def parse_method_arg(method_arg):
# Wrapper to run synchronous functions in the event loop # Wrapper to run synchronous functions in the event loop
# Wrapper to run synchronous functions in the event loop
async def run_sync_in_executor(func, executor, *args, **kwargs):
    """Run a synchronous function in the provided executor.

    Args:
        func: The synchronous callable to execute.
        executor: A concurrent.futures.Executor to run *func* in, or None
            to fall back to the event loop's default executor.
        *args: Positional arguments forwarded to *func*.
        **kwargs: Keyword arguments forwarded to *func*.

    Returns:
        Whatever *func* returns.
    """
    loop = asyncio.get_running_loop()
    # partial() is needed because run_in_executor only forwards positional
    # arguments; it lets us carry kwargs through as well.
    return await loop.run_in_executor(executor, partial(func, *args, **kwargs))
async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async): async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async, executor=None):
"""Process a single PDF and save the result to output_path""" """Process a single PDF and save the result to output_path"""
try: try:
if is_async: if is_async:
@ -62,7 +63,7 @@ async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
markdown = await method(pdf_path, page_num=page_num, **kwargs) markdown = await method(pdf_path, page_num=page_num, **kwargs)
else: else:
# Run synchronous function in the executor # Run synchronous function in the executor
markdown = await run_sync_in_executor(method, pdf_path, page_num=page_num, **kwargs) markdown = await run_sync_in_executor(method, executor, pdf_path, page_num=page_num, **kwargs)
if markdown is None: if markdown is None:
print(f"Warning, did not get output for {os.path.basename(output_path)}") print(f"Warning, did not get output for {os.path.basename(output_path)}")
@ -89,6 +90,11 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te
Process PDFs using asyncio for both sync and async methods, Process PDFs using asyncio for both sync and async methods,
limiting the number of concurrent tasks to max_parallel. limiting the number of concurrent tasks to max_parallel.
""" """
# Create a thread pool executor with limited threads when max_parallel is 1
# This prevents unwanted multithreading for synchronous methods
executor = ThreadPoolExecutor(max_workers=max_parallel or 1) if max_parallel else None
try:
for candidate in config.keys(): for candidate in config.keys():
print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}") print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
folder_name = config[candidate]["folder_name"] folder_name = config[candidate]["folder_name"]
@ -163,7 +169,7 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te
print("Rerun with --force flag to force regeneration") print("Rerun with --force flag to force regeneration")
continue continue
task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async) task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async, executor)
tasks.append(task) tasks.append(task)
task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})" task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
@ -192,6 +198,10 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te
pbar.update(1) pbar.update(1)
print(f"Completed {completed} out of {len(limited_tasks)} tasks for {candidate}") print(f"Completed {completed} out of {len(limited_tasks)} tasks for {candidate}")
finally:
# Clean up the executor
if executor:
executor.shutdown(wait=False)
if __name__ == "__main__": if __name__ == "__main__":