Fixing non-async threading

Jake Poznanski 2025-09-19 17:12:22 +00:00
parent e607b53748
commit b52ac23073


@@ -5,6 +5,7 @@ import glob
 import importlib
 import os
 import tempfile
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pypdf import PdfReader
@@ -48,13 +49,13 @@ def parse_method_arg(method_arg):
 # Wrapper to run synchronous functions in the event loop
-async def run_sync_in_executor(func, *args, **kwargs):
-    """Run a synchronous function in the default executor"""
+async def run_sync_in_executor(func, executor, *args, **kwargs):
+    """Run a synchronous function in the provided executor"""
     loop = asyncio.get_running_loop()
-    return await loop.run_in_executor(None, partial(func, *args, **kwargs))
+    return await loop.run_in_executor(executor, partial(func, *args, **kwargs))


-async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async):
+async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async, executor=None):
     """Process a single PDF and save the result to output_path"""
     try:
         if is_async:
@@ -62,7 +63,7 @@ async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
             markdown = await method(pdf_path, page_num=page_num, **kwargs)
         else:
             # Run synchronous function in the executor
-            markdown = await run_sync_in_executor(method, pdf_path, page_num=page_num, **kwargs)
+            markdown = await run_sync_in_executor(method, executor, pdf_path, page_num=page_num, **kwargs)

         if markdown is None:
             print(f"Warning, did not get output for {os.path.basename(output_path)}")
@@ -89,109 +90,118 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, remove_te
     Process PDFs using asyncio for both sync and async methods,
     limiting the number of concurrent tasks to max_parallel.
     """
-    for candidate in config.keys():
-        print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
-        folder_name = config[candidate]["folder_name"]
-        candidate_output_dir = os.path.join(data_directory, folder_name)
-        os.makedirs(candidate_output_dir, exist_ok=True)
-
-        method = config[candidate]["method"]
-        kwargs = config[candidate]["kwargs"]
-        is_async = asyncio.iscoroutinefunction(method)
-
-        # Use recursive glob to support nested PDFs
-        all_pdfs = glob.glob(os.path.join(pdf_directory, "**/*.pdf"), recursive=True)
-        all_pdfs.sort()
-
-        # Prepare all tasks
-        tasks = []
-        task_descriptions = {}
-
-        for pdf_path in all_pdfs:
-            pdf = PdfReader(pdf_path)
-            num_pages = len(pdf.pages)
-            base_name = os.path.basename(pdf_path).replace(".pdf", "")
-
-            # Determine the PDF's relative folder path (e.g. "arxiv_data") relative to pdf_directory
-            relative_pdf_path = os.path.relpath(pdf_path, pdf_directory)
-            pdf_relative_dir = os.path.dirname(relative_pdf_path)
-
-            if remove_text:
-                print(f"Converting {pdf_path} into images to remove text-content...")
-
-                # Generate image files from each page
-                temp_image_files = []
-                try:
-                    for page_num in range(1, num_pages + 1):
-                        # Get base64 PNG data for the current page
-                        base64_png = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=2048)
-
-                        # Decode base64 and save to temporary file
-                        temp_img = tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False)
-                        temp_img.write(base64.b64decode(base64_png))
-                        temp_img.close()
-                        temp_image_files.append(temp_img.name)
-
-                    # Convert all images to a single PDF using our enhanced function
-                    pdf_bytes = convert_image_to_pdf_bytes(temp_image_files)
-
-                    # Write the PDF bytes to a temporary file
-                    temp_pdf = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False)
-                    temp_pdf.write(pdf_bytes)
-                    temp_pdf.close()
-
-                    # Update pdf_path to the new file
-                    pdf_path = temp_pdf.name
-                finally:
-                    # Clean up temporary image files
-                    for temp_file in temp_image_files:
-                        try:
-                            os.remove(temp_file)
-                        except Exception as e:
-                            print(f"Warning: Failed to remove temporary file {temp_file}: {e}")
-
-            for repeat in range(1, repeats + 1):
-                for page_num in range(1, num_pages + 1):
-                    output_filename = f"{base_name}_pg{page_num}_repeat{repeat}.md"
-
-                    # Preserve the relative folder structure in the output directory
-                    candidate_pdf_dir = os.path.join(candidate_output_dir, pdf_relative_dir)
-                    os.makedirs(candidate_pdf_dir, exist_ok=True)
-                    output_path = os.path.join(candidate_pdf_dir, output_filename)
-
-                    if os.path.exists(output_path) and not force:
-                        print(f"Skipping {base_name}_pg{page_num}_repeat{repeat} for {candidate}, file already exists")
-                        print("Rerun with --force flag to force regeneration")
-                        continue
-
-                    task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
-                    tasks.append(task)
-                    task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
-
-        # Process tasks with semaphore to limit concurrency
-        semaphore = asyncio.Semaphore(max_parallel or 1)  # Default to 1 if not specified
-
-        async def process_with_semaphore(task):
-            async with semaphore:
-                return await task
-
-        # Wrap each task with the semaphore
-        limited_tasks = [process_with_semaphore(task) for task in tasks]
-
-        # Process tasks with progress bar
-        if limited_tasks:
-            completed = 0
-            with tqdm(total=len(limited_tasks), desc=f"Processing {candidate}") as pbar:
-                for task in asyncio.as_completed(limited_tasks):
-                    try:
-                        result = await task
-                        if result:
-                            completed += 1
-                    except Exception as e:
-                        print(f"Task failed: {e}")
-                    finally:
-                        pbar.update(1)
-
-            print(f"Completed {completed} out of {len(limited_tasks)} tasks for {candidate}")
+    # Create a thread pool executor with limited threads when max_parallel is 1
+    # This prevents unwanted multithreading for synchronous methods
+    executor = ThreadPoolExecutor(max_workers=max_parallel or 1) if max_parallel else None
+
+    try:
+        for candidate in config.keys():
+            print(f"Starting conversion using {candidate} with kwargs: {config[candidate]['kwargs']}")
+            folder_name = config[candidate]["folder_name"]
+            candidate_output_dir = os.path.join(data_directory, folder_name)
+            os.makedirs(candidate_output_dir, exist_ok=True)
+
+            method = config[candidate]["method"]
+            kwargs = config[candidate]["kwargs"]
+            is_async = asyncio.iscoroutinefunction(method)
+
+            # Use recursive glob to support nested PDFs
+            all_pdfs = glob.glob(os.path.join(pdf_directory, "**/*.pdf"), recursive=True)
+            all_pdfs.sort()
+
+            # Prepare all tasks
+            tasks = []
+            task_descriptions = {}
+
+            for pdf_path in all_pdfs:
+                pdf = PdfReader(pdf_path)
+                num_pages = len(pdf.pages)
+                base_name = os.path.basename(pdf_path).replace(".pdf", "")
+
+                # Determine the PDF's relative folder path (e.g. "arxiv_data") relative to pdf_directory
+                relative_pdf_path = os.path.relpath(pdf_path, pdf_directory)
+                pdf_relative_dir = os.path.dirname(relative_pdf_path)
+
+                if remove_text:
+                    print(f"Converting {pdf_path} into images to remove text-content...")
+
+                    # Generate image files from each page
+                    temp_image_files = []
+                    try:
+                        for page_num in range(1, num_pages + 1):
+                            # Get base64 PNG data for the current page
+                            base64_png = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=2048)
+
+                            # Decode base64 and save to temporary file
+                            temp_img = tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False)
+                            temp_img.write(base64.b64decode(base64_png))
+                            temp_img.close()
+                            temp_image_files.append(temp_img.name)
+
+                        # Convert all images to a single PDF using our enhanced function
+                        pdf_bytes = convert_image_to_pdf_bytes(temp_image_files)
+
+                        # Write the PDF bytes to a temporary file
+                        temp_pdf = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False)
+                        temp_pdf.write(pdf_bytes)
+                        temp_pdf.close()
+
+                        # Update pdf_path to the new file
+                        pdf_path = temp_pdf.name
+                    finally:
+                        # Clean up temporary image files
+                        for temp_file in temp_image_files:
+                            try:
+                                os.remove(temp_file)
+                            except Exception as e:
+                                print(f"Warning: Failed to remove temporary file {temp_file}: {e}")
+
+                for repeat in range(1, repeats + 1):
+                    for page_num in range(1, num_pages + 1):
+                        output_filename = f"{base_name}_pg{page_num}_repeat{repeat}.md"
+
+                        # Preserve the relative folder structure in the output directory
+                        candidate_pdf_dir = os.path.join(candidate_output_dir, pdf_relative_dir)
+                        os.makedirs(candidate_pdf_dir, exist_ok=True)
+                        output_path = os.path.join(candidate_pdf_dir, output_filename)
+
+                        if os.path.exists(output_path) and not force:
+                            print(f"Skipping {base_name}_pg{page_num}_repeat{repeat} for {candidate}, file already exists")
+                            print("Rerun with --force flag to force regeneration")
+                            continue
+
+                        task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async, executor)
+                        tasks.append(task)
+                        task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
+
+            # Process tasks with semaphore to limit concurrency
+            semaphore = asyncio.Semaphore(max_parallel or 1)  # Default to 1 if not specified
+
+            async def process_with_semaphore(task):
+                async with semaphore:
+                    return await task
+
+            # Wrap each task with the semaphore
+            limited_tasks = [process_with_semaphore(task) for task in tasks]
+
+            # Process tasks with progress bar
+            if limited_tasks:
+                completed = 0
+                with tqdm(total=len(limited_tasks), desc=f"Processing {candidate}") as pbar:
+                    for task in asyncio.as_completed(limited_tasks):
+                        try:
+                            result = await task
+                            if result:
+                                completed += 1
+                        except Exception as e:
+                            print(f"Task failed: {e}")
+                        finally:
+                            pbar.update(1)
+
+                print(f"Completed {completed} out of {len(limited_tasks)} tasks for {candidate}")
+    finally:
+        # Clean up the executor
+        if executor:
+            executor.shutdown(wait=False)

 if __name__ == "__main__":
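
The concurrency contract after this change can be summarized as: one bounded `ThreadPoolExecutor` shared by all tasks, an `asyncio.Semaphore` capping in-flight coroutines at the same `max_parallel`, and a `finally` block that tears the pool down. Below is a runnable sketch of that lifecycle under those assumptions; `blocking_work` and `process` are hypothetical stand-ins for the benchmark's per-page work.

```python
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

def blocking_work(i):
    # Hypothetical synchronous page-conversion stand-in
    time.sleep(0.1)
    return i * i

async def process(i, executor):
    # Offload the blocking call to the shared, bounded pool
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, blocking_work, i)

async def main(max_parallel=2):
    # Mirrors the patch: pool size and semaphore share one bound, so the sync
    # method can never occupy more than max_parallel threads at once
    executor = ThreadPoolExecutor(max_workers=max_parallel or 1) if max_parallel else None
    try:
        semaphore = asyncio.Semaphore(max_parallel or 1)

        async def with_semaphore(coro):
            async with semaphore:
                return await coro

        tasks = [with_semaphore(process(i, executor)) for i in range(8)]
        for fut in asyncio.as_completed(tasks):
            print(await fut)
    finally:
        # Same cleanup as the commit's finally block; wait=False returns
        # immediately rather than blocking on in-flight work
        if executor:
            executor.shutdown(wait=False)

asyncio.run(main())
```

Sizing the pool to `max_parallel` is what the commit title refers to: with the default executor, a "non-async" method could spread across many more threads than the semaphore suggests, since the semaphore only limits coroutines, not the threads a single awaited call may use.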