Trying to get reliability up

This commit is contained in:
Jake Poznanski 2024-11-11 13:54:04 -08:00
parent fedda40466
commit 24a9d23b00
2 changed files with 5 additions and 2 deletions

View File

@ -63,7 +63,7 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
# Allow the page rendering to process in the background while we get the anchor text (which blocks the main thread)
image_base64 = asyncio.to_thread(render_pdf_to_base64png, local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)
anchor_text = asyncio.to_thread(get_anchor_text, local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)
image_base64 = await image_base64
if image_rotation != 0:
@ -78,6 +78,8 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
# Encode the rotated image back to base64
image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
anchor_text = await anchor_text
return {
"model": "Qwen/Qwen2-VL-7B-Instruct",
"messages": [
@ -246,7 +248,7 @@ async def process_pdf(args, pdf_s3_path: str):
# List to hold the tasks for processing each page
page_tasks = []
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=3600), connector=TCPConnector(limit=100)) as session:
for page_num in range(1, num_pages + 1):
# Create a task for each page
task = asyncio.create_task(process_page(args, session, pdf_s3_path, tf.name, page_num))

View File

@ -109,6 +109,7 @@ def main(jsonl_path, output_dir, template_path):
future.result()
except Exception as e:
print(f"An error occurred: {e}")
raise
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Generate HTML pages from a JSONL file with pre-signed S3 links.')