mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-20 14:52:12 +00:00
Trying to get reliability up
This commit is contained in:
parent
fedda40466
commit
24a9d23b00
@ -63,7 +63,7 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
|
|||||||
|
|
||||||
# Allow the page rendering to process in the background while we get the anchor text (which blocks the main thread)
|
# Allow the page rendering to process in the background while we get the anchor text (which blocks the main thread)
|
||||||
image_base64 = asyncio.to_thread(render_pdf_to_base64png, local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
|
image_base64 = asyncio.to_thread(render_pdf_to_base64png, local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
|
||||||
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)
|
anchor_text = asyncio.to_thread(get_anchor_text, local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)
|
||||||
|
|
||||||
image_base64 = await image_base64
|
image_base64 = await image_base64
|
||||||
if image_rotation != 0:
|
if image_rotation != 0:
|
||||||
@ -78,6 +78,8 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
|
|||||||
# Encode the rotated image back to base64
|
# Encode the rotated image back to base64
|
||||||
image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
|
anchor_text = await anchor_text
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
"model": "Qwen/Qwen2-VL-7B-Instruct",
|
||||||
"messages": [
|
"messages": [
|
||||||
@ -246,7 +248,7 @@ async def process_pdf(args, pdf_s3_path: str):
|
|||||||
# List to hold the tasks for processing each page
|
# List to hold the tasks for processing each page
|
||||||
page_tasks = []
|
page_tasks = []
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=3600), connector=TCPConnector(limit=100)) as session:
|
||||||
for page_num in range(1, num_pages + 1):
|
for page_num in range(1, num_pages + 1):
|
||||||
# Create a task for each page
|
# Create a task for each page
|
||||||
task = asyncio.create_task(process_page(args, session, pdf_s3_path, tf.name, page_num))
|
task = asyncio.create_task(process_page(args, session, pdf_s3_path, tf.name, page_num))
|
||||||
|
@ -109,6 +109,7 @@ def main(jsonl_path, output_dir, template_path):
|
|||||||
future.result()
|
future.result()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"An error occurred: {e}")
|
print(f"An error occurred: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description='Generate HTML pages from a JSONL file with pre-signed S3 links.')
|
parser = argparse.ArgumentParser(description='Generate HTML pages from a JSONL file with pre-signed S3 links.')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user