From 731aa73c70e6276ec48adc2fd380174b29b5119b Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 27 Mar 2025 21:56:29 +0000 Subject: [PATCH] Better synth miner script --- olmocr/bench/synth/mine_html_templates.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index 9d6cd07..73af963 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -9,7 +9,7 @@ import pypdf from anthropic import Anthropic from tqdm import tqdm -from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.data.renderpdf import render_pdf_to_base64png, get_png_dimensions_from_base64 def download_s3_pdf(s3_path, local_path): @@ -21,6 +21,8 @@ def download_s3_pdf(s3_path, local_path): def generate_html_from_image(client, image_base64): """Call Claude API to generate HTML from an image.""" + png_width, png_height = get_png_dimensions_from_base64(image_base64) + try: response = client.messages.create( model="claude-3-7-sonnet-20250219", @@ -36,8 +38,11 @@ def generate_html_from_image(client, image_base64): "text": "Render this document as clean, semantic HTML. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc. " "Use the
and