diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index 9d6cd07..73af963 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -9,7 +9,7 @@ import pypdf from anthropic import Anthropic from tqdm import tqdm -from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.data.renderpdf import render_pdf_to_base64png, get_png_dimensions_from_base64 def download_s3_pdf(s3_path, local_path): @@ -21,6 +21,8 @@ def download_s3_pdf(s3_path, local_path): def generate_html_from_image(client, image_base64): """Call Claude API to generate HTML from an image.""" + png_width, png_height = get_png_dimensions_from_base64(image_base64) + try: response = client.messages.create( model="claude-3-7-sonnet-20250219", @@ -36,8 +38,11 @@ def generate_html_from_image(client, image_base64): "text": "Render this document as clean, semantic HTML. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc. " "Use the
and