Better synth miner script

This commit is contained in:
Jake Poznanski 2025-03-27 21:56:29 +00:00
parent 42be0ccd0c
commit 731aa73c70

View File

@@ -9,7 +9,7 @@ import pypdf
from anthropic import Anthropic from anthropic import Anthropic
from tqdm import tqdm from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png, get_png_dimensions_from_base64
def download_s3_pdf(s3_path, local_path): def download_s3_pdf(s3_path, local_path):
@@ -21,6 +21,8 @@ def download_s3_pdf(s3_path, local_path):
def generate_html_from_image(client, image_base64): def generate_html_from_image(client, image_base64):
"""Call Claude API to generate HTML from an image.""" """Call Claude API to generate HTML from an image."""
png_width, png_height = get_png_dimensions_from_base64(image_base64)
try: try:
response = client.messages.create( response = client.messages.create(
model="claude-3-7-sonnet-20250219", model="claude-3-7-sonnet-20250219",
@@ -36,8 +38,11 @@ def generate_html_from_image(client, image_base64):
"text": "Render this document as clean, semantic HTML. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc. " "text": "Render this document as clean, semantic HTML. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc. "
"Use the <header> and <footer> tags to represent content at the top/bottom which would not normally be part of the main content, such as page numbers, etc. " "Use the <header> and <footer> tags to represent content at the top/bottom which would not normally be part of the main content, such as page numbers, etc. "
"Use a placeholder <div> tag with class 'image' which will render as a grey box with black outline to make sure images have their original size, shape, and position on the page. " "Use a placeholder <div> tag with class 'image' which will render as a grey box with black outline to make sure images have their original size, shape, and position on the page. "
"Preserve any multi-column layouts exactly as they appear. " "If the document has a multi-column layout, you MUST have the same number of columns in your version. "
"Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible. ", "Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible. "
f"The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall. "
"Before you start, output a basic analysis of the layout and a plan before enclosing your final html in a ```html block."
,
}, },
], ],
} }