diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index 73af963..993125b 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -1,4 +1,5 @@
import argparse
+import asyncio
import concurrent.futures
import os
import random
@@ -7,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor
import pypdf
from anthropic import Anthropic
+from playwright.async_api import async_playwright
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png, get_png_dimensions_from_base64
@@ -114,6 +116,39 @@ def extract_page_from_pdf(input_path, output_path, page_num):
return False
+async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, png_height):
+    """
+    Render HTML content using Playwright and save it as PDF.
+
+    Args:
+        html_content: HTML content to render
+        output_pdf_path: Path to save the rendered PDF
+        png_width: Width of the reference PNG; the viewport is half this wide
+        png_height: Height of the reference PNG; the viewport is half this tall
+
+    Returns:
+        bool: True if rendering was successful, False otherwise. Errors are
+        printed and reported through the return value, never raised.
+    """
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch()
+            try:
+                page = await browser.new_page(viewport={"width": png_width // 2, "height": png_height // 2})
+
+                # Set the HTML content
+                await page.set_content(html_content)
+
+                # Save as PDF
+                await page.pdf(path=output_pdf_path)
+            finally:
+                # Close the browser even when rendering raises, so a failed
+                # page does not skip the explicit shutdown.
+                await browser.close()
+
+        return True
+    except Exception as e:
+        print(f"Error rendering PDF with Playwright: {str(e)}")
+        return False
+
+
def process_pdf(pdf_info, args, client):
"""Process a single PDF, render a random page, and create an HTML template."""
s3_path, index = pdf_info
@@ -163,8 +198,32 @@ def process_pdf(pdf_info, args, client):
pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.pdf")
if not extract_page_from_pdf(local_pdf_path, pdf_path, page_num):
print(f"Failed to extract page {page_num} from {local_pdf_path}")
+
+        # Render PDF using Playwright if not skipped. The path stays None unless
+        # rendering actually succeeded, so the returned record (and any summary
+        # built from it) only reports PDFs that were really produced.
+        playwright_pdf_path = None
+
+        if not args.skip_playwright:
+            candidate_pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}_playwright.pdf")
+
+            try:
+                # Get PNG dimensions
+                png_width, png_height = get_png_dimensions_from_base64(image_base64)
+
+                # Run the async function in the synchronous context.
+                # render_pdf_with_playwright reports failure by returning False
+                # instead of raising, so its return value must be checked.
+                rendered = asyncio.run(render_pdf_with_playwright(html_content, candidate_pdf_path, png_width, png_height))
+                if rendered:
+                    playwright_pdf_path = candidate_pdf_path
+                    print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
+                else:
+                    print(f"Failed to render with Playwright: {candidate_pdf_path}")
+            except Exception as e:
+                print(f"Failed to render with Playwright: {e}")
+                playwright_pdf_path = None
- return {"pdf_id": pdf_id, "s3_path": s3_path, "page_number": page_num, "html_path": html_path, "pdf_path": pdf_path}
+ return {
+ "pdf_id": pdf_id,
+ "s3_path": s3_path,
+ "page_number": page_num,
+ "html_path": html_path,
+ "pdf_path": pdf_path,
+ "playwright_pdf_path": playwright_pdf_path
+ }
except Exception as e:
print(f"Error processing {s3_path}: {e}")
return None
@@ -175,13 +234,14 @@ def process_pdf(pdf_info, args, client):
def main():
- parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates")
+ parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates and render with Playwright")
parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
parser.add_argument("--parallel", type=int, default=1, help="Number of parallel threads to use")
parser.add_argument("--api_key", help="Claude API key (or set ANTHROPIC_API_KEY environment variable)")
+ parser.add_argument("--skip_playwright", action="store_true", help="Skip Playwright PDF rendering")
args = parser.parse_args()
# Ensure output and temp directories exist
@@ -236,6 +296,11 @@ def main():
print(f"Error processing {s3_path}: {e}")
print(f"Generated {len(results)} HTML templates")
+
+ # Print summary of Playwright rendering results
+ playwright_success = sum(1 for r in results if r and r.get("playwright_pdf_path"))
+ if not args.skip_playwright:
+ print(f"Playwright PDF rendering: {playwright_success}/{len(results)} successful")
if __name__ == "__main__":