Rendering pdfs with playwright and chromium

2025-11-03 11:35:29 +00:00 · 2025-03-27 22:18:23 +00:00 · 2025-03-27 22:18:23 +00:00 · cd5a93d8d5
commit cd5a93d8d5
parent 9749e9559d
1 changed files with 67 additions and 2 deletions
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@ -1,4 +1,5 @@
 import argparse
+import asyncio
 import concurrent.futures
 import os
 import random
@ -7,6 +8,7 @@ from concurrent.futures import ThreadPoolExecutor

 import pypdf
 from anthropic import Anthropic
+from playwright.async_api import async_playwright
 from tqdm import tqdm

 from olmocr.data.renderpdf import render_pdf_to_base64png, get_png_dimensions_from_base64
@ -114,6 +116,39 @@ def extract_page_from_pdf(input_path, output_path, page_num):
        return False


+async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, png_height):
+    """
+    Render HTML content using Playwright and save it as PDF.
+    
+    Args:
+        html_content: HTML content to render
+        output_html_path: Path to save the HTML content
+        output_pdf_path: Path to save the rendered PDF
+        png_width: Width of the viewport
+        png_height: Height of the viewport
+        
+    Returns:
+        bool: True if rendering was successful, False otherwise
+    """
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch()
+            page = await browser.new_page(viewport={"width": png_width // 2, "height": png_height // 2})
+            
+            # Set the HTML content
+            await page.set_content(html_content)
+            
+            # Save as PDF
+            await page.pdf(path=output_pdf_path)
+            
+            await browser.close()
+            
+            return True
+    except Exception as e:
+        print(f"Error rendering PDF with Playwright: {str(e)}")
+        return False
+
+
 def process_pdf(pdf_info, args, client):
    """Process a single PDF, render a random page, and create an HTML template."""
    s3_path, index = pdf_info
@ -163,8 +198,32 @@ def process_pdf(pdf_info, args, client):
        pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.pdf")
        if not extract_page_from_pdf(local_pdf_path, pdf_path, page_num):
            print(f"Failed to extract page {page_num} from {local_pdf_path}")
+            
+        # Render PDF using Playwright if not skipped
+        playwright_pdf_path = None
+        
+        if not args.skip_playwright:
+            playwright_pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}_playwright.pdf")
+            
+            try:
+                # Get PNG dimensions
+                png_width, png_height = get_png_dimensions_from_base64(image_base64)
+                
+                # Run the async function in the synchronous context
+                asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
+                print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
+            except Exception as e:
+                print(f"Failed to render with Playwright: {e}")
+                playwright_pdf_path = None

-        return {"pdf_id": pdf_id, "s3_path": s3_path, "page_number": page_num, "html_path": html_path, "pdf_path": pdf_path}
+        return {
+            "pdf_id": pdf_id, 
+            "s3_path": s3_path, 
+            "page_number": page_num, 
+            "html_path": html_path, 
+            "pdf_path": pdf_path,
+            "playwright_pdf_path": playwright_pdf_path
+        }
    except Exception as e:
        print(f"Error processing {s3_path}: {e}")
        return None
@ -175,13 +234,14 @@ def process_pdf(pdf_info, args, client):


 def main():
-    parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates")
+    parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates and render with Playwright")
    parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
    parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    parser.add_argument("--parallel", type=int, default=1, help="Number of parallel threads to use")
    parser.add_argument("--api_key", help="Claude API key (or set ANTHROPIC_API_KEY environment variable)")
+    parser.add_argument("--skip_playwright", action="store_true", help="Skip Playwright PDF rendering")
    args = parser.parse_args()

    # Ensure output and temp directories exist
@ -236,6 +296,11 @@ def main():
                print(f"Error processing {s3_path}: {e}")

    print(f"Generated {len(results)} HTML templates")
+    
+    # Print summary of Playwright rendering results
+    playwright_success = sum(1 for r in results if r and r.get("playwright_pdf_path"))
+    if not args.skip_playwright:
+        print(f"Playwright PDF rendering: {playwright_success}/{len(results)} successful")


 if __name__ == "__main__":