mirror of https://github.com/allenai/olmocr.git (synced 2025-10-13 09:12:18 +00:00)
Some early code for mining html templates of pages, pick mediod code

parent 58276b04cb
commit 1f77aab75a
167 olmocr/bench/miners/pick_mediod.py Executable file
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
pick_mediod.py - Identify representative examples from repeated OCR outputs

This script takes two directory arguments, --input and --output. Each is a
directory in the format generated by convert.py, i.e. a candidate to be
evaluated as part of benchmark.py.

It finds all of the .md files under the input directory and groups them into
their repeats, e.g. input_dir/tables/buildingnotes_pg1_repeat1.md,
input_dir/tables/buildingnotes_pg1_repeat2.md, etc.

Then, within each group, it uses a string similarity metric to calculate the
edit distance from each repeat to every other repeat. The repeat with the
lowest mean edit distance (the medoid of the group) is written out as
..._repeat1.md in the output folder.
"""

import argparse
import glob
import os
import re
import shutil
from typing import Dict, List

from rapidfuzz import distance as fuzz_distance
from tqdm import tqdm


def compute_distance(text1: str, text2: str) -> float:
    """
    Compute the edit distance between two text strings using rapidfuzz.
    Returns a normalized distance between 0.0 (identical) and 1.0 (completely different).
    """
    # Use Levenshtein distance for string comparison
    return fuzz_distance.Levenshtein.normalized_distance(text1, text2)


def find_mediod(texts: List[str]) -> int:
    """
    Find the index of the mediod from a list of texts.
    The mediod is the text with the minimum average distance to all other texts.
    """
    if not texts:
        return -1

    if len(texts) == 1:
        return 0

    # Calculate pairwise distances between all texts
    n = len(texts)
    distances = [[0.0 for _ in range(n)] for _ in range(n)]

    for i in range(n):
        for j in range(i + 1, n):
            dist = compute_distance(texts[i], texts[j])
            distances[i][j] = dist
            distances[j][i] = dist

    # Calculate the average distance of each text to all others
    avg_distances = []
    for i in range(n):
        avg_dist = sum(distances[i]) / (n - 1)  # Don't include distance to self
        avg_distances.append(avg_dist)

    # Return the index of the text with the minimum average distance
    min_avg_dist = min(avg_distances)
    return avg_distances.index(min_avg_dist)


def group_repeats(md_files: List[str]) -> Dict[str, List[str]]:
    """
    Group MD files by their base name (without the repeat number).
    Returns a dictionary mapping base names to lists of file paths.
    """
    grouped = {}

    for md_path in md_files:
        base_name = re.sub(r"_repeat\d+\.md$", "", os.path.basename(md_path))
        if base_name not in grouped:
            grouped[base_name] = []
        grouped[base_name].append(md_path)

    return grouped


def main():
    parser = argparse.ArgumentParser(description="Find mediod (most representative) examples from repeated OCR outputs.")
    parser.add_argument(
        "--input", type=str, required=True, help="Path to the directory containing repeated OCR outputs (e.g., *_repeat1.md, *_repeat2.md, etc.)"
    )
    parser.add_argument("--output", type=str, required=True, help="Path to the directory where mediod examples will be copied")
    parser.add_argument("--min_repeats", type=int, default=3, help="Minimum number of repeats required to compute a mediod (default: 3)")
    args = parser.parse_args()

    input_dir = args.input
    output_dir = args.output
    min_repeats = args.min_repeats

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Find all markdown files in the input directory (recursive)
    md_files = glob.glob(os.path.join(input_dir, "**/*.md"), recursive=True)

    if not md_files:
        print(f"No markdown files found in {input_dir}")
        return

    # Group files by their base name
    grouped_files = group_repeats(md_files)

    # Process each group
    successful = 0
    skipped = 0

    print(f"Found {len(grouped_files)} unique test cases with repeats")

    for base_name, file_paths in tqdm(grouped_files.items(), desc="Processing test cases"):
        # Skip if there aren't enough repeats
        if len(file_paths) < min_repeats:
            print(f"Skipping {base_name}: only {len(file_paths)} repeats (minimum {min_repeats} required)")
            skipped += 1
            continue

        # Read all text content, tracking which paths were read successfully so
        # that the mediod index maps back to the correct file
        texts = []
        valid_paths = []
        for path in file_paths:
            try:
                with open(path, "r", encoding="utf-8") as f:
                    texts.append(f.read())
                valid_paths.append(path)
            except Exception as e:
                print(f"Error reading {path}: {e}")
                continue

        # Find the mediod
        mediod_idx = find_mediod(texts)
        if mediod_idx == -1:
            print(f"Failed to find mediod for {base_name}")
            skipped += 1
            continue

        # Get the path of the mediod file
        mediod_path = valid_paths[mediod_idx]

        # Create the output path, preserving the directory structure relative to input_dir
        if os.path.isabs(mediod_path) and os.path.isabs(input_dir):
            rel_path = os.path.relpath(mediod_path, input_dir)
            # Change the repeat number to 1 in the output filename
            output_filename = re.sub(r"_repeat\d+\.md$", "_repeat1.md", os.path.basename(rel_path))
            output_subdir = os.path.dirname(rel_path)
            output_path = os.path.join(output_dir, output_subdir, output_filename)

            # Create directories if needed
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
        else:
            # Just output to the root of output_dir with renamed file
            output_filename = re.sub(r"_repeat\d+\.md$", "_repeat1.md", os.path.basename(mediod_path))
            output_path = os.path.join(output_dir, output_filename)

        # Copy the mediod file
        try:
            shutil.copy2(mediod_path, output_path)
            successful += 1
        except Exception as e:
            print(f"Error copying {mediod_path} to {output_path}: {e}")

    print(f"Processing complete: {successful} mediods copied, {skipped} cases skipped")


if __name__ == "__main__":
    main()
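As a quick illustration of what find_mediod computes, here is a minimal standalone sketch. It assumes only that rapidfuzz is installed; the sample strings are invented for the example:

from rapidfuzz import distance as fuzz_distance

def find_mediod(texts):
    # Index of the text with the lowest mean normalized Levenshtein distance to the rest
    n = len(texts)
    if n == 0:
        return -1
    if n == 1:
        return 0
    avg = [
        sum(fuzz_distance.Levenshtein.normalized_distance(texts[i], texts[j]) for j in range(n) if j != i) / (n - 1)
        for i in range(n)
    ]
    return avg.index(min(avg))

repeats = ["Page one text", "Page one text!", "Page 0ne texxt"]
print(find_mediod(repeats))  # 0: the repeat closest on average to the others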
237 olmocr/bench/synth/mine_html_templates.py Normal file
@@ -0,0 +1,237 @@
import argparse
import concurrent.futures
import os
import random
import subprocess
from concurrent.futures import ThreadPoolExecutor

import pypdf
from anthropic import Anthropic
from tqdm import tqdm

from olmocr.data.renderpdf import render_pdf_to_base64png


def download_s3_pdf(s3_path, local_path):
    """Download a PDF from S3 to a local path."""
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    result = subprocess.run(["aws", "s3", "cp", s3_path, local_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return result.returncode == 0


def generate_html_from_image(client, image_base64):
    """Call Claude API to generate HTML from an image."""
    try:
        response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {
                            "type": "text",
                            "text": "Render this document as clean, semantic HTML. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc. "
                            "Use the <header> and <footer> tags to represent content at the top/bottom which would not normally be part of the main content, such as page numbers, etc. "
                            "Use a placeholder <div> tag with class 'image' which will render as a grey box with black outline to make sure images have their original size, shape, and position on the page. "
                            "Preserve any multi-column layouts exactly as they appear. "
                            "Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible. ",
                        },
                    ],
                }
            ],
        )

        # Extract HTML from response
        html_content = ""
        for content in response.content:
            if content.type == "text":
                html_content += content.text

        # Extract code blocks from response if HTML is wrapped in them
        if "```html" in html_content:
            start = html_content.find("```html") + 7
            end = html_content.rfind("```")
            if end > start:
                html_content = html_content[start:end].strip()
        elif "```" in html_content:
            start = html_content.find("```") + 3
            end = html_content.rfind("```")
            if end > start:
                html_content = html_content[start:end].strip()

        return html_content
    except Exception as e:
        print(f"Error calling Claude API: {e}")
        return None


def extract_page_from_pdf(input_path, output_path, page_num):
    """
    Extract a specific page from a PDF and save it as a new PDF.

    Args:
        input_path: Path to the input PDF
        output_path: Path to save the extracted page
        page_num: The page number to extract (1-indexed, converted to 0-indexed for pypdf)

    Returns:
        bool: True if extraction was successful, False otherwise
    """
    try:
        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Read the input PDF
        reader = pypdf.PdfReader(input_path)

        # Convert to 0-indexed for pypdf
        zero_idx_page = page_num - 1

        # Check if page number is valid
        if zero_idx_page >= len(reader.pages) or zero_idx_page < 0:
            print(f"Page number {page_num} out of range for {input_path} with {len(reader.pages)} pages")
            return False

        # Create a new PDF with just the selected page
        writer = pypdf.PdfWriter()
        writer.add_page(reader.pages[zero_idx_page])

        # Write the output PDF
        with open(output_path, "wb") as output_file:
            writer.write(output_file)

        return True
    except Exception as e:
        print(f"Error extracting page {page_num} from {input_path}: {str(e)}")
        return False


def process_pdf(pdf_info, args, client):
    """Process a single PDF, render a random page, and create an HTML template."""
    s3_path, index = pdf_info

    # Create a unique folder for each PDF in the temp directory
    pdf_id = f"pdf_{index:05d}"
    temp_pdf_dir = os.path.join(args.temp_dir, pdf_id)
    os.makedirs(temp_pdf_dir, exist_ok=True)

    # Download PDF to local temp directory
    local_pdf_path = os.path.join(temp_pdf_dir, "document.pdf")
    if not download_s3_pdf(s3_path, local_pdf_path):
        print(f"Failed to download PDF from {s3_path}")
        return None

    try:
        # Get page count using pypdf
        reader = pypdf.PdfReader(local_pdf_path)
        num_pages = len(reader.pages)

        if num_pages == 0:
            print(f"PDF has no pages: {s3_path}")
            return None

        # Select a random page
        page_num = random.randint(1, num_pages)

        # Render the page as a base64 PNG
        image_base64 = render_pdf_to_base64png(local_pdf_path, page_num, target_longest_image_dim=2048)

        # Generate HTML from the image
        html_content = generate_html_from_image(client, image_base64)
        if not html_content:
            print(f"Failed to generate HTML for {s3_path}, page {page_num}")
            return None

        # Create output directory
        templates_dir = os.path.join(args.output_dir, "templates")
        os.makedirs(templates_dir, exist_ok=True)

        # Save HTML to output directory
        html_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.html")
        with open(html_path, "w") as f:
            f.write(html_content)

        # Extract the page and save as PDF
        pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.pdf")
        if not extract_page_from_pdf(local_pdf_path, pdf_path, page_num):
            print(f"Failed to extract page {page_num} from {local_pdf_path}")

        return {"pdf_id": pdf_id, "s3_path": s3_path, "page_number": page_num, "html_path": html_path, "pdf_path": pdf_path}
    except Exception as e:
        print(f"Error processing {s3_path}: {e}")
        return None
    finally:
        # Clean up temp directory for this PDF
        if os.path.exists(temp_pdf_dir):
            subprocess.run(["rm", "-rf", temp_pdf_dir])


def main():
    parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates")
    parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
    parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    parser.add_argument("--parallel", type=int, default=1, help="Number of parallel threads to use")
    parser.add_argument("--api_key", help="Claude API key (or set ANTHROPIC_API_KEY environment variable)")
    args = parser.parse_args()

    # Ensure output and temp directories exist
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.temp_dir, exist_ok=True)

    # Get API key
    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: API key not provided. Use --api_key or set ANTHROPIC_API_KEY environment variable.")
        return

    # Initialize Claude client
    client = Anthropic(api_key=api_key)

    # Reservoir sampling implementation
    s3_paths = []
    with open(args.input_list, "r") as f:
        for i, line in enumerate(tqdm(f)):
            line = line.strip()
            if not line:
                continue

            if i < 100000:
                s3_paths.append(line)
            else:
                # Randomly replace elements with decreasing probability
                j = random.randint(0, i)
                if j < 100000:
                    s3_paths[j] = line

    print(f"Found {len(s3_paths)} PDF paths in input list")

    # Shuffle and limit to max_tests
    random.shuffle(s3_paths)
    s3_paths = s3_paths[: args.max_tests]

    # Process PDFs in parallel
    results = []
    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        # Submit all tasks
        futures = {executor.submit(process_pdf, (s3_path, i), args, client): s3_path for i, s3_path in enumerate(s3_paths)}

        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing PDFs"):
            s3_path = futures[future]
            try:
                result = future.result()
                if result:
                    results.append(result)
            except Exception as e:
                print(f"Error processing {s3_path}: {e}")

    print(f"Generated {len(results)} HTML templates")


if __name__ == "__main__":
    main()
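The path-sampling loop in main() above is standard reservoir sampling with a reservoir of 100000 paths. The same idea in isolation looks like this (a sketch; the range() stream is a stand-in for the input file):

import random

def reservoir_sample(stream, k):
    # Keep a uniform random sample of k items from a stream of unknown length
    reservoir = []
    for i, item in enumerate(stream):
        if i < k:
            reservoir.append(item)
        else:
            # Keep the new item with probability k / (i + 1)
            j = random.randint(0, i)
            if j < k:
                reservoir[j] = item
    return reservoir

print(reservoir_sample(range(1_000_000), 5))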
@@ -1,186 +0,0 @@
#!/usr/bin/env python3
import asyncio
import os
from pathlib import Path

from playwright.async_api import async_playwright

# Simple configuration
CONFIG = {
    "input_file": os.path.join(os.path.dirname(__file__), "templates", "listpage.js"),  # React component file
    "output_pdf": "book-page.pdf",  # Output PDF filename
    "temp_html": "temp-render.html",  # Temporary HTML file
    "wait_time": 1500,  # Time to wait for rendering (ms)
    "device_scale": 2,  # Resolution multiplier
    "debug": True,  # Keep temp files for debugging
}


async def create_html_file():
    """Create a temporary HTML file that loads the React component from a file."""
    try:
        # Check if input file exists
        input_path = Path(CONFIG["input_file"])
        if not input_path.exists():
            print(f"Error: Input file '{input_path}' not found")
            return False

        # Read the component file
        with open(input_path, "r", encoding="utf-8") as f:
            component_code = f.read()

        # Create HTML that will load our component
        html_content = (
            """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Book Page Template</title>
    <script src="https://unpkg.com/react@17/umd/react.development.js"></script>
    <script src="https://unpkg.com/react-dom@17/umd/react-dom.development.js"></script>
    <script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
    <style>
        * {
            box-sizing: border-box;
        }

        html, body {
            margin: 0;
            padding: 0;
            width: 8.5in;
            height: 11in;
            overflow: hidden;
        }

        #root {
            width: 100%;
            height: 100%;
            padding: 0.25in;
            overflow: hidden;
        }

        @media print {
            body {
                -webkit-print-color-adjust: exact;
                print-color-adjust: exact;
            }
        }
    </style>
</head>
<body>
    <div id="root"></div>

    <script type="text/babel">
        // The React component code loaded from external file
"""
            + component_code
            + """

        // Render only the book page part, not the controls
        ReactDOM.render(
            <BookPageTemplate />,
            document.getElementById('root')
        );
    </script>
</body>
</html>
"""
        )

        with open(CONFIG["temp_html"], "w", encoding="utf-8") as f:
            f.write(html_content)

        print(f"Created HTML file: {CONFIG['temp_html']}")
        print(f"Using React component from: {CONFIG['input_file']}")
        return True
    except Exception as e:
        print(f"Error creating HTML file: {e}")
        print(f"Exception details: {str(e)}")
        import traceback

        traceback.print_exc()
        return False


async def render_to_pdf():
    """Render the React component to PDF using Playwright."""
    try:
        # Create the HTML file first
        html_created = await create_html_file()
        if not html_created:
            print("Failed to create HTML file")
            return

        print("Launching browser...")
        async with async_playwright() as p:
            # Launch the browser with more debugging options
            browser = await p.chromium.launch(
                headless=True,  # True for production, False for debugging
            )

            # Create a new page for letter size paper
            page = await browser.new_page(viewport={"width": 816, "height": 1056}, device_scale_factor=CONFIG["device_scale"])  # 8.5in x 11in at 96dpi

            # Get absolute path to HTML file
            html_path = Path(CONFIG["temp_html"]).absolute()
            html_uri = f"file://{html_path}"

            print(f"Navigating to: {html_uri}")

            # Add event listeners for console messages and errors
            page.on("console", lambda msg: print(f"Browser console: {msg.text}"))
            page.on("pageerror", lambda err: print(f"Browser page error: {err}"))

            # Navigate with longer timeout and wait for network idle
            await page.goto(html_uri, wait_until="networkidle", timeout=30000)

            # Wait for React to render
            await page.wait_for_timeout(CONFIG["wait_time"])

            # Add a check to ensure the component rendered
            element_count = await page.evaluate(
                """() => {
                    const root = document.getElementById('root');
                    return root.childElementCount;
                }"""
            )

            if element_count == 0:
                print("Warning: No elements found in root. Component may not have rendered.")
            else:
                print(f"Found {element_count} elements in root. Component rendered successfully.")

            # Save debug screenshot
            if CONFIG["debug"]:
                await page.screenshot(path="debug-screenshot.png")
                print("Debug screenshot saved")

            # Generate PDF
            print("Generating PDF...")
            await page.pdf(path=CONFIG["output_pdf"], format="Letter", print_background=True, margin={"top": "0", "right": "0", "bottom": "0", "left": "0"})

            print(f"PDF generated successfully: {CONFIG['output_pdf']}")

            # Close the browser
            await browser.close()

            # Cleanup temp files if not in debug mode
            if not CONFIG["debug"] and Path(CONFIG["temp_html"]).exists():
                Path(CONFIG["temp_html"]).unlink()
                print("Temporary HTML file removed")

    except Exception as e:
        print(f"Error generating PDF: {e}")


if __name__ == "__main__":
    # Run the async function
    try:
        asyncio.run(render_to_pdf())
    except Exception as e:
        print(f"Fatal error: {e}")
        import traceback

        traceback.print_exc()
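For reference, the heart of this deleted script's flow (open a local HTML file in headless Chromium, wait for it to render, print to PDF) can be sketched in a few lines with Playwright's sync API; the file paths below are hypothetical:

from playwright.sync_api import sync_playwright

def html_to_pdf(html_path, pdf_path):
    # Render a local HTML file to a Letter-sized PDF in headless Chromium
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page(viewport={"width": 816, "height": 1056})  # 8.5in x 11in at 96dpi
        page.goto(f"file://{html_path}", wait_until="networkidle")
        page.pdf(path=pdf_path, format="Letter", print_background=True)
        browser.close()

html_to_pdf("/tmp/temp-render.html", "/tmp/book-page.pdf")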
@@ -1,214 +0,0 @@
//import React from 'react';

const BookPageTemplate = () => {
  // Only three state variables as requested
  const [title, setTitle] = React.useState("ADVENTURES OF DON QUIXOTE");
  const [pageNumber, setPageNumber] = React.useState("289");
  const [text, setText] = React.useState(
    "deed,\" said Don Quixote, \"thou hast hit the point, Sancho, which can alone shake my resolution; I neither can, nor ought to, draw my sword, as I have often told thee, against those who are not dubbed knights. To thee which I had premeditated, thy share of the booty would have been at least the emperor's crown of gold and Cupid's painted wings; for I would have plucked them off perforce, and delivered them into thy hands.\" \"The"
  );

  // Styles for heavily degraded scan effect
  const heavilyDegradedStyles = {
    filter: 'grayscale(30%) contrast(120%) brightness(85%) sepia(20%)',
    position: 'relative',
    backgroundColor: '#e6ddc6', // More yellowed aged paper
    backgroundImage: 'url("data:image/svg+xml,%3Csvg viewBox=\'0 0 200 200\' xmlns=\'http://www.w3.org/2000/svg\'%3E%3Cfilter id=\'noiseFilter\'%3E%3CfeTurbulence type=\'fractalNoise\' baseFrequency=\'0.85\' numOctaves=\'3\' stitchTiles=\'stitch\'/%3E%3C/filter%3E%3Crect width=\'100%25\' height=\'100%25\' filter=\'url(%23noiseFilter)\' opacity=\'0.25\'/%3E%3C/svg%3E")',
    boxShadow: 'inset 0 0 70px rgba(0, 0, 0, 0.3), 0 0 5px rgba(0,0,0,0.1)',
    padding: '32px',
    borderRadius: '2px',
    overflow: 'hidden',
    transform: 'rotate(0.3deg)', // Slightly askew scan
  };

  // Heavily degraded text
  const badScanTextStyle = {
    fontFamily: '"Times New Roman", serif',
    letterSpacing: '-0.01em',
    wordSpacing: '0.02em',
    fontWeight: '500',
    color: '#222222',
    textShadow: '0 0 1px rgba(0, 0, 0, 0.5)',
    transform: 'scale(1.01, 0.99) rotate(-0.4deg)', // Distorted proportions
  };

  // Random coffee stain effect
  const coffeeStain = {
    position: 'absolute',
    width: '100px',
    height: '80px',
    top: '25%',
    right: '15%',
    borderRadius: '50%',
    background: 'radial-gradient(ellipse at center, rgba(139,69,19,0.15) 0%, rgba(139,69,19,0.1) 50%, rgba(139,69,19,0.05) 70%, rgba(139,69,19,0) 100%)',
    transform: 'rotate(30deg) scale(1.5, 1)',
    pointerEvents: 'none',
    zIndex: 1,
  };

  // Water damage effect
  const waterDamage = {
    position: 'absolute',
    width: '70%',
    height: '40%',
    bottom: '10%',
    left: '5%',
    opacity: 0.07,
    background: 'radial-gradient(ellipse at center, rgba(0,0,0,0.2) 0%, rgba(0,0,0,0.1) 40%, rgba(0,0,0,0) 70%)',
    borderRadius: '40% 60% 70% 30% / 40% 50% 60% 50%',
    pointerEvents: 'none',
    zIndex: 1,
  };

  // Add fold lines
  const foldLine = {
    position: 'absolute',
    width: '100%',
    height: '3px',
    top: '30%',
    left: 0,
    background: 'linear-gradient(to right, rgba(0,0,0,0) 0%, rgba(0,0,0,0.03) 20%, rgba(0,0,0,0.08) 50%, rgba(0,0,0,0.03) 80%, rgba(0,0,0,0) 100%)',
    boxShadow: '0 1px 3px rgba(255,255,255,0.2)',
    pointerEvents: 'none',
    zIndex: 2,
  };

  // Torn edge effect
  const tornEdge = {
    position: 'absolute',
    top: 0,
    right: 0,
    width: '100%',
    height: '100%',
    background: 'linear-gradient(135deg, transparent 97%, #e6ddc6 97%, #e6ddc6 100%)',
    pointerEvents: 'none',
  };

  return (
    <div style={{
      maxWidth: '800px',
      margin: '0 auto',
      padding: '16px',
    }}>
      {/* Heavily degraded scan container */}
      <div style={heavilyDegradedStyles}>
        {/* Noise overlay */}
        <div style={{
          position: 'absolute',
          top: 0,
          left: 0,
          right: 0,
          bottom: 0,
background: 'url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAKpklEQVR4Xu2di3IbOQxD3f//6aTJJLF3vSRAAiTlvWy7lUSAD1KWc97b8/m8f7/+2xZg27fs/P/LvzClv+f77Hfz79eTP+pv/5ZlmPKZfZYp7eOsU8rrQ9fQ/r5+P/s7+/2M7lO+67kTvZfnqx4zXXtcz5To/TwZj2Uxn+FiJiDCPzecjXcEh30/gokAYvSeCVu0OaNrtV5F4I9jiAILu5AZYs8QiExIRZkRYFjKIgFUCsT0rH5EdM5oBUaRr8KnUgaNKzRfARkFRBlltQKr32OATwmp0hXTHxINSkkRSCzNZQmhnWyVnvmzwAqIrQr8AYgJwWz3smW9K0OxXTQTLhaQlQJZwmKKzIwtqqiVRVbCVS1ORpSZQbKCygLIErKVoiNZVT5eAcYEfaW41XQ1c31WAFZKZBVn5aQjpwb0mRJPCKkCiFKrUmL0PBGK1aFZ0XpCsb5SoROQGQBzRUaAMwavFJEZOlOwTNGjPK+EpVK2CjsGbDTXzgBW5RiZgaJ3VAc/U9RKkVjQTu7AZopdpVOVrmaUULGGBZClsRmFKtdWPYehMKk4Sksq0KuAK4WLSsmUORXDUlWXNX72OgZkbgADDDs22xGz7ytFZ9/HpKgUQkXhDMJnQihWqB1v9RlGx+VnMZRGimYO0qpQZsCyXaCFmqUHdn71OkaACOSsV6sC9qQQjpQzy+UM+aofYIXY0hDr3Uzg2S5mdF5e7+LQlVGl3E7KovLs9qoCFUK+otK7HZdRBstiTBGrgqzKrgjwSLlVSp1R8F36mik2C/hVYRdUvTtKkMYE2Z03rXw+9lPVWUrBS5TF0lFEhUwZ2WeZ4lQtpIUuZkBZhWaK04HK8s0sfTPFV8I+C2JViFXaOALEKB0pwcnOZDtHCa16nC3oah2Y8bKFnwlp1YpZJTtSOgPwhNKXC/yRUNVCZYqsqJQpdAc2o0ymWKrrxwrFgMwKDvvuLPVlBr+eY1WFUZS0o5+5S2GZwpVCzJQVFYhZKhUguZTFvr9S/Gq1qgylunZWObtSYpW6WOV4Zyy5lFU5JqPQrKqx37Pdzxbqbjo8SXMdmLOiUSk+UzgWuLlJPFNQpjzM2NXrGJDRsxlgrBVkSlQZpVJ0dp9ZsFW1WSmJgtGZqzrJnN7TrkpZlTHYztgBrPqeKRtTyAxIloKq65gLgA7Q3LBZ8ZcM/JfkJwDtKp4lA/99dZeOVoW+Sl1Z37JSFsvCEVAMRfNzqBP4jtIzBWJKrXb4TCksbTJAWdAiFMd0xyrOCVVVIClXUEzxo7L/dAR3UlNluBmQs8DqAOksyugeK5SrwJyJrS7Q3ABVt1vLTzMbHaU4tvuYMHagd471hEGrIBxV1NlcJ38ixNdSvQyWrFjAWYEaOhJjCsAqxsq5GUgzUCIU0Xt2+5eZXJUrwEpJmRBUVbdS0soJKoGqFmulBOV7suCvamDKnO0Bsi2R4QQeS0dq1WUVZKVEWcGqFnrVrph9TtN6FVSdwCrDVgqYpasjQFmLW6W0Wd9jO1dVthN0m52hYjuT/Z05aUdx5P0ZZd1jl84Cq65Rdh9TEhPk0B2ZYquKzWb8UegYU1U5nSm3U1k50aqm8NF8JUBYoLuXlhLEDJBWK2an4qyCdYTFFGp2PbJSklJAVCBnRYftbjWNR0Bm/cQpO7wdFKVDlZJUYO1CzXbo7O5mAl9V2syYXbhM5z0dWFUgrVAi291ZGqkEGF1z6uDkDn5mvFnqYcH4boecpQGWmzv3VB2jzL6vW2lWlXl1JZXdW7HqXgmlKlgMXUyJKiGKnMcoTWlSpbDZ96pAsOszR2R0ZAKv5nLmvdmO7ij3cUZYoUSWMthOYvJgdlCpV0UZA4y9SHJngcsJPyOXdO+t3jZ3KOgIO6kkdhhRVTu2AKptOKsyLZGw/JkJKkt9lRKdGpbthsrALJ1WjqUUXXXc3wHx6CpO5z6xM6YdBa+MxCprBmSHljrCVr1OUhVb/KqdxHR36iKuqpBVAJjQDuUhQWZVvFLE7G6kAtZqQVZCUFWSI4UiQFUKrQCWGTFTTpdCmXJm/iqJpxT2SBhPujPpXFzO0JzOq+ZOQHZS00zJMmOp1PNdqFkRnAk3qtbKcdrS01BFy6pWq+qOoVJkZoioILB01tmJrNJGBlLWrYtQrSgvU/Lqe1Xlnr5O6aQvluIYVQ/hjYJpFJBVvlUKzBQhcnIGEAuWSndRoFl6iypY5iqr8m/lhAhAFZBZWM7uFjrXZwuUKdGb5V7yI9VbHOyAplU7hxm+cp7ZBWWFQlSDzqgm25Gz76v616yTGfZk77FUlcx+GgZgZVz2HNN5CmKWypUDsiqwclalhJnTuPTELjJnO4p9dpailDGrRVFVaWawrrJUu3KF6pkyrISm6nMYEI9XVzuH5lSlKFrZGKvKYbteFZ+OMXYh9WYH/LHVM3BVA1e7r1rI6HXmAKzyRulH8bE1Tk8/yUxR7LM6VKCEF1WJrNBkipQJewVOJqQu0FnaZIWD7fIV5Tr/Vnql8Oy1sxTXVL2OroBjBqpaVNbROvexVYs5eyqKIU8FUlQcT9OWokyW0pmyqxVYpbU7FCWnl52WfqdqrkCsgMiqyumTTNV1R/nOSY87HbMKnQktC+g7I3VepVnbxFLiTiVlC6IKohKWqmpXwGALwnY3y9lZ2sgU74R6UjkYoEMFzQJydJ1SXSPadXaWiZHiZ+9nPuFrB8/Q0ExYjJKrjrQSqlJOlbKYkpEVGJBPwl6V6aFJZUyZ8VVPdHU4gBmUrYcKhC683cBmlK6EzhTUXXCsqKhAYnQfXt92/hy7UuDs2VUPwXZXB/BqIWeAZiCxnXbiYC5blKpvceYqBWAGYjuJKVS1ECrESmGnZdcpOmwlK0OehI9SAGYMFrAd51SLslLGDohq8WZ0nXl9q6jrpCY7kUYCxXKXKgRK0FW6ygTUVbzTKcZxOprB71JIR0GzHlplXpaO3lScr1RYtgD3NSwdMQCYMB4/l56lplOPxoxeUdqJA1ULnaXOanG7lFlRODPuzHc9jnxiFbLDAez1bv9QxlTXX81pLH2x/nI8l52S3v09ZQZaZVD2OpvDnWmuQlMJpgpStctWKWQEULkC60CvHHeaUpYK3G7/YGkuc0xXuSvQVqiLCeFMiGUBcBrgjgGjwFn9SZidoToBZRWYKS+bLxP42fMNFXxnHq5c3gClqnRKmahIVNVhhXTZnJmwMwEpZTsFRAFktTDsOqbQ7HeZwpxQ3ErZ7fSljFdV6Uw5qsaQKXMmdFagmELspr0lUYeCywLCBJ0FgBlYLYSiXBYY5QdCK6NSfcXQ4fMfuVZXYZ3AZemxMyhLZWrqUxUyC9BxL7NSIgWwSqmqwrM0lLU0pgRMaZiCd1KWuvZMOCrAMmEzYXeAejxtS0FQHZdVPJUyVa5nKYdVrZnAnNJ5FUgK9C7crJh1AIooMqPyI9mwO/bLKXMoaFVaUp2/Sl1K+mLBYym
pe2dT7e7KJ7FrKuVXlNZJb53GU22YDvUwIyp3gCoFzAydxS/rxu0aJqwqPVaC7N4/VvRUgdYB8Xo+u8nMDMUowexmzFn/OCnmaBFZwF4OXKFMpqDZLmKdxE7ZXQW6C3aFMqN7X+/3/QcB/G0D8kclnwAAAABJRU5ErkJggg==") repeat',
          opacity: 0.15,
          pointerEvents: 'none',
        }}></div>

        {/* Scan lines effect */}
        <div style={{
          position: 'absolute',
          top: 0,
          left: 0,
          right: 0,
          bottom: 0,
          background: 'linear-gradient(to bottom, rgba(0,0,0,0.03) 1px, transparent 1px)',
          backgroundSize: '100% 2px',
          opacity: 0.5,
          pointerEvents: 'none',
        }}></div>

        {/* Add coffee stain */}
        <div style={coffeeStain}></div>

        {/* Add water damage */}
        <div style={waterDamage}></div>

        {/* Add fold line */}
        <div style={foldLine}></div>

        {/* Add torn edge */}
        <div style={tornEdge}></div>

        {/* Header with skewed alignment */}
        <div style={{
          display: 'flex',
          justifyContent: 'space-between',
          alignItems: 'center',
          borderBottom: '2px solid #000',
          paddingBottom: '4px',
          marginBottom: '24px',
          position: 'relative',
          opacity: 0.8,
          transform: 'skew(-0.5deg, 0.3deg)',
        }}>
          <div style={{width: '48px'}}></div>
          <h1 style={{
            ...badScanTextStyle,
            fontSize: '20px',
            fontWeight: 'bold',
            textAlign: 'center',
            textTransform: 'uppercase',
            letterSpacing: '1px',
            opacity: 0.8,
          }}>{title}</h1>
          <div style={{
            ...badScanTextStyle,
            fontSize: '20px',
            fontWeight: 'bold',
            opacity: 0.85,
          }}>{pageNumber}</div>
        </div>

        {/* Horizontal divider with uneven quality */}
        <div style={{
          borderBottom: '1px solid #444',
          marginBottom: '24px',
          opacity: 0.6,
          filter: 'blur(0.3px)',
          transform: 'scaleY(1.5) skew(0.7deg)',
        }}></div>

        {/* Text content with severely degraded appearance */}
        <div style={{
          columnCount: 2,
          columnGap: '20px',
          columnRule: '1px solid rgba(0,0,0,0.1)',
          textAlign: 'justify',
          ...badScanTextStyle,
          fontSize: '16px',
          lineHeight: '1.5',
          opacity: 0.78,
          // Very uneven ink distribution with blurry and faded parts
          WebkitMaskImage: 'linear-gradient(to bottom, rgba(0,0,0,0.9), rgba(0,0,0,0.75) 50%, rgba(0,0,0,0.85))',
          // Text distortion
          filter: 'blur(0.2px)',
        }}>
          {/* Bad scan text with random character fading */}
          <p>{text.split('').map((char, index) => {
            const opacity = Math.random() > 0.8 ? 0.4 + Math.random() * 0.5 : 0.9 + Math.random() * 0.1;
            const blur = Math.random() > 0.95 ? 1 : 0;
            return <span key={index} style={{opacity, filter: `blur(${blur}px)`}}>{char}</span>;
          })}</p>
        </div>

        {/* Extra random ink spill */}
        <div style={{
          position: 'absolute',
          width: '10px',
          height: '20px',
          top: '60%',
          left: '25%',
          background: 'rgba(0,0,0,0.3)',
          borderRadius: '50%',
          transform: 'rotate(30deg)',
          filter: 'blur(1px)',
          zIndex: 3,
        }}></div>
      </div>

    </div>
  );
};

//export default BookPageTemplate;
window.BookPageTemplate = BookPageTemplate;
@@ -1,83 +0,0 @@
//import React from 'react';

const PermitGuidelinesTemplate = () => {
  // Sample data - you can replace these with your own
  const guidelineItems = [
    {
      number: 'iii.',
      content: 'Not rely on personal preference or opinion, or regional interpretation of statute, regulation or guidance that is inconsistent with the Department\'s statewide interpretation. Staff should confer with the appropriate Bureau Director as necessary.'
    },
    {
      number: 'iv.',
      content: 'Process technically adequate and scientifically sound applications for final approval to minimize elapsed time in accordance with the Permit Decision Guarantee.'
    },
    {
      number: 'v.',
      content: 'Where the Application Manager determines that the technical information submitted with the application does not meet technical guidance or standards published by the Department, the application must provide the scientific or engineering basis to support the application. Note that deviations from technical guidance can generally be approved, by the appropriate section chief and manager, when warranted, provided acceptable justification has been submitted. Minor deficiencies that can be easily corrected should be addressed through a telephone call with the applicant and consultant, and may negate the need for a deficiency letter. The Program Manager or District Manager will be responsible for making that decision.'
    },
    {
      number: 'vi.',
      content: 'If an application fails to provide the technical information necessary to document that applicable regulatory and statutory requirements will be achieved, it is technically deficient and the Application Manager will prepare a technical deficiency letter. Again, all deficiencies noted must cite the statutory or regulatory obligation that the application has failed to meet and the Section Chief and the Program Manager will routinely review these letters. For District Oil and Gas Offices and District Mining Offices the Permits Chief and the Manager will review the letters.'
    },
    {
      number: 'vii.',
      content: 'Applicant responses that do not make the application technically adequate within the established response timeframe will be subject to the Elevated Review Process below. Applications that are made technically adequate within the established response timeframe will proceed to processing for final action.'
    }
  ];

  // Footnote data
  const footnote = {
    number: '2',
    content: 'More technically complex projects and applications may receive additional deficiency letters as appropriate prior to a decision point. This exception will not void inclusion in the Permit Decision Guarantee and will follow program specific guidance that is developed. The more technically complex projects and applications are noted with an asterisk ("*") in Appendix A.'
  };

  // Document info
  const documentInfo = "021-2100-001 / November 2, 2012 / Page 11";

  // Special note about technical deficiency letter
  const technicalDeficiencyNote = {
    prefix: 'One',
    superscript: '2',
    content: ' technical deficiency letter will be sent. Each deficiency cited must note the statute, regulation or technical guidance provision. Technical guidance provides a means to compliance, but may not be used or cited when issuing a permit denial. The letter will state, as necessary, that the Permit Decision Guarantee is no longer applicable and offer the applicant an opportunity to meet and discuss the deficiencies. The letter will include a deadline for submission of the deficient information.'
  };

  return (
    <div className="bg-white p-8 max-w-4xl mx-auto font-serif text-black">
      <div className="mb-8">
        {guidelineItems.map((item, index) => (
          <div key={index} className="mb-6 flex">
            <div className="w-12 flex-shrink-0 font-bold">{item.number}</div>
            <div className="flex-grow">{item.content}</div>
          </div>
        ))}

        {/* Technical deficiency letter note */}
        <div className="mb-6 ml-12">
          <p>
            {technicalDeficiencyNote.prefix}
            <sup>{technicalDeficiencyNote.superscript}</sup>
            {technicalDeficiencyNote.content}
          </p>
        </div>
      </div>

      {/* Horizontal line */}
      <div className="border-t border-gray-400 my-6"></div>

      {/* Footnote section */}
      <div className="text-sm">
        <p>
          <sup>{footnote.number}</sup> {footnote.content}
        </p>
      </div>

      {/* Document info */}
      <div className="text-center mt-6 text-sm">
        {documentInfo}
      </div>
    </div>
  );
};

//export default PermitGuidelinesTemplate;
window.BookPageTemplate = PermitGuidelinesTemplate;