Add early code for mining HTML templates from pages, plus medoid-picking code (pick_mediod.py)

This commit is contained in:
Jake Poznanski 2025-03-21 17:51:29 +00:00
parent 58276b04cb
commit 1f77aab75a
5 changed files with 404 additions and 483 deletions

View File

@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""
pick_mediod.py - Identify representative examples from repeated OCR outputs
This code will take as arguments two directories:
--input and --output
Each of those is going to be a directory that was generated by convert.py and is a candidate to be evaluated as part of benchmark.py
What it will do is find and group all of the .md files into their repeats
ex. input_dir/tables/buildingnotes_pg1_repeat1.md, input_dir/tables/buildingnotes_pg1_repeat2.md, etc.
Then, for each repeat, it will use string similarity metrics to calculate the edit distance to every other repeat
The repeat with the lowest mean edit distance will then get output as ..._repeat1.md in the output folder
"""
import argparse
import glob
import os
import re
import shutil
from typing import Dict, List
from rapidfuzz import distance as fuzz_distance
from tqdm import tqdm
def compute_distance(text1: str, text2: str) -> float:
    """
    Return the normalized Levenshtein distance between two strings.

    The computation is delegated to rapidfuzz; the result is 0.0 for identical
    strings and 1.0 for completely different ones.
    """
    levenshtein = fuzz_distance.Levenshtein
    return levenshtein.normalized_distance(text1, text2)
def find_mediod(texts: List[str]) -> int:
    """
    Return the index of the mediod of *texts*.

    The mediod is the text whose mean distance to every other text is smallest.
    An empty list yields -1 and a single text yields 0. When several texts tie,
    the earliest one wins.
    """
    if not texts:
        return -1

    count = len(texts)
    if count == 1:
        return 0

    # Symmetric pairwise-distance matrix; the diagonal (distance to self) stays 0.0.
    matrix = [[0.0] * count for _ in range(count)]
    for row in range(count):
        for col in range(row + 1, count):
            matrix[row][col] = matrix[col][row] = compute_distance(texts[row], texts[col])

    # Dividing by count - 1 leaves the zero self-distance out of the mean.
    mean_distances = [sum(row_values) / (count - 1) for row_values in matrix]

    # min() over indices returns the first index holding the smallest mean.
    return min(range(count), key=mean_distances.__getitem__)
def group_repeats(md_files: List[str]) -> Dict[str, List[str]]:
    """
    Bucket markdown paths by file name with the ``_repeatN.md`` suffix removed.

    Only the file name (not its directory) forms the key. A file name without a
    repeat suffix is used as its own key unchanged. Paths keep their input order
    within each bucket.
    """
    repeat_suffix = re.compile(r"_repeat\d+\.md$")
    buckets: Dict[str, List[str]] = {}
    for candidate in md_files:
        key = repeat_suffix.sub("", os.path.basename(candidate))
        buckets.setdefault(key, []).append(candidate)
    return buckets
def main():
    """
    Command-line entry point: copy the mediod of each group of repeated OCR outputs.

    Recursively collects the .md files under --input, groups them with
    group_repeats, and for every group with at least --min_repeats readable
    files copies the file selected by find_mediod into --output, renamed to
    ..._repeat1.md and placed in the same sub-directory it had under --input.
    """
    parser = argparse.ArgumentParser(description="Find mediod (most representative) examples from repeated OCR outputs.")
    parser.add_argument(
        "--input", type=str, required=True, help="Path to the directory containing repeated OCR outputs (e.g., *_repeat1.md, *_repeat2.md, etc.)"
    )
    parser.add_argument("--output", type=str, required=True, help="Path to the directory where mediod examples will be copied")
    parser.add_argument("--min_repeats", type=int, default=3, help="Minimum number of repeats required to compute a mediod (default: 3)")
    args = parser.parse_args()

    input_dir = args.input
    output_dir = args.output
    min_repeats = args.min_repeats

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Find all markdown files in the input directory (recursive)
    md_files = glob.glob(os.path.join(input_dir, "**/*.md"), recursive=True)
    if not md_files:
        print(f"No markdown files found in {input_dir}")
        return

    # Group files by their base name
    grouped_files = group_repeats(md_files)

    successful = 0
    skipped = 0
    print(f"Found {len(grouped_files)} unique test cases with repeats")

    for base_name, file_paths in tqdm(grouped_files.items(), desc="Processing test cases"):
        # Skip if there aren't enough repeats
        if len(file_paths) < min_repeats:
            print(f"Skipping {base_name}: only {len(file_paths)} repeats (minimum {min_repeats} required)")
            skipped += 1
            continue

        # Read every repeat. `readable_paths` is kept index-aligned with `texts`
        # so that a file that fails to load cannot shift the mediod index onto
        # a different file. Sorting makes tie-breaking independent of glob order.
        readable_paths = []
        texts = []
        for path in sorted(file_paths):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                print(f"Error reading {path}: {e}")
                continue
            readable_paths.append(path)
            texts.append(content)

        # The minimum applies to what could actually be compared.
        if len(texts) < min_repeats:
            print(f"Skipping {base_name}: only {len(texts)} readable repeats (minimum {min_repeats} required)")
            skipped += 1
            continue

        # Find the mediod
        mediod_idx = find_mediod(texts)
        if mediod_idx == -1:
            print(f"Failed to find mediod for {base_name}")
            skipped += 1
            continue

        mediod_path = readable_paths[mediod_idx]

        # Preserve the directory structure relative to input_dir. relpath works
        # for relative and absolute inputs alike, so the layout is kept either
        # way and same-named files from different sub-directories cannot
        # overwrite each other in the output root.
        rel_path = os.path.relpath(mediod_path, input_dir)
        # Change the repeat number to 1 in the output filename
        output_filename = re.sub(r"_repeat\d+\.md$", "_repeat1.md", os.path.basename(rel_path))
        output_path = os.path.join(output_dir, os.path.dirname(rel_path), output_filename)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Copy the mediod file
        try:
            shutil.copy2(mediod_path, output_path)
            successful += 1
        except Exception as e:
            print(f"Error copying {mediod_path} to {output_path}: {e}")

    print(f"Processing complete: {successful} mediods copied, {skipped} cases skipped")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,237 @@
import argparse
import concurrent.futures
import os
import random
import subprocess
from concurrent.futures import ThreadPoolExecutor
import pypdf
from anthropic import Anthropic
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png
def download_s3_pdf(s3_path, local_path):
    """
    Download a PDF from S3 to a local path using the ``aws`` command-line tool.

    Args:
        s3_path: Source location passed to ``aws s3 cp``.
        local_path: Destination file; its parent directory is created if needed.

    Returns:
        bool: True when ``aws s3 cp`` exits with status 0, False otherwise
        (including when the ``aws`` executable cannot be started).
    """
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        result = subprocess.run(["aws", "s3", "cp", s3_path, local_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except OSError as e:
        # e.g. the aws CLI is not installed; report it through the boolean contract.
        print(f"Could not run 'aws s3 cp' for {s3_path}: {e}")
        return False

    if result.returncode != 0:
        # Surface the CLI's own explanation instead of discarding it.
        details = result.stderr.strip()
        if details:
            print(f"aws s3 cp failed for {s3_path}: {details}")
        return False
    return True
def _strip_code_fence(text):
    """Return *text* without the Markdown code fence around its payload, if any."""
    fence = "```"
    opener = "```html" if "```html" in text else fence
    open_at = text.find(opener)
    if open_at == -1:
        return text

    start = open_at + len(opener)
    end = text.rfind(fence)
    if end >= start:
        return text[start:end].strip()

    # Only one fence is present, e.g. the reply was cut off before the closing
    # fence. Keep whichever side of it holds the content rather than returning
    # the marker as part of the HTML.
    tail = text[start:].strip()
    if tail:
        return tail
    return text[:open_at].strip() or text


def generate_html_from_image(client, image_base64):
    """
    Call Claude API to generate HTML from an image.

    Args:
        client: Object exposing ``messages.create(...)`` (the Anthropic client).
        image_base64: Base64-encoded PNG of the page to transcribe.

    Returns:
        The HTML text with any surrounding Markdown code fence removed, or None
        when the API call or response handling raises.
    """
    try:
        response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=4000,
            temperature=0.2,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {
                            "type": "text",
                            "text": "Render this document as clean, semantic HTML. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc. "
                            "Use the <header> and <footer> tags to represent content at the top/bottom which would not normally be part of the main content, such as page numbers, etc. "
                            "Use a placeholder <div> tag with class 'image' which will render as a grey box with black outline to make sure images have their original size, shape, and position on the page. "
                            "Preserve any multi-column layouts exactly as they appear. "
                            "Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible. ",
                        },
                    ],
                }
            ],
        )

        # A reply that hit the token limit is incomplete HTML; say so, but still
        # return what was produced so the caller can decide.
        if getattr(response, "stop_reason", None) == "max_tokens":
            print("Warning: Claude response was truncated at max_tokens; HTML may be incomplete")

        # Concatenate the text parts of the response
        html_content = "".join(part.text for part in response.content if part.type == "text")

        # Extract the payload if the HTML is wrapped in a code block
        return _strip_code_fence(html_content)
    except Exception as e:
        print(f"Error calling Claude API: {e}")
        return None
def extract_page_from_pdf(input_path, output_path, page_num):
    """
    Extract a specific page from a PDF and save it as a new PDF.

    Args:
        input_path: Path to the input PDF
        output_path: Path to save the extracted page
        page_num: The page number to extract (1-indexed, converted to 0-indexed for pypdf)

    Returns:
        bool: True if extraction was successful, False otherwise (errors are
        printed, never raised)
    """
    try:
        # Ensure output directory exists. dirname is "" for a bare filename, and
        # os.makedirs("") raises, which used to make such outputs always fail.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Read the input PDF
        reader = pypdf.PdfReader(input_path)

        # Convert to 0-indexed for pypdf
        zero_idx_page = page_num - 1

        # Check if page number is valid
        if zero_idx_page >= len(reader.pages) or zero_idx_page < 0:
            print(f"Page number {page_num} out of range for {input_path} with {len(reader.pages)} pages")
            return False

        # Create a new PDF with just the selected page
        writer = pypdf.PdfWriter()
        writer.add_page(reader.pages[zero_idx_page])

        # Write the output PDF
        with open(output_path, "wb") as output_file:
            writer.write(output_file)
        return True
    except Exception as e:
        print(f"Error extracting page {page_num} from {input_path}: {str(e)}")
        return False
def process_pdf(pdf_info, args, client):
    """
    Process a single PDF, render a random page, and create an HTML template.

    Args:
        pdf_info: ``(s3_path, index)`` tuple; the index names the per-PDF temp folder and output files.
        args: Parsed command-line arguments; ``temp_dir`` and ``output_dir`` are read.
        client: Claude client handed to generate_html_from_image.

    Returns:
        A dict with ``pdf_id``, ``s3_path``, ``page_number``, ``html_path`` and
        ``pdf_path`` when both the HTML and the single-page PDF were written,
        otherwise None.
    """
    import shutil  # local import: only needed for temp-dir cleanup in this function

    s3_path, index = pdf_info

    # Create a unique folder for each PDF in the temp directory
    pdf_id = f"pdf_{index:05d}"
    temp_pdf_dir = os.path.join(args.temp_dir, pdf_id)
    os.makedirs(temp_pdf_dir, exist_ok=True)
    local_pdf_path = os.path.join(temp_pdf_dir, "document.pdf")

    try:
        # Download inside the try so the temp folder is removed on failure too.
        if not download_s3_pdf(s3_path, local_pdf_path):
            print(f"Failed to download PDF from {s3_path}")
            return None

        # Get page count using pypdf
        reader = pypdf.PdfReader(local_pdf_path)
        num_pages = len(reader.pages)
        if num_pages == 0:
            print(f"PDF has no pages: {s3_path}")
            return None

        # Select a random page (1-indexed)
        page_num = random.randint(1, num_pages)

        # Render the page as a base64 PNG
        image_base64 = render_pdf_to_base64png(local_pdf_path, page_num, target_longest_image_dim=2048)

        # Generate HTML from the image
        html_content = generate_html_from_image(client, image_base64)
        if not html_content:
            print(f"Failed to generate HTML for {s3_path}, page {page_num}")
            return None

        # Create output directory
        templates_dir = os.path.join(args.output_dir, "templates")
        os.makedirs(templates_dir, exist_ok=True)

        # Save HTML to output directory. UTF-8 is explicit so non-ASCII page
        # text does not depend on the locale's default encoding.
        html_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.html")
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html_content)

        # Extract the page and save as PDF
        pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.pdf")
        if not extract_page_from_pdf(local_pdf_path, pdf_path, page_num):
            print(f"Failed to extract page {page_num} from {local_pdf_path}")
            # Do not report (or leave behind) a template whose PDF is missing.
            for leftover in (html_path, pdf_path):
                try:
                    os.remove(leftover)
                except OSError:
                    pass
            return None

        return {"pdf_id": pdf_id, "s3_path": s3_path, "page_number": page_num, "html_path": html_path, "pdf_path": pdf_path}
    except Exception as e:
        print(f"Error processing {s3_path}: {e}")
        return None
    finally:
        # Clean up temp directory for this PDF (portable replacement for `rm -rf`)
        shutil.rmtree(temp_pdf_dir, ignore_errors=True)
def _sample_s3_paths(lines, capacity=100000):
    """Reservoir-sample at most *capacity* stripped, non-blank entries from *lines*."""
    sample = []
    seen = 0  # counts only non-blank lines, so it always matches the reservoir's fill level
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        if seen < capacity:
            sample.append(line)
        else:
            # Randomly replace elements with decreasing probability
            j = random.randint(0, seen)
            if j < capacity:
                sample[j] = line
        seen += 1
    return sample


def main():
    """
    Command-line entry point: sample PDFs from a list and generate an HTML template for each.

    Reads S3 paths from --input_list (keeping a random sample of at most 100000),
    shuffles them, truncates to --max_tests, and runs process_pdf on each using a
    thread pool of --parallel workers.
    """
    parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates")
    parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
    parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    parser.add_argument("--parallel", type=int, default=1, help="Number of parallel threads to use")
    parser.add_argument("--api_key", help="Claude API key (or set ANTHROPIC_API_KEY environment variable)")
    args = parser.parse_args()

    # Ensure output and temp directories exist
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.temp_dir, exist_ok=True)

    # Get API key
    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: API key not provided. Use --api_key or set ANTHROPIC_API_KEY environment variable.")
        return

    # Initialize Claude client
    client = Anthropic(api_key=api_key)

    # Reservoir sampling. The position counter must skip blank lines: counting
    # them let the replacement index point past the end of a reservoir that was
    # not yet full (IndexError) and skewed the sampling probabilities.
    with open(args.input_list, "r") as f:
        s3_paths = _sample_s3_paths(tqdm(f))
    print(f"Found {len(s3_paths)} PDF paths in input list")

    # Shuffle and limit to max_tests
    random.shuffle(s3_paths)
    s3_paths = s3_paths[: args.max_tests]

    # Process PDFs in parallel
    results = []
    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        # Submit all tasks
        futures = {executor.submit(process_pdf, (s3_path, i), args, client): s3_path for i, s3_path in enumerate(s3_paths)}

        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing PDFs"):
            s3_path = futures[future]
            try:
                result = future.result()
                if result:
                    results.append(result)
            except Exception as e:
                print(f"Error processing {s3_path}: {e}")

    print(f"Generated {len(results)} HTML templates")


if __name__ == "__main__":
    main()

View File

@ -1,186 +0,0 @@
#!/usr/bin/env python3
import asyncio
import os
from pathlib import Path
from playwright.async_api import async_playwright
# Simple configuration
# Settings read by create_html_file and render_to_pdf.
CONFIG = dict(
    # React component file, looked up in the "templates" folder next to this script
    input_file=os.path.join(os.path.dirname(__file__), "templates", "listpage.js"),
    # Output PDF filename
    output_pdf="book-page.pdf",
    # Temporary HTML file
    temp_html="temp-render.html",
    # Time to wait for rendering (ms)
    wait_time=1500,
    # Resolution multiplier
    device_scale=2,
    # Keep temp files for debugging
    debug=True,
)
async def create_html_file():
    """
    Write the temporary HTML page that hosts the React component.

    The component source named by CONFIG["input_file"] is inlined into a Babel
    script block between a fixed page prefix and suffix, and the result is
    saved to CONFIG["temp_html"]. Returns True on success; on a missing input
    file or any exception it prints the problem and returns False.
    """
    # Everything up to the point where the component source is spliced in.
    page_prefix = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Book Page Template</title>
<script src="https://unpkg.com/react@17/umd/react.development.js"></script>
<script src="https://unpkg.com/react-dom@17/umd/react-dom.development.js"></script>
<script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
<style>
* {
box-sizing: border-box;
}
html, body {
margin: 0;
padding: 0;
width: 8.5in;
height: 11in;
overflow: hidden;
}
#root {
width: 100%;
height: 100%;
padding: 0.25in;
overflow: hidden;
}
@media print {
body {
-webkit-print-color-adjust: exact;
print-color-adjust: exact;
}
}
</style>
</head>
<body>
<div id="root"></div>
<script type="text/babel">
// The React component code loaded from external file
"""
    # Mounts the component and closes the document.
    page_suffix = """
// Render only the book page part, not the controls
ReactDOM.render(
<BookPageTemplate />,
document.getElementById('root')
);
</script>
</body>
</html>
"""
    try:
        # Check if input file exists
        component_path = Path(CONFIG["input_file"])
        if not component_path.exists():
            print(f"Error: Input file '{component_path}' not found")
            return False

        # Read the component file and wrap it in the page
        component_code = component_path.read_text(encoding="utf-8")
        document = page_prefix + component_code + page_suffix

        Path(CONFIG["temp_html"]).write_text(document, encoding="utf-8")

        print(f"Created HTML file: {CONFIG['temp_html']}")
        print(f"Using React component from: {CONFIG['input_file']}")
        return True
    except Exception as error:
        print(f"Error creating HTML file: {error}")
        print(f"Exception details: {str(error)}")
        import traceback

        traceback.print_exc()
        return False
async def render_to_pdf():
    """
    Render the React component to PDF using Playwright.

    Builds the temporary HTML page, opens it in headless Chromium with a
    Letter-sized viewport, reports how many elements were mounted under #root,
    optionally saves a debug screenshot, and writes CONFIG["output_pdf"].
    Any exception is printed rather than raised.
    """
    # Counts the children mounted under #root to detect a failed render.
    count_root_children_js = """() => {
const root = document.getElementById('root');
return root.childElementCount;
}"""
    try:
        # Create the HTML file first
        if not await create_html_file():
            print("Failed to create HTML file")
            return

        print("Launching browser...")
        async with async_playwright() as playwright:
            # headless=True for production, False for debugging
            browser = await playwright.chromium.launch(headless=True)

            # 8.5in x 11in at 96dpi
            letter_viewport = {"width": 816, "height": 1056}
            page = await browser.new_page(viewport=letter_viewport, device_scale_factor=CONFIG["device_scale"])

            # Get absolute path to HTML file
            html_uri = f"file://{Path(CONFIG['temp_html']).absolute()}"
            print(f"Navigating to: {html_uri}")

            # Add event listeners for console messages and errors
            page.on("console", lambda msg: print(f"Browser console: {msg.text}"))
            page.on("pageerror", lambda err: print(f"Browser page error: {err}"))

            # Navigate with longer timeout and wait for network idle
            await page.goto(html_uri, wait_until="networkidle", timeout=30000)

            # Wait for React to render
            await page.wait_for_timeout(CONFIG["wait_time"])

            # Check that the component rendered
            mounted = await page.evaluate(count_root_children_js)
            if mounted == 0:
                print("Warning: No elements found in root. Component may not have rendered.")
            else:
                print(f"Found {mounted} elements in root. Component rendered successfully.")

            # Save debug screenshot
            if CONFIG["debug"]:
                await page.screenshot(path="debug-screenshot.png")
                print("Debug screenshot saved")

            # Generate PDF
            print("Generating PDF...")
            no_margin = {"top": "0", "right": "0", "bottom": "0", "left": "0"}
            await page.pdf(path=CONFIG["output_pdf"], format="Letter", print_background=True, margin=no_margin)
            print(f"PDF generated successfully: {CONFIG['output_pdf']}")

            # Close the browser
            await browser.close()

        # Cleanup temp files if not in debug mode
        if not CONFIG["debug"]:
            temp_html = Path(CONFIG["temp_html"])
            if temp_html.exists():
                temp_html.unlink()
                print("Temporary HTML file removed")
    except Exception as error:
        print(f"Error generating PDF: {error}")


if __name__ == "__main__":
    # Run the async function
    try:
        asyncio.run(render_to_pdf())
    except Exception as fatal:
        print(f"Fatal error: {fatal}")
        import traceback

        traceback.print_exc()

View File

@ -1,214 +0,0 @@
//import React from 'react';
const BookPageTemplate = () => {
// Only three state variables as requested: the running header title, the page
// number shown at the right of the header, and the body text.
// The setters (setTitle, setPageNumber, setText) are not called anywhere in this
// component, so each value stays at the default given here.
const [title, setTitle] = React.useState("ADVENTURES OF DON QUIXOTE");
const [pageNumber, setPageNumber] = React.useState("289");
const [text, setText] = React.useState(
  "deed,\" said Don Quixote, \"thou hast hit the point, Sancho, which can alone shake my resolution; I neither can, nor ought to, draw my sword, as I have often told thee, against those who are not dubbed knights. To thee which I had premeditated, thy share of the booty would have been at least the emperor's crown of gold and Cupid's painted wings; for I would have plucked them off perforce, and delivered them into thy hands.\" \"The"
);
// Styles for heavily degraded scan effect.
// Applied to the container div that wraps every overlay and the page content.
const heavilyDegradedStyles = {
  filter: 'grayscale(30%) contrast(120%) brightness(85%) sepia(20%)',
  position: 'relative', // positioning context for the absolutely positioned overlays
  backgroundColor: '#e6ddc6', // More yellowed aged paper
  // Inline SVG (URL-encoded) that draws fractal-noise turbulence as the paper texture
  backgroundImage: 'url("data:image/svg+xml,%3Csvg viewBox=\'0 0 200 200\' xmlns=\'http://www.w3.org/2000/svg\'%3E%3Cfilter id=\'noiseFilter\'%3E%3CfeTurbulence type=\'fractalNoise\' baseFrequency=\'0.85\' numOctaves=\'3\' stitchTiles=\'stitch\'/%3E%3C/filter%3E%3Crect width=\'100%25\' height=\'100%25\' filter=\'url(%23noiseFilter)\' opacity=\'0.25\'/%3E%3C/svg%3E")',
  boxShadow: 'inset 0 0 70px rgba(0, 0, 0, 0.3), 0 0 5px rgba(0,0,0,0.1)',
  padding: '32px',
  borderRadius: '2px',
  overflow: 'hidden',
  transform: 'rotate(0.3deg)', // Slightly askew scan
};
// Heavily degraded text.
// Shared base style: it is spread into the title <h1>, the page-number div and
// the two-column text container, each of which then overrides individual
// properties (e.g. fontSize, fontWeight) after the spread.
const badScanTextStyle = {
  fontFamily: '"Times New Roman", serif',
  letterSpacing: '-0.01em',
  wordSpacing: '0.02em',
  fontWeight: '500',
  color: '#222222',
  textShadow: '0 0 1px rgba(0, 0, 0, 0.5)',
  transform: 'scale(1.01, 0.99) rotate(-0.4deg)', // Distorted proportions
};
// Coffee stain effect.
// Despite the original "random" label, position and size are fixed values:
// an absolutely positioned brown radial gradient in the upper-right area.
const coffeeStain = {
  position: 'absolute',
  width: '100px',
  height: '80px',
  top: '25%',
  right: '15%',
  borderRadius: '50%',
  background: 'radial-gradient(ellipse at center, rgba(139,69,19,0.15) 0%, rgba(139,69,19,0.1) 50%, rgba(139,69,19,0.05) 70%, rgba(139,69,19,0) 100%)',
  transform: 'rotate(30deg) scale(1.5, 1)',
  pointerEvents: 'none', // purely decorative: never intercepts pointer input
  zIndex: 1,
};
// Water damage effect.
// A large, very faint (opacity 0.07) dark radial gradient with an irregular
// border radius, anchored near the bottom-left of the page.
const waterDamage = {
  position: 'absolute',
  width: '70%',
  height: '40%',
  bottom: '10%',
  left: '5%',
  opacity: 0.07,
  background: 'radial-gradient(ellipse at center, rgba(0,0,0,0.2) 0%, rgba(0,0,0,0.1) 40%, rgba(0,0,0,0) 70%)',
  borderRadius: '40% 60% 70% 30% / 40% 50% 60% 50%',
  pointerEvents: 'none', // purely decorative: never intercepts pointer input
  zIndex: 1,
};
// Fold line.
// A single full-width, 3px-high horizontal band at 30% from the top; the
// gradient is darkest in the middle and fades to transparent at both edges.
const foldLine = {
  position: 'absolute',
  width: '100%',
  height: '3px',
  top: '30%',
  left: 0,
  background: 'linear-gradient(to right, rgba(0,0,0,0) 0%, rgba(0,0,0,0.03) 20%, rgba(0,0,0,0.08) 50%, rgba(0,0,0,0.03) 80%, rgba(0,0,0,0) 100%)',
  boxShadow: '0 1px 3px rgba(255,255,255,0.2)',
  pointerEvents: 'none', // purely decorative: never intercepts pointer input
  zIndex: 2, // above the coffee stain and water damage overlays (zIndex 1)
};
// Torn edge effect.
// Full-size overlay whose 135deg gradient is transparent for the first 97% and
// then switches to #e6ddc6, the same colour used as the container background.
const tornEdge = {
  position: 'absolute',
  top: 0,
  right: 0,
  width: '100%',
  height: '100%',
  background: 'linear-gradient(135deg, transparent 97%, #e6ddc6 97%, #e6ddc6 100%)',
  pointerEvents: 'none', // purely decorative: never intercepts pointer input
};
return (
<div style={{
maxWidth: '800px',
margin: '0 auto',
padding: '16px',
}}>
{/* Heavily degraded scan container */}
<div style={heavilyDegradedStyles}>
{/* Noise overlay */}
<div style={{
position: 'absolute',
top: 0,
left: 0,
right: 0,
bottom: 0,
background: 'url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAKpklEQVR4Xu2di3IbOQxD3f//6aTJJLF3vSRAAiTlvWy7lUSAD1KWc97b8/m8f7/+2xZg27fs/P/LvzClv+f77Hfz79eTP+pv/5ZlmPKZfZYp7eOsU8rrQ9fQ/r5+P/s7+/2M7lO+67kTvZfnqx4zXXtcz5To/TwZj2Uxn+FiJiDCPzecjXcEh30/gokAYvSeCVu0OaNrtV5F4I9jiAILu5AZYs8QiExIRZkRYFjKIgFUCsT0rH5EdM5oBUaRr8KnUgaNKzRfARkFRBlltQKr32OATwmp0hXTHxINSkkRSCzNZQmhnWyVnvmzwAqIrQr8AYgJwWz3smW9K0OxXTQTLhaQlQJZwmKKzIwtqqiVRVbCVS1ORpSZQbKCygLIErKVoiNZVT5eAcYEfaW41XQ1c31WAFZKZBVn5aQjpwb0mRJPCKkCiFKrUmL0PBGK1aFZ0XpCsb5SoROQGQBzRUaAMwavFJEZOlOwTNGjPK+EpVK2CjsGbDTXzgBW5RiZgaJ3VAc/U9RKkVjQTu7AZopdpVOVrmaUULGGBZClsRmFKtdWPYehMKk4Sksq0KuAK4WLSsmUORXDUlWXNX72OgZkbgADDDs22xGz7ytFZ9/HpKgUQkXhDMJnQihWqB1v9RlGx+VnMZRGimYO0qpQZsCyXaCFmqUHdn71OkaACOSsV6sC9qQQjpQzy+UM+aofYIXY0hDr3Uzg2S5mdF5e7+LQlVGl3E7KovLs9qoCFUK+otK7HZdRBstiTBGrgqzKrgjwSLlVSp1R8F36mik2C/hVYRdUvTtKkMYE2Z03rXw+9lPVWUrBS5TF0lFEhUwZ2WeZ4lQtpIUuZkBZhWaK04HK8s0sfTPFV8I+C2JViFXaOALEKB0pwcnOZDtHCa16nC3oah2Y8bKFnwlp1YpZJTtSOgPwhNKXC/yRUNVCZYqsqJQpdAc2o0ymWKrrxwrFgMwKDvvuLPVlBr+eY1WFUZS0o5+5S2GZwpVCzJQVFYhZKhUguZTFvr9S/Gq1qgylunZWObtSYpW6WOV4Zyy5lFU5JqPQrKqx37Pdzxbqbjo8SXMdmLOiUSk+UzgWuLlJPFNQpjzM2NXrGJDRsxlgrBVkSlQZpVJ0dp9ZsFW1WSmJgtGZqzrJnN7TrkpZlTHYztgBrPqeKRtTyAxIloKq65gLgA7Q3LBZ8ZcM/JfkJwDtKp4lA/99dZeOVoW+Sl1Z37JSFsvCEVAMRfNzqBP4jtIzBWJKrXb4TCksbTJAWdAiFMd0xyrOCVVVIClXUEzxo7L/dAR3UlNluBmQs8DqAOksyugeK5SrwJyJrS7Q3ABVt1vLTzMbHaU4tvuYMHagd471hEGrIBxV1NlcJ38ixNdSvQyWrFjAWYEaOhJjCsAqxsq5GUgzUCIU0Xt2+5eZXJUrwEpJmRBUVbdS0soJKoGqFmulBOV7suCvamDKnO0Bsi2R4QQeS0dq1WUVZKVEWcGqFnrVrph9TtN6FVSdwCrDVgqYpasjQFmLW6W0Wd9jO1dVthN0m52hYjuT/Z05aUdx5P0ZZd1jl84Cq65Rdh9TEhPk0B2ZYquKzWb8UegYU1U5nSm3U1k50aqm8NF8JUBYoLuXlhLEDJBWK2an4qyCdYTFFGp2PbJSklJAVCBnRYftbjWNR0Bm/cQpO7wdFKVDlZJUYO1CzXbo7O5mAl9V2syYXbhM5z0dWFUgrVAi291ZGqkEGF1z6uDkDn5mvFnqYcH4boecpQGWmzv3VB2jzL6vW2lWlXl1JZXdW7HqXgmlKlgMXUyJKiGKnMcoTWlSpbDZ96pAsOszR2R0ZAKv5nLmvdmO7ij3cUZYoUSWMthOYvJgdlCpV0UZA4y9SHJngcsJPyOXdO+t3jZ3KOgIO6kkdhhRVTu2AKptOKsyLZGw/JkJKkt9lRKd
GpbthsrALJ1WjqUUXXXc3wHx6CpO5z6xM6YdBa+MxCprBmSHljrCVr1OUhVb/KqdxHR36iKuqpBVAJjQDuUhQWZVvFLE7G6kAtZqQVZCUFWSI4UiQFUKrQCWGTFTTpdCmXJm/iqJpxT2SBhPujPpXFzO0JzOq+ZOQHZS00zJMmOp1PNdqFkRnAk3qtbKcdrS01BFy6pWq+qOoVJkZoioILB01tmJrNJGBlLWrYtQrSgvU/Lqe1Xlnr5O6aQvluIYVQ/hjYJpFJBVvlUKzBQhcnIGEAuWSndRoFl6iypY5iqr8m/lhAhAFZBZWM7uFjrXZwuUKdGb5V7yI9VbHOyAplU7hxm+cp7ZBWWFQlSDzqgm25Gz76v616yTGfZk77FUlcx+GgZgZVz2HNN5CmKWypUDsiqwclalhJnTuPTELjJnO4p9dpailDGrRVFVaWawrrJUu3KF6pkyrISm6nMYEI9XVzuH5lSlKFrZGKvKYbteFZ+OMXYh9WYH/LHVM3BVA1e7r1rI6HXmAKzyRulH8bE1Tk8/yUxR7LM6VKCEF1WJrNBkipQJewVOJqQu0FnaZIWD7fIV5Tr/Vnql8Oy1sxTXVL2OroBjBqpaVNbROvexVYs5eyqKIU8FUlQcT9OWokyW0pmyqxVYpbU7FCWnl52WfqdqrkCsgMiqyumTTNV1R/nOSY87HbMKnQktC+g7I3VepVnbxFLiTiVlC6IKohKWqmpXwGALwnY3y9lZ2sgU74R6UjkYoEMFzQJydJ1SXSPadXaWiZHiZ+9nPuFrB8/Q0ExYjJKrjrQSqlJOlbKYkpEVGJBPwl6V6aFJZUyZ8VVPdHU4gBmUrYcKhC683cBmlK6EzhTUXXCsqKhAYnQfXt92/hy7UuDs2VUPwXZXB/BqIWeAZiCxnXbiYC5blKpvceYqBWAGYjuJKVS1ECrESmGnZdcpOmwlK0OehI9SAGYMFrAd51SLslLGDohq8WZ0nXl9q6jrpCY7kUYCxXKXKgRK0FW6ygTUVbzTKcZxOprB71JIR0GzHlplXpaO3lScr1RYtgD3NSwdMQCYMB4/l56lplOPxoxeUdqJA1ULnaXOanG7lFlRODPuzHc9jnxiFbLDAez1bv9QxlTXX81pLH2x/nI8l52S3v09ZQZaZVD2OpvDnWmuQlMJpgpStctWKWQEULkC60CvHHeaUpYK3G7/YGkuc0xXuSvQVqiLCeFMiGUBcBrgjgGjwFn9SZidoToBZRWYKS+bLxP42fMNFXxnHq5c3gClqnRKmahIVNVhhXTZnJmwMwEpZTsFRAFktTDsOqbQ7HeZwpxQ3ErZ7fSljFdV6Uw5qsaQKXMmdFagmELspr0lUYeCywLCBJ0FgBlYLYSiXBYY5QdCK6NSfcXQ4fMfuVZXYZ3AZemxMyhLZWrqUxUyC9BxL7NSIgWwSqmqwrM0lLU0pgRMaZiCd1KWuvZMOCrAMmEzYXeAejxtS0FQHZdVPJUyVa5nKYdVrZnAnNJ5FUgK9C7crJh1AIooMqPyI9mwO/bLKXMoaFVaUp2/Sl1K+mLBYympe2dT7e7KJ7FrKuVXlNZJb53GU22YDvUwIyp3gCoFzAydxS/rxu0aJqwqPVaC7N4/VvRUgdYB8Xo+u8nMDMUowexmzFn/OCnmaBFZwF4OXKFMpqDZLmKdxE7ZXQW6C3aFMqN7X+/3/QcB/G0D8kclnwAAAABJRU5ErkJggg==") repeat',
opacity: 0.15,
pointerEvents: 'none',
}}></div>
{/* Scan lines effect */}
<div style={{
position: 'absolute',
top: 0,
left: 0,
right: 0,
bottom: 0,
background: 'linear-gradient(to bottom, rgba(0,0,0,0.03) 1px, transparent 1px)',
backgroundSize: '100% 2px',
opacity: 0.5,
pointerEvents: 'none',
}}></div>
{/* Add coffee stain */}
<div style={coffeeStain}></div>
{/* Add water damage */}
<div style={waterDamage}></div>
{/* Add fold line */}
<div style={foldLine}></div>
{/* Add torn edge */}
<div style={tornEdge}></div>
{/* Header with skewed alignment */}
<div style={{
display: 'flex',
justifyContent: 'space-between',
alignItems: 'center',
borderBottom: '2px solid #000',
paddingBottom: '4px',
marginBottom: '24px',
position: 'relative',
opacity: 0.8,
transform: 'skew(-0.5deg, 0.3deg)',
}}>
<div style={{width: '48px'}}></div>
<h1 style={{
...badScanTextStyle,
fontSize: '20px',
fontWeight: 'bold',
textAlign: 'center',
textTransform: 'uppercase',
letterSpacing: '1px',
opacity: 0.8,
}}>{title}</h1>
<div style={{
...badScanTextStyle,
fontSize: '20px',
fontWeight: 'bold',
opacity: 0.85,
}}>{pageNumber}</div>
</div>
{/* Horizontal divider with uneven quality */}
<div style={{
borderBottom: '1px solid #444',
marginBottom: '24px',
opacity: 0.6,
filter: 'blur(0.3px)',
transform: 'scaleY(1.5) skew(0.7deg)',
}}></div>
{/* Text content with severely degraded appearance */}
<div style={{
columnCount: 2,
columnGap: '20px',
columnRule: '1px solid rgba(0,0,0,0.1)',
textAlign: 'justify',
...badScanTextStyle,
fontSize: '16px',
lineHeight: '1.5',
opacity: 0.78,
// Very uneven ink distribution with blurry and faded parts
WebkitMaskImage: 'linear-gradient(to bottom, rgba(0,0,0,0.9), rgba(0,0,0,0.75) 50%, rgba(0,0,0,0.85))',
// Text distortion
filter: 'blur(0.2px)',
}}>
{/* Bad scan text with random character fading */}
<p>{text.split('').map((char, index) => {
const opacity = Math.random() > 0.8 ? 0.4 + Math.random() * 0.5 : 0.9 + Math.random() * 0.1;
const blur = Math.random() > 0.95 ? 1 : 0;
return <span key={index} style={{opacity, filter: `blur(${blur}px)`}}>{char}</span>;
})}</p>
</div>
{/* Extra random ink spill */}
<div style={{
position: 'absolute',
width: '10px',
height: '20px',
top: '60%',
left: '25%',
background: 'rgba(0,0,0,0.3)',
borderRadius: '50%',
transform: 'rotate(30deg)',
filter: 'blur(1px)',
zIndex: 3,
}}></div>
</div>
</div>
);
};
//export default BookPageTemplate;
// Publish the component as a property of `window` instead of a module export.
// NOTE(review): presumably so an inline <script> on a page without a module
// system can reference <BookPageTemplate /> — confirm against the page that loads this file.
window.BookPageTemplate = BookPageTemplate;

View File

@ -1,83 +0,0 @@
//import React from 'react';
const PermitGuidelinesTemplate = () => {
// Sample data - you can replace these with your own
const guidelineItems = [
{
number: 'iii.',
content: 'Not rely on personal preference or opinion, or regional interpretation of statute, regulation or guidance that is inconsistent with the Department\'s statewide interpretation. Staff should confer with the appropriate Bureau Director as necessary.'
},
{
number: 'iv.',
content: 'Process technically adequate and scientifically sound applications for final approval to minimize elapsed time in accordance with the Permit Decision Guarantee.'
},
{
number: 'v.',
content: 'Where the Application Manager determines that the technical information submitted with the application does not meet technical guidance or standards published by the Department, the application must provide the scientific or engineering basis to support the application. Note that deviations from technical guidance can generally be approved, by the appropriate section chief and manager, when warranted, provided acceptable justification has been submitted. Minor deficiencies that can be easily corrected should be addressed through a telephone call with the applicant and consultant, and may negate the need for a deficiency letter. The Program Manager or District Manager will be responsible for making that decision.'
},
{
number: 'vi.',
content: 'If an application fails to provide the technical information necessary to document that applicable regulatory and statutory requirements will be achieved, it is technically deficient and the Application Manager will prepare a technical deficiency letter. Again, all deficiencies noted must cite the statutory or regulatory obligation that the application has failed to meet and the Section Chief and the Program Manager will routinely review these letters. For District Oil and Gas Offices and District Mining Offices the Permits Chief and the Manager will review the letters.'
},
{
number: 'vii.',
content: 'Applicant responses that do not make the application technically adequate within the established response timeframe will be subject to the Elevated Review Process below. Applications that are made technically adequate within the established response timeframe will proceed to processing for final action.'
}
];
// Footnote data
const footnote = {
number: '2',
content: 'More technically complex projects and applications may receive additional deficiency letters as appropriate prior to a decision point. This exception will not void inclusion in the Permit Decision Guarantee and will follow program specific guidance that is developed. The more technically complex projects and applications are noted with an asterisk ("*") in Appendix A.'
};
// Document info
const documentInfo = "021-2100-001 / November 2, 2012 / Page 11";
// Special note about technical deficiency letter
const technicalDeficiencyNote = {
prefix: 'One',
superscript: '2',
content: ' technical deficiency letter will be sent. Each deficiency cited must note the statute, regulation or technical guidance provision. Technical guidance provides a means to compliance, but may not be used or cited when issuing a permit denial. The letter will state, as necessary, that the Permit Decision Guarantee is no longer applicable and offer the applicant an opportunity to meet and discuss the deficiencies. The letter will include a deadline for submission of the deficient information.'
};
return (
<div className="bg-white p-8 max-w-4xl mx-auto font-serif text-black">
<div className="mb-8">
{guidelineItems.map((item, index) => (
<div key={index} className="mb-6 flex">
<div className="w-12 flex-shrink-0 font-bold">{item.number}</div>
<div className="flex-grow">{item.content}</div>
</div>
))}
{/* Technical deficiency letter note */}
<div className="mb-6 ml-12">
<p>
{technicalDeficiencyNote.prefix}
<sup>{technicalDeficiencyNote.superscript}</sup>
{technicalDeficiencyNote.content}
</p>
</div>
</div>
{/* Horizontal line */}
<div className="border-t border-gray-400 my-6"></div>
{/* Footnote section */}
<div className="text-sm">
<p>
<sup>{footnote.number}</sup> {footnote.content}
</p>
</div>
{/* Document info */}
<div className="text-center mt-6 text-sm">
{documentInfo}
</div>
</div>
);
};
//export default PermitGuidelinesTemplate;
window.BookPageTemplate = PermitGuidelinesTemplate;