import argparse
import asyncio
import base64
import glob
import hashlib
import json
import logging
import os
import random
import re
import shutil
import subprocess
import tempfile
import uuid
from collections import defaultdict
from typing import Dict, List

import pypdf
from anthropic import AsyncAnthropic
from bs4 import BeautifulSoup
from markdownify import SPACES, MarkdownConverter
from playwright.async_api import async_playwright
from syntok.segmenter import process
from tqdm import tqdm

from olmocr.bench.tests import TableTest, TestType, parse_html_tables
from olmocr.data.renderpdf import (
    get_png_dimensions_from_base64,
    render_pdf_to_base64png,
)
from olmocr.filter.filter import Language, PdfFilter

# Global variables for tracking Claude API costs
total_input_tokens = 0
total_output_tokens = 0


def get_git_commit_hash():
    """Get the current git commit hash, if available."""
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        return result.stdout.strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Git not available or not a git repository
        return None


# Unicode mappings for superscript characters
SUPERSCRIPT_MAP = {
    "0": "⁰",
    "1": "¹",
    "2": "²",
    "3": "³",
    "4": "⁴",
    "5": "⁵",
    "6": "⁶",
    "7": "⁷",
    "8": "⁸",
    "9": "⁹",
    "+": "⁺",
    "-": "⁻",
    "=": "⁼",
    "(": "⁽",
    ")": "⁾",
    "n": "ⁿ",
    "i": "ⁱ",
}

# Unicode mappings for subscript characters
SUBSCRIPT_MAP = {
    "0": "₀",
    "1": "₁",
    "2": "₂",
    "3": "₃",
    "4": "₄",
    "5": "₅",
    "6": "₆",
    "7": "₇",
    "8": "₈",
    "9": "₉",
    "+": "₊",
    "-": "₋",
    "=": "₌",
    "(": "₍",
    ")": "₎",
    "a": "ₐ",
    "e": "ₑ",
    "o": "ₒ",
    "x": "ₓ",
    "h": "ₕ",
    "k": "ₖ",
    "l": "ₗ",
    "m": "ₘ",
    "n": "ₙ",
    "p": "ₚ",
    "s": "ₛ",
    "t": "ₜ",
}


def convert_superscripts_subscripts(element):
    """
    Convert HTML superscript and subscript tags to Unicode equivalents.

    This function finds all <sup> and <sub> tags in the given element and
    replaces them with their Unicode character equivalents. Characters not
    in the mapping are left unchanged.

    Args:
        element: A BeautifulSoup element to process

    Returns:
        The element with sup/sub tags converted to Unicode
    """
    if not element:
        return element

    # Process all superscript tags
    for sup in element.find_all("sup"):
        sup_text = sup.get_text()
        unicode_text = "".join(SUPERSCRIPT_MAP.get(char, char) for char in sup_text)
        sup.replace_with(unicode_text)

    # Process all subscript tags
    for sub in element.find_all("sub"):
        sub_text = sub.get_text()
        unicode_text = "".join(SUBSCRIPT_MAP.get(char, char) for char in sub_text)
        sub.replace_with(unicode_text)

    return element
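
# Illustrative example (hypothetical input, not part of the pipeline): given
#   soup = BeautifulSoup("<p>H<sub>2</sub>O and x<sup>2</sup></p>", "html.parser")
# convert_superscripts_subscripts(soup) rewrites the text to "H₂O and x²".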


def download_s3_pdf(path, local_path):
    """Download a PDF from S3 or copy it from a local path."""
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    # Check if it's a local path
    if os.path.exists(path):
        # It's a local file, just copy it
        try:
            shutil.copy2(path, local_path)
            return True
        except Exception as e:
            print(f"Failed to copy local file {path}: {e}")
            return False
    elif path.startswith("s3://"):
        # It's an S3 path, download it
        result = subprocess.run(["aws", "s3", "cp", path, local_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        return result.returncode == 0
    else:
        # Assume it's a relative local path that doesn't exist yet
        print(f"Path not found and doesn't appear to be S3: {path}")
        return False


class PreserveTablesConverter(MarkdownConverter):
    """
    Custom MarkdownConverter that preserves HTML tables unchanged.
    """

    def convert_table(self, el, text, parent_tags):
        # Return the outer HTML of the table element unchanged;
        # round-tripping through BeautifulSoup normalizes the markup
        temp_soup = BeautifulSoup(str(el), "html.parser")
        return str(temp_soup.table) if temp_soup.table else str(el)


def extract_html_metadata(html_content):
    """Extract metadata from HTML content for FrontMatter."""
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract language from html tag
    html_tag = soup.find("html")
    language = "en"  # default
    if html_tag and html_tag.get("lang"):
        language = str(html_tag.get("lang"))
        # Convert pt-BR to pt for now
        if len(language) == 5 and language[2] == "-":
            language = language[:2]

    # Calculate content statistics
    body = soup.find("body")
    if not body:
        body = soup

    # First, create a version without headers and footers for all calculations
    main_content_soup = BeautifulSoup(str(body), "html.parser")
    # Remove headers and footers from main content
    for element in main_content_soup.find_all(["header", "footer"]):
        element.decompose()

    # Get text content length (excluding tables and images)
    text_soup = BeautifulSoup(str(main_content_soup), "html.parser")
    # Remove tables
    for element in text_soup.find_all("table"):
        element.decompose()
    # Remove images (div.image)
    for element in text_soup.find_all("div", class_="image"):
        element.decompose()
    text_content = text_soup.get_text().strip()
    text_length = len(text_content)

    # Count table content (from main content, excluding headers/footers)
    tables = main_content_soup.find_all("table")
    table_text_length = 0
    for table in tables:
        table_text_length += len(table.get_text().strip())

    # Count images (div.image elements) (from main content, excluding headers/footers)
    images = main_content_soup.find_all("div", class_="image")
    # Rough estimate: each image takes up about 500 characters worth of "space"
    image_content_estimate = len(images) * 500

    # Calculate total content "length"
    total_content_length = text_length + table_text_length + image_content_estimate

    # Determine if mostly tables or images
    is_table = False
    is_diagram = False

    if total_content_length > 0:
        table_ratio = table_text_length / total_content_length
        image_ratio = image_content_estimate / total_content_length

        is_table = table_ratio > 0.5
        is_diagram = image_ratio > 0.5

    return {"primary_language": language, "is_rotation_valid": True, "rotation_correction": 0, "is_table": is_table, "is_diagram": is_diagram}
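
# Illustrative example (hypothetical input): for a page marked <html lang="pt-BR"> whose body
# is mostly one large <table>, this returns {"primary_language": "pt", "is_rotation_valid": True,
# "rotation_correction": 0, "is_table": True, "is_diagram": False}.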


def html_to_markdown_with_frontmatter(html_content):
    """Convert HTML to markdown with FrontMatter metadata."""
    # Extract metadata
    metadata = extract_html_metadata(html_content)

    # Parse HTML and extract only body content for markdown conversion
    soup = BeautifulSoup(html_content, "html.parser")
    body = soup.find("body")

    # If no body tag, use the whole soup as a fallback
    if body:
        # Create a new soup with just the body content
        body_soup = BeautifulSoup(str(body), "html.parser")
    else:
        body_soup = soup

    # First, remove all header and footer elements from the body
    for header in body_soup.find_all("header"):
        header.decompose()
    for footer in body_soup.find_all("footer"):
        footer.decompose()

    # Also remove divs with page-header or page-footer classes (in case they weren't converted to header/footer tags)
    for div in body_soup.find_all("div", class_="page-header"):
        div.decompose()
    for div in body_soup.find_all("div", class_="page-footer"):
        div.decompose()

    # Handle image placeholders - replace div.image with actual img tags for proper markdown conversion
    for img_div in body_soup.find_all("div", class_="image"):
        alt_text = "Image Placeholder"  # For now, in the render it's all just a placeholder
        # Create an img tag with placeholder src and appropriate alt text
        img_tag = body_soup.new_tag("img", src="page.png", alt=alt_text)
        img_div.replace_with(img_tag)

    # Convert superscripts and subscripts to Unicode before markdown conversion
    convert_superscripts_subscripts(body_soup)

    # Get the modified HTML (only body content)
    modified_html = str(body_soup)

    # Create custom converter instance
    converter = PreserveTablesConverter(
        heading_style="ATX",  # Use # style headings
        bullets="-",  # Use - for unordered lists
        strip=["a"],  # Remove links but keep text
        newline_style=SPACES,  # Use trailing spaces for line breaks
        code_language="",  # Don't add language to code blocks
        escape_asterisks=False,  # Don't escape asterisks
        escape_underscores=False,  # Don't escape underscores
    )

    # Convert to markdown
    markdown = converter.convert(modified_html)

    # Clean up excessive newlines
    while "\n\n\n" in markdown:
        markdown = markdown.replace("\n\n\n", "\n\n")

    # Strip and clean up markdown content
    markdown_content = markdown.strip()

    # Remove leading or trailing --- if present
    while markdown_content.startswith("---"):
        markdown_content = markdown_content[3:].strip()
    while markdown_content.endswith("---"):
        markdown_content = markdown_content[:-3].strip()

    # Create FrontMatter
    frontmatter = f"""---
primary_language: {metadata['primary_language']}
is_rotation_valid: {metadata['is_rotation_valid']}
rotation_correction: {metadata['rotation_correction']}
is_table: {metadata['is_table']}
is_diagram: {metadata['is_diagram']}
---"""

    # Combine FrontMatter with markdown content
    if markdown_content:
        return f"{frontmatter}\n{markdown_content}"
    else:
        return frontmatter
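
# Illustrative example (hypothetical input): html_to_markdown_with_frontmatter(
#     "<html lang='en'><body><h1>Title</h1><p>Hello.</p></body></html>")
# should return the FrontMatter block ("---\nprimary_language: en\n...\n---")
# followed by "# Title" and "Hello." as markdown.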


def extract_code_block(initial_response):
    """Return the contents of the last fenced code block in the response, or None if none is found."""
    # Use regex to find the last instance of a code block
    # First try to find HTML specific code blocks
    html_blocks = re.findall(r"```html\n(.*?)```", initial_response, re.DOTALL)

    # If HTML blocks found, return the last one
    if html_blocks:
        return html_blocks[-1].strip()

    # Otherwise, try to find any code blocks
    code_blocks = re.findall(r"```\n(.*?)```", initial_response, re.DOTALL)

    # If code blocks found, return the last one
    if code_blocks:
        return code_blocks[-1].strip()

    # If no code blocks found with newlines after backticks, try without newlines
    html_blocks_no_newline = re.findall(r"```html(.*?)```", initial_response, re.DOTALL)
    if html_blocks_no_newline:
        return html_blocks_no_newline[-1].strip()

    code_blocks_no_newline = re.findall(r"```(.*?)```", initial_response, re.DOTALL)
    if code_blocks_no_newline:
        return code_blocks_no_newline[-1].strip()

    # Return None if no code blocks were found
    return None
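
# Illustrative example (hypothetical input):
#   extract_code_block("Here you go:\n```html\n<p>hi</p>\n```")
# returns "<p>hi</p>".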


async def generate_html_from_image(client, image_base64):
    """Call Claude API to generate HTML from an image using a multi-step prompting strategy."""
    global total_input_tokens, total_output_tokens
    png_width, png_height = get_png_dimensions_from_base64(image_base64)

    try:
        # Step 0: Check that the orientation of the original document is right-side-up. If not, we will
        # skip this page, to keep the code simple
        orientation_response = await client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=1000,
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {
                            "type": "text",
                            "text": "Please analyze this document image and determine its orientation.\n\n"
                            "Is this document right-side-up (correctly oriented), or is it rotated?\n\n"
                            "Make your decision based on the main document contents that take up most of the page area.\n\n"
                            "Respond with ONLY one of the following:\n"
                            "- RIGHT_SIDE_UP: The document is correctly oriented and readable\n"
                            "- ROTATED_90: The document is rotated 90 degrees clockwise\n"
                            "- ROTATED_180: The document is upside down (rotated 180 degrees)\n"
                            "- ROTATED_270: The document is rotated 270 degrees clockwise (90 degrees counter-clockwise)\n"
                            "- UNCLEAR: Cannot determine orientation (e.g., blank page, purely graphical content)\n\n"
                            "Important: Only respond with one of these exact terms, nothing else.",
                        },
                    ],
                }
            ],
        )

        # Extract orientation from response
        orientation_text = ""
        for content in orientation_response.content:
            if content.type == "text":
                orientation_text += content.text.strip()

        # Track token usage from orientation check
        if hasattr(orientation_response, "usage"):
            total_input_tokens += orientation_response.usage.input_tokens
            total_output_tokens += orientation_response.usage.output_tokens

        # Check orientation result
        if "RIGHT_SIDE_UP" not in orientation_text:
            print(f"Skipping page due to orientation: {orientation_text}")
            return None

        # Step 1: Initial analysis and column detection
        analysis_response = await client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=20000,
            temperature=0.1,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {
                            "type": "text",
                            "text": "Analyze this document and provide a detailed assessment of its structure. Focus specifically on:\n"
                            "1. How many columns does the document have? Is it single-column, two-column, three-column, or a mixed layout?\n"
                            "2. What are the main sections and content types (headings, paragraphs, lists, tables, images, etc.)?\n"
                            "3. Does it have headers, footers, page numbers, or other special elements?\n"
                            "4. Is there any complex formatting that would be challenging to reproduce in HTML?\n\n"
                            "Please be very precise about the number of columns and how they're arranged.",
                        },
                    ],
                }
            ],
        )

        # Check if response was complete
        if hasattr(analysis_response, "stop_reason") and analysis_response.stop_reason != "end_turn":
            print(f"Warning: Analysis response incomplete (stop_reason: {analysis_response.stop_reason})")
            return None

        analysis_text = ""
        for content in analysis_response.content:
            if content.type == "text":
                analysis_text += content.text

        # Track token usage from first API call
        if hasattr(analysis_response, "usage"):
            total_input_tokens += analysis_response.usage.input_tokens
            total_output_tokens += analysis_response.usage.output_tokens

        # Step 2: Initial HTML generation with detailed layout instructions
        initial_response = await client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=20000,
            temperature=0.2,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {
                            "type": "text",
                            "text": "Render this document as clean, semantic HTML. Here's my analysis of the document structure:\n\n"
                            f"{analysis_text}\n\n"
                            "Important requirements:\n"
                            "1. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc.\n"
                            "2. Use the <header> and <footer> tags to represent content at the top/bottom which would not normally be part of the main content, such as page numbers, etc.\n"
                            "3. Use a placeholder <div> tag with class 'image' which will render as a grey box with black outline to make sure images have their original size, shape, and position on the page. Include an alt-text of the original image as a 'data-description' attribute on the tag. Include 'data-x', 'data-y', 'data-width', 'data-height' attributes which specify where the image was found in the original document.\n"
                            "4. Render any math equations and LaTeX inline using either \\[ \\] or \\( \\) delimiters.\n"
                            "5. CRITICAL: If the document has a multi-column layout, you MUST preserve the exact same number of columns in your HTML. Use CSS flexbox or grid to create the columns.\n"
                            "6. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n"
                            f"7. The webpage will be viewed with a fixed viewport size of {png_width} pixels wide by {png_height} pixels tall.\n"
                            "8. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n"
                            "Enclose your HTML in a ```html code block.",
                        },
                    ],
                }
            ],
        )

        # Check if response was complete
        if hasattr(initial_response, "stop_reason") and initial_response.stop_reason != "end_turn":
            print(f"Warning: Initial HTML response incomplete (stop_reason: {initial_response.stop_reason})")
            return None

        # Extract initial HTML
        initial_html_text = ""
        for content in initial_response.content:
            if content.type == "text":
                initial_html_text += content.text

        # Track token usage from second API call
        if hasattr(initial_response, "usage"):
            total_input_tokens += initial_response.usage.input_tokens
            total_output_tokens += initial_response.usage.output_tokens

        initial_html = extract_code_block(initial_html_text)
        if not initial_html:
            print("Warning: No HTML code block found in initial response")
            return None

        # Step 3: Render the initial HTML to PDF and then back to PNG for comparison
        # Create a temporary PDF file
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf_path = tmp_pdf.name

        try:
            # Render HTML to PDF using existing function
            render_success = await render_pdf_with_playwright(initial_html, tmp_pdf_path, png_width, png_height)

            if not render_success:
                print("Warning: Failed to render initial HTML to PDF for refinement")
                # Fall back to returning the initial HTML without refinement
                return initial_html

            # Convert PDF back to PNG
            rendered_image_base64 = render_pdf_to_base64png(tmp_pdf_path, 1, max(png_width, png_height))

            if not rendered_image_base64:
                print("Warning: Failed to convert rendered PDF to PNG for refinement")
                # Fall back to returning the initial HTML without refinement
                return initial_html

            # Step 4: Refinement - Show both images to Claude and ask for corrections
            async with client.messages.stream(
                model="claude-sonnet-4-5-20250929",
                max_tokens=40000,
                temperature=1.0,
                thinking={
                    "type": "enabled",
                    "budget_tokens": 12000,
                },
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "I'm going to show you two images:\n1. The original document\n2. How the HTML I generated renders\n\nPlease compare them carefully and provide a revised version of the HTML that better matches the original.",
                            },
                            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                            {"type": "text", "text": "Above is the ORIGINAL document."},
                            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": rendered_image_base64}},
                            {"type": "text", "text": "Above is how my HTML currently renders."},
                            {
                                "type": "text",
                                "text": f"Here is the current HTML code:\n\n```html\n{initial_html}\n```\n\n"
                                "Please analyze the differences between the original document and the rendered version. Focus on:\n"
                                "1. Layout issues - are columns preserved correctly?\n"
                                "2. Positioning - are elements in the right place?\n"
                                "3. Spacing - are margins, padding, and spacing between elements correct?\n"
                                "4. Occlusion - is any important content hidden or overlapping?\n"
                                "5. Text formatting - are fonts, sizes, and styles appropriate?\n"
                                "6. Tables - are the headers on tables aligned with the correct corresponding columns?\n"
                                f"The webpage will be viewed at {png_width}x{png_height} pixels.\n\n"
                                "Provide a REVISED version of the HTML that corrects any issues you identified. "
                                "Make sure all important elements are visible and the layout matches the original as closely as possible.\n"
                                "Output the complete revised HTML in a ```html code block.",
                            },
                        ],
                    }
                ],
            ) as refinement_stream:
                # Drain the stream; we only need the final message
                async for event in refinement_stream:
                    pass

                refinement_response = await refinement_stream.get_final_message()

            # Check if refinement response was complete
            if hasattr(refinement_response, "stop_reason") and refinement_response.stop_reason != "end_turn":
                print(f"Warning: Refinement response incomplete (stop_reason: {refinement_response.stop_reason})")
                # Return initial HTML as fallback since it was complete
                return initial_html

            # Extract refined HTML
            refined_html_text = ""
            for content in refinement_response.content:
                if content.type == "text":
                    refined_html_text += content.text

            # Track token usage from refinement API call
            if hasattr(refinement_response, "usage"):
                total_input_tokens += refinement_response.usage.input_tokens
                total_output_tokens += refinement_response.usage.output_tokens

            refined_html = extract_code_block(refined_html_text)

            # Return refined HTML if available, otherwise return initial HTML
            if refined_html:
                print("Successfully refined HTML using visual comparison")
                return refined_html
            else:
                print("Warning: No HTML code block found in refinement response, using initial HTML")
                return initial_html

        finally:
            # Clean up temporary PDF file
            if os.path.exists(tmp_pdf_path):
                os.remove(tmp_pdf_path)

    except Exception as e:
        print(f"Error calling Claude API: {e}")
        return None


def extract_page_from_pdf(input_path, output_path, page_num):
    """
    Extract a specific page from a PDF and save it as a new PDF.

    Args:
        input_path: Path to the input PDF
        output_path: Path to save the extracted page
        page_num: The page number to extract (1-indexed, converted to 0-indexed for pypdf)

    Returns:
        bool: True if extraction was successful, False otherwise
    """
    try:
        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Read the input PDF
        reader = pypdf.PdfReader(input_path)

        # Convert to 0-indexed for pypdf
        zero_idx_page = page_num - 1

        # Check if page number is valid
        if zero_idx_page >= len(reader.pages) or zero_idx_page < 0:
            print(f"Page number {page_num} out of range for {input_path} with {len(reader.pages)} pages")
            return False

        # Create a new PDF with just the selected page
        writer = pypdf.PdfWriter()
        writer.add_page(reader.pages[zero_idx_page])

        # Write the output PDF
        with open(output_path, "wb") as output_file:
            writer.write(output_file)

        return True
    except Exception as e:
        print(f"Error extracting page {page_num} from {input_path}: {str(e)}")
        return False
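
# Illustrative example (hypothetical paths): extract_page_from_pdf("/tmp/doc.pdf", "/tmp/out/page3.pdf", 3)
# writes a one-page PDF containing page 3 (1-indexed) of /tmp/doc.pdf and returns True on success.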


async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, png_height):
    """
    Render HTML content using Playwright and save it as PDF.
    Try different scale factors if needed to ensure the output is exactly one page.

    Args:
        html_content: HTML content to render
        output_pdf_path: Path to save the rendered PDF
        png_width: Width of the viewport
        png_height: Height of the viewport

    Returns:
        bool: True if rendering was successful with exactly one page, False otherwise
    """
    scale_factors = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]  # Try these scale factors in order

    # Determine page format based on PNG dimensions
    # Define thresholds with some tolerance (±5%)
    aspect_ratio = png_width / png_height

    # Letter Portrait: 8.5" x 11" (aspect ratio ~0.77)
    # Letter Landscape: 11" x 8.5" (aspect ratio ~1.29)
    # A4 Portrait: 210mm x 297mm (aspect ratio ~0.71)
    # A4 Landscape: 297mm x 210mm (aspect ratio ~1.41)

    pdf_options = {
        "path": output_pdf_path,
        "print_background": True,
    }

    if 0.73 <= aspect_ratio <= 0.81:  # Letter Portrait (8.5/11 = 0.77)
        pdf_options["width"] = "8.5in"
        pdf_options["height"] = "11in"
    elif 1.23 <= aspect_ratio <= 1.35:  # Letter Landscape (11/8.5 = 1.29)
        pdf_options["width"] = "11in"
        pdf_options["height"] = "8.5in"
    elif 0.67 <= aspect_ratio <= 0.73:  # A4 Portrait (210/297 = 0.71)
        pdf_options["width"] = "210mm"
        pdf_options["height"] = "297mm"
    elif 1.36 <= aspect_ratio <= 1.47:  # A4 Landscape (297/210 = 1.41)
        pdf_options["width"] = "297mm"
        pdf_options["height"] = "210mm"
    # else: Other - leave width and height unset

    for scale in scale_factors:
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                page = await browser.new_page(viewport={"width": int(png_width * scale), "height": int(png_height * scale)})

                # Set the HTML content
                await page.set_content(html_content)

                # Add in KaTeX and set up auto-rendering
                katex_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "katex")
                katex_css_path = os.path.join(katex_dir, "katex.min.css")
                katex_js_path = os.path.join(katex_dir, "katex.min.js")
                katex_autorender_js_path = os.path.join(katex_dir, "auto-render.min.js")

                await page.add_style_tag(path=katex_css_path)
                await page.add_script_tag(path=katex_js_path)
                await page.add_script_tag(path=katex_autorender_js_path)

                # Run the KaTeX auto-renderer immediately rather than waiting for DOMContentLoaded
                await page.evaluate(
                    """
                    renderMathInElement(document.body, {
                        delimiters: [
                            {left: '\\\\(', right: '\\\\)', display: false},
                            {left: '\\\\[', right: '\\\\]', display: true}
                        ],
                        throwOnError: false
                    });
                    """
                )

                # Save as PDF with formatting options, adding the current scale
                pdf_options["scale"] = scale
                await page.pdf(**pdf_options)

                await browser.close()

            # Check if the output PDF has exactly one page
            try:
                reader = pypdf.PdfReader(output_pdf_path)
                if len(reader.pages) == 1:
                    print(f"Successfully rendered as a single page PDF with scale factor {scale}")
                    return True
                else:
                    print(f"PDF has {len(reader.pages)} pages with scale factor {scale}, trying a smaller scale...")
                    # Continue to the next scale factor
            except Exception as pdf_check_error:
                print(f"Error checking PDF page count: {pdf_check_error}")
                return False

        except Exception as e:
            print(f"Error rendering PDF with Playwright at scale {scale}: {str(e)}")
            # Try the next scale factor

    print("Failed to render PDF as a single page with any scale factor")
    return False
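
# Sizing sketch: a US Letter portrait page rendered 1024 px wide is about 1024x1325 px,
# so aspect_ratio = 1024 / 1325 ≈ 0.77, which falls in the 0.73-0.81 Letter Portrait band above.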


def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, random_gen: random.Random, verbose_table_testing: bool = False) -> List[Dict]:
    """
    Generate tests from HTML content parsed from the PDF.

    Args:
        html_content: The HTML content of the page
        pdf_id: The unique identifier for the PDF
        page_num: The page number
        random_gen: Seeded random generator used to sample table cells and sentence pairs
        verbose_table_testing: Whether to print table test verification details

    Returns:
        A list of test dictionaries that can be saved as JSONL
    """
    tests = []
    pdf_filename = f"{pdf_id}_page{page_num}.pdf"
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove any divs or spans with class "line-number"
    for element in soup.find_all(["div", "span"], class_="line-number"):
        element.extract()

    # Normalize any page-header and page-footer divs into header/footer tags
    for div in soup.find_all("div", class_="page-header"):
        div.name = "header"

    for div in soup.find_all("div", class_="page-footer"):
        div.name = "footer"

    # Remove elements in the body that appear before the header or after the footer
    body = soup.find("body")
    if body:
        header = soup.find("header")
        footer = soup.find("footer")

        if header:
            # Remove elements before the header
            current = body.contents[0]
            while current and current != header:
                next_elem = current.next_sibling
                current.extract()
                current = next_elem

        if footer:
            # Remove elements after the footer
            current = footer.next_sibling
            while current:
                next_elem = current.next_sibling
                current.extract()
                current = next_elem

    # Step 1: Process headers, footers, and page numbers for TextAbsenceTests
    headers = soup.find_all("header")
    footers = soup.find_all("footer")
    page_numbers = soup.find_all("div", class_="page-number")

    # Function to create absence tests from text elements
    def create_absence_tests_from_elements(parent_element, element_type):
        mini_soup = BeautifulSoup(str(parent_element), "html.parser")

        # Convert superscripts and subscripts in the mini soup
        convert_superscripts_subscripts(mini_soup)

        # Remove top-level headings (h1/h2) from the mini soup
        for element in mini_soup.find_all(["h1", "h2"]):
            element.extract()

        # Find all text-containing leaf elements within the parent
        text_elements = []

        # Get all target elements
        target_tags = mini_soup.find_all(["span", "div", "p", "h3", "h4", "h5", "h6"])

        # Filter to only include leaf nodes (elements that don't contain other target elements)
        for tag in target_tags:
            # Check if this element has no children from our target tags
            is_leaf = not tag.find(["span", "div", "p", "h3", "h4", "h5", "h6"])

            if is_leaf:
                text = tag.get_text().strip()
                if text:
                    text_elements.append(text)

        # If no elements were found, fall back to the parent's text, but only if it is non-empty
        if not text_elements:
            parent_text = mini_soup.get_text().strip()
            if parent_text:
                text_elements.append(parent_text)

        # Create tests for each text element
        for text in text_elements:
            if "\n" in text:
                text = text.split("\n")[0]

            if len(text) > 3 or len([c for c in text if c.isdigit()]):  # Only create tests for meaningful text
                tests.append(
                    {
                        "pdf": pdf_filename,
                        "page": 1,
                        "id": f"{pdf_id}_{element_type}_{uuid.uuid4().hex[:8]}",
                        "type": TestType.ABSENT.value,
                        "text": text,
                        "max_diffs": round(len(text) * 0.05),
                    }
                )

    # Create TextAbsenceTests for headers
    for header in headers:
        create_absence_tests_from_elements(header, "header")

    # Create TextAbsenceTests for footers
    for footer in footers:
        create_absence_tests_from_elements(footer, "footer")

    # Create TextAbsenceTests for page numbers
    for page_number in page_numbers:
        # Convert any superscripts/subscripts in the page number
        page_number_soup = BeautifulSoup(str(page_number), "html.parser")
        convert_superscripts_subscripts(page_number_soup)
        page_number_text = page_number_soup.get_text().strip()

        if page_number_text:
            tests.append(
                {
                    "pdf": pdf_filename,
                    "page": 1,
                    "id": f"{pdf_id}_page_number_{uuid.uuid4().hex[:8]}",
                    "type": TestType.ABSENT.value,
                    "text": page_number_text,
                    "max_diffs": 0,
                }
            )

    # Step 2: Generate tests from tables using parse_html_tables
    # Convert superscripts and subscripts to Unicode equivalents in tables
    table_soup = BeautifulSoup(html_content, "html.parser")

    # Convert superscripts and subscripts in the table HTML
    convert_superscripts_subscripts(table_soup)
    html_content_with_unicode = str(table_soup)

    table_data_list = parse_html_tables(html_content_with_unicode)

    for table_idx, table_data in enumerate(table_data_list):
        # Get the table data as a numpy array
        table_array = table_data.data
        table_tests = []

        # Skip tables that are too small
        if table_array.shape[0] < 2 or table_array.shape[1] < 2:
            continue

        # Get a limited number of cells to create tests for
        # Select random rows and columns, excluding header rows/columns
        non_header_rows = [i for i in range(table_array.shape[0]) if i not in table_data.header_rows]
        non_header_cols = [j for j in range(table_array.shape[1]) if j not in table_data.header_cols]

        # If we don't have enough non-header cells, use all cells
        if len(non_header_rows) < 2 or len(non_header_cols) < 2:
            cell_positions = [(i, j) for i in range(table_array.shape[0]) for j in range(table_array.shape[1])]
        else:
            cell_positions = [
                (i, j)
                for i in random_gen.sample(non_header_rows, min(3, len(non_header_rows)))
                for j in random_gen.sample(non_header_cols, min(2, len(non_header_cols)))
            ]

        random_gen.shuffle(cell_positions)

        # Create tests for each selected cell
        for row_idx, col_idx in cell_positions:
            cell_text = str(table_array[row_idx, col_idx]).strip()

            # Skip cells with minimal text
            if not cell_text or len(cell_text) < 3:
                continue

            # Create a TableTest with relevant relationships
            test_data = {
                "pdf": pdf_filename,
                "page": 1,
                "id": f"{pdf_id}_table{table_idx}_{uuid.uuid4().hex[:8]}",
                "type": TestType.TABLE.value,
                "cell": cell_text,
                "max_diffs": 0,
                "ignore_markdown_tables": True,
            }

            # Check cell up
            if row_idx > 0:
                up_text = str(table_array[row_idx - 1, col_idx]).strip()
                if up_text and "\n" not in up_text:
                    test_data["up"] = up_text

            # Check cell down
            if row_idx < table_array.shape[0] - 1:
                down_text = str(table_array[row_idx + 1, col_idx]).strip()
                if down_text and "\n" not in down_text:
                    test_data["down"] = down_text

            # Check cell left
            if col_idx > 0:
                left_text = str(table_array[row_idx, col_idx - 1]).strip()
                if left_text and "\n" not in left_text:
                    test_data["left"] = left_text

            # Check cell right
            if col_idx < table_array.shape[1] - 1:
                right_text = str(table_array[row_idx, col_idx + 1]).strip()
                if right_text and "\n" not in right_text:
                    test_data["right"] = right_text

            # Check if current cell is a heading cell
            is_header_cell = row_idx in table_data.header_rows or col_idx in table_data.header_cols

            # Check for top heading using header information (skip if current cell is a heading)
            if not is_header_cell and col_idx in table_data.col_headers:
                # Get the headers for this column
                col_headers = table_data.col_headers[col_idx]
                if col_headers:
                    # Use the first header as the top heading
                    _, top_heading = col_headers[0]
                    if top_heading and "\n" not in top_heading:
                        test_data["top_heading"] = top_heading

            # Check for left heading using header information (skip if current cell is a heading)
            if not is_header_cell and row_idx in table_data.row_headers:
                # Get the headers for this row
                row_headers = table_data.row_headers[row_idx]
                if row_headers:
                    # Use the first header as the left heading
                    _, left_heading = row_headers[0]
                    if left_heading and "\n" not in left_heading:
                        test_data["left_heading"] = left_heading

            # Only add the test if we have at least one relation
            if any(x in test_data for x in ["up", "down", "left", "right", "top_heading", "left_heading"]):
                # Verify that the test passes with the current table HTML
                # Create the actual test object
                test_obj = TableTest(
                    pdf=test_data["pdf"],
                    page=test_data["page"],
                    id=test_data["id"],
                    type=test_data["type"],
                    cell=test_data["cell"],
                    max_diffs=test_data["max_diffs"],
                    up=test_data.get("up", ""),
                    down=test_data.get("down", ""),
                    left=test_data.get("left", ""),
                    right=test_data.get("right", ""),
                    top_heading=test_data.get("top_heading", ""),
                    left_heading=test_data.get("left_heading", ""),
                )

                # Extract just the relevant table HTML
                tables = soup.find_all("table")
                if table_idx < len(tables):
                    table_html = str(tables[table_idx])

                    # Run the test against the original HTML
                    passed, explanation = test_obj.run(table_html)
                else:
                    # Shouldn't happen, but handle it gracefully
                    passed = False

                # Only add tests that pass
                if passed:
                    table_tests.append(test_data)

            if len(table_tests) > 25:
                break

        # Done iterating over this table's cells, so add its verified tests to the main list
        tests.extend(table_tests)

    # Step 3: Generate TextPresenceTests and OrderingTests from markdown content
    # Convert HTML to markdown to get cleaner text for presence and ordering tests
    markdown_content = html_to_markdown_with_frontmatter(html_content)

    # Remove any HTML tables from the markdown content
    # Tables can persist in markdown as raw HTML and we want to exclude them
    markdown_content = re.sub(r"<table[^>]*>.*?</table>", "", markdown_content, flags=re.DOTALL | re.IGNORECASE)

    # Extract just the content part (after frontmatter)
    markdown_lines = markdown_content.split("\n")
    content_start_idx = 0

    # Skip frontmatter if present
    if markdown_lines[0] == "---":
        for idx, line in enumerate(markdown_lines[1:], 1):
            if line == "---":
                content_start_idx = idx + 1
                break

    # Get markdown content without frontmatter
    markdown_text = "\n".join(markdown_lines[content_start_idx:]).strip()

    # Parse sentences from markdown content
    sentences = []
    if markdown_text:
        for paragraph in process(markdown_text):
            for sentence in paragraph:
                # Convert token sequence to string and clean it
                sentence_str = ""
                for token in sentence:
                    sentence_str += token.spacing + token.value

                sentence_str = sentence_str.strip()

                if sentence_str:
                    # Skip HTML content that might still be in markdown
                    if not sentence_str.startswith("<") and not sentence_str.endswith(">"):
                        # Skip image placeholders - match any markdown image syntax
                        if re.search(r"!\[.*?\]\(.*?\)", sentence_str):
                            continue

                        # Remove leading # marks (markdown headers)
                        while sentence_str.startswith("#"):
                            sentence_str = sentence_str[1:]
                        sentence_str = sentence_str.strip()

                        # Remove leading "- " for unordered lists
                        if sentence_str.startswith("- "):
                            sentence_str = sentence_str[2:]

                        sentence_str = sentence_str.strip()

                        if sentence_str:  # Only add if there's still content after cleaning
                            sentences.append(sentence_str)

    # Add a few random ordering tests
    all_indexes = list(range(len(sentences)))
    random_gen.shuffle(all_indexes)
    random_pairs = [(all_indexes[i * 2], all_indexes[i * 2 + 1]) for i in range(len(all_indexes) // 2)]
    random_pairs = [(min(i, j), max(i, j)) for (i, j) in random_pairs]

    num_order_tests = 0
    for i, j in random_pairs:
        first_sentence = sentences[i]
        second_sentence = sentences[j]

        if len(first_sentence) < 5 or len(second_sentence) < 5:
            continue

        if "\n" in first_sentence:
            first_sentence = first_sentence.split("\n")[0].strip()
        if "\n" in second_sentence:
            second_sentence = second_sentence.split("\n")[0].strip()

        max_diffs = round(max(len(first_sentence), len(second_sentence)) * 0.02)

        # Too big of a length discrepancy causes issues
        if max_diffs > len(first_sentence) // 4 or max_diffs > len(second_sentence) // 4:
            continue

        tests.append(
            {
                "pdf": pdf_filename,
                "page": 1,
                "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}",
                "type": TestType.ORDER.value,
                "before": first_sentence,
                "after": second_sentence,
                "max_diffs": max_diffs,
            }
        )
        num_order_tests += 1

        if num_order_tests > 5:
            break

    # Step 4: Generate math tests for LaTeX equations from the markdown

    # Define math patterns to search for
    math_patterns = [
        (r"\$\$(.+?)\$\$", re.DOTALL),  # $$...$$ (multiline)
        (r"\\\((.+?)\\\)", re.DOTALL),  # \(...\) (multiline)
        (r"\\\[(.+?)\\\]", re.DOTALL),  # \[...\] (multiline)
    ]

    math_equations = []
    for pattern, flags in math_patterns:
        matches = re.findall(pattern, markdown_content, flags)
        for match in matches:
            # Clean up the match - remove extra whitespace and newlines
            equation = match.strip()
            # Skip empty or very short equations
            if len(equation) > 2:
                math_equations.append(equation)

    # Remove duplicates while preserving order
    seen = set()
    unique_equations = []
    for eq in math_equations:
        if eq not in seen:
            seen.add(eq)
            unique_equations.append(eq)

    # Create math tests for up to 50 unique equations
    for i, equation in enumerate(unique_equations[:50]):
        tests.append(
            {
                "pdf": pdf_filename,
                "page": 1,
                "id": f"{pdf_id}_math_{uuid.uuid4().hex[:8]}",
                "type": "math",
                "math": equation,
                "max_diffs": 0,
                "ignore_dollar_delimited": True,
            }
        )

    # Final test filtering stage

    # Double check that the absence tests don't find any matches in the markdown_text;
    # if they do, filter them out
    tests = [t for t in tests if t["type"] != "absent" or t["text"] not in markdown_text]

    # Remove any tests where text-based fields have no alphanumeric characters, contain LaTeX, or contain Unicode super/subscripts
    text_fields = ["text", "cell", "before", "after", "up", "down", "left", "right", "top_heading", "left_heading"]

    def contains_alphanumeric(value):
        return any(c.isalnum() for c in value) if isinstance(value, str) else False

    def contains_latex(value):
        if not isinstance(value, str):
            return False
        # Check for LaTeX delimiters
        latex_patterns = [r"\(", r"\)", r"\[", r"\]"]
        return any(pattern in value for pattern in latex_patterns)

    def contains_unicode_super_or_subscripts(value):
        if not isinstance(value, str):
            return False

        # Unicode ranges for superscripts and subscripts
        superscript_chars = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿⁱ"
        subscript_chars = "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜ"

        return any(c in superscript_chars or c in subscript_chars for c in value)

    filtered_tests = []
    for test in tests:
        # Math tests should not be filtered for LaTeX content
        if test.get("type") == "math":
            filtered_tests.append(test)
            continue

        # Check all text fields in the test for alphanumeric content, LaTeX, and Unicode super/subscripts
        all_valid = True
        for field in text_fields:
            if field in test:
                # Skip test if field has no alphanumeric characters
                if not contains_alphanumeric(test[field]):
                    all_valid = False
                    break
                # Skip test if field contains LaTeX delimiters
                if contains_latex(test[field]):
                    all_valid = False
                    break
                # Skip test if field contains Unicode super or subscripts
                if contains_unicode_super_or_subscripts(test[field]):
                    all_valid = False
                    break
        if all_valid:
            filtered_tests.append(test)

    tests = filtered_tests

    # Remove duplicate tests (identical on everything but the id field)
    unique_tests = []
    test_signatures = set()

    for test in tests:
        # Create a signature for the test by using all fields except 'id'
        test_dict = test.copy()
        test_dict.pop("id")

        # Convert dict to a sorted tuple of items for hashability
        test_signature = tuple(sorted((k, str(v)) for k, v in test_dict.items()))

        # Only add the test if we haven't seen an identical one
        if test_signature not in test_signatures:
            test_signatures.add(test_signature)
            unique_tests.append(test)

    return unique_tests
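
# Illustrative example of one emitted ordering-test row (field values are hypothetical):
#   {"pdf": "pdf_00042_page3.pdf", "page": 1, "id": "pdf_00042_order_1a2b3c4d",
#    "type": "order", "before": "An early sentence.", "after": "A later sentence.", "max_diffs": 0}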


async def process_pdf(pdf_info, args, client, pdf_filter=None):
    """Process a single PDF, render a random page, and create an HTML template."""
    pdf_path, index = pdf_info

    # Create a unique folder for each PDF in the temp directory
    pdf_id = f"pdf_{index:05d}"
    temp_pdf_dir = os.path.join(args.temp_dir, pdf_id)
    os.makedirs(temp_pdf_dir, exist_ok=True)

    # Determine if we should log table test verification
    verbose_table_testing = args.verbose

    # Download PDF to local temp directory (or copy if local)
    local_pdf_path = os.path.join(temp_pdf_dir, "document.pdf")
    if not download_s3_pdf(pdf_path, local_pdf_path):
        print(f"Failed to download/copy PDF from {pdf_path}")
        return None

    # Apply filter if enabled
    if pdf_filter and pdf_filter.filter_out_pdf(local_pdf_path):
        print(f"PDF filtered out: {pdf_path}")
        return None

    # Seed with SHA1 hash of PDF contents for reproducibility
    with open(local_pdf_path, "rb") as f:
        pdf_content = f.read()
    pdf_hash = hashlib.sha1(pdf_content).hexdigest()

    # Use the first 8 characters of the hash as an integer seed
    seed = int(pdf_hash[:8], 16)
    random_generator = random.Random(seed)

    try:
        # Get page count using pypdf
        reader = pypdf.PdfReader(local_pdf_path)
        num_pages = len(reader.pages)

        if num_pages == 0:
            print(f"PDF has no pages: {pdf_path}")
            return None

        # Select a random page
        page_num = random_generator.randint(1, num_pages)

        # Render the page as a base64 PNG (run in thread pool since it's blocking I/O)
        loop = asyncio.get_event_loop()
        image_base64 = await loop.run_in_executor(None, render_pdf_to_base64png, local_pdf_path, page_num, 1024)

        # Generate HTML from the image
        html_content = await generate_html_from_image(client, image_base64)
        if not html_content:
            print(f"Failed to generate HTML for {pdf_path}, page {page_num}")
            return None

        # Add git commit meta tag if available
        git_commit = get_git_commit_hash()
        if git_commit:
            # Parse the HTML to add the meta tag in the head section
            html_soup = BeautifulSoup(html_content, "html.parser")

            # Only add meta tag if head element exists
            head = html_soup.find("head")
            if head:
                # Add meta tag with git commit
                meta_tag = html_soup.new_tag("meta", attrs={"name": "olmocr_git_commit", "content": git_commit})
                head.insert(0, meta_tag)

                # Update html_content with the modified version
                html_content = str(html_soup)

        # Create output directories
        html_dir = os.path.join(args.output_dir, "html", args.name)
        pdfs_dir = os.path.join(args.output_dir, "pdfs", args.name)
        training_dir = os.path.join(args.output_dir, "training", args.name)
        bench_data_dir = os.path.join(args.output_dir, "bench_data")
        bench_synthetic_dir = os.path.join(bench_data_dir, "pdfs", args.name)
        claude_original_dir = os.path.join(bench_data_dir, "claude_original", args.name)
        os.makedirs(html_dir, exist_ok=True)
        os.makedirs(pdfs_dir, exist_ok=True)
        os.makedirs(training_dir, exist_ok=True)
        os.makedirs(bench_data_dir, exist_ok=True)
        os.makedirs(bench_synthetic_dir, exist_ok=True)
        os.makedirs(claude_original_dir, exist_ok=True)

        # Save HTML to output directory
        html_path = os.path.join(html_dir, f"{pdf_id}_page{page_num}.html")
        with open(html_path, "w") as f:
            f.write(html_content)

        # Convert HTML to markdown with FrontMatter and save
        markdown_content = html_to_markdown_with_frontmatter(html_content)
        markdown_filename = f"{pdf_id}_page{page_num}.md"
        markdown_path = os.path.join(training_dir, markdown_filename)
        with open(markdown_path, "w") as f:
            f.write(markdown_content)

        # Create soft link to PDF in training directory
        pdf_link_name = f"{pdf_id}_page{page_num}.pdf"
        pdf_link_path = os.path.join(training_dir, pdf_link_name)
        # Remove existing link if it exists
        if os.path.exists(pdf_link_path) or os.path.islink(pdf_link_path):
            os.remove(pdf_link_path)
        # Create relative symlink from training to pdfs directory
        os.symlink(os.path.relpath(os.path.join(pdfs_dir, f"{pdf_id}_page{page_num}.pdf"), training_dir), pdf_link_path)

        # Create soft link to markdown in claude_original/synthetic with new naming scheme
        claude_md_link_name = f"{pdf_id}_page{page_num}_pg1_repeat1.md"
        claude_md_link_path = os.path.join(claude_original_dir, claude_md_link_name)
        # Remove existing link if it exists
        if os.path.exists(claude_md_link_path) or os.path.islink(claude_md_link_path):
            os.remove(claude_md_link_path)
        # Create relative symlink from claude_original/synthetic to training directory
        os.symlink(os.path.relpath(markdown_path, claude_original_dir), claude_md_link_path)

        # Extract the page and save as PDF
        original_pdf_path = os.path.join(pdfs_dir, f"{pdf_id}_page{page_num}_original.pdf")
        if not extract_page_from_pdf(local_pdf_path, original_pdf_path, page_num):
            print(f"Failed to extract page {page_num} from {local_pdf_path}")

        # Render PDF using Playwright if not skipped
        playwright_pdf_path = None
        render_success = False
        playwright_pdf_filename = f"{pdf_id}_page{page_num}.pdf"  # This will be used in the tests

        if not args.skip_playwright:
            playwright_pdf_path = os.path.join(pdfs_dir, playwright_pdf_filename)

            try:
                # Get PNG dimensions
                png_width, png_height = get_png_dimensions_from_base64(image_base64)

                # Run the async function directly since we're already in an async context
                render_success = await render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height)

                if render_success:
                    print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
                else:
                    print(f"Failed to render as a single page PDF: {playwright_pdf_path}")
                    playwright_pdf_path = None
            except Exception as e:
                print(f"Failed to render with Playwright: {e}")
                playwright_pdf_path = None
                render_success = False

        # If playwright rendering failed and was required, return None to skip this test
        if not args.skip_playwright and not render_success:
            return None

        # Create soft link in bench_data/synthetic/ directory
        if playwright_pdf_path:
            synthetic_link_path = os.path.join(bench_synthetic_dir, playwright_pdf_filename)
            # Remove existing link if it exists
            if os.path.exists(synthetic_link_path) or os.path.islink(synthetic_link_path):
                os.remove(synthetic_link_path)
            # Create relative symlink from bench_data/synthetic to pdfs directory
            os.symlink(os.path.relpath(playwright_pdf_path, bench_synthetic_dir), synthetic_link_path)

        # Generate tests from the HTML content
        # Use the playwright rendered PDF path for tests
        tests = generate_tests_from_html(html_content, pdf_id, page_num, random_generator, verbose_table_testing)

        # Update the PDF path in all tests to use the playwright rendered PDF with the specified name prefix
        for test in tests:
            test["pdf"] = f"{args.name}/{playwright_pdf_filename}"

        # Log table test stats if verbose
        if verbose_table_testing:
            table_tests = [t for t in tests if t["type"] == TestType.TABLE.value]
            print(f"Generated {len(table_tests)} table tests for {pdf_id}, page {page_num} (passed verification)")

        return {
            "pdf_id": pdf_id,
            "pdf_path": pdf_path,
            "page_number": page_num,
            "html_path": html_path,
            "markdown_path": markdown_path,
            "original_pdf_path": original_pdf_path,
            "playwright_pdf_path": playwright_pdf_path,
            "tests": tests,
            "num_tests": len(tests),
        }
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None
    finally:
        # Clean up temp directory for this PDF
        if os.path.exists(temp_pdf_dir):
            shutil.rmtree(temp_pdf_dir, ignore_errors=True)


async def main():
    # Configure logging to suppress httpx messages
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)

    parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates and render with Playwright")
    parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths or local paths to PDFs")
    parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    parser.add_argument("--parallel", type=int, default=1, help="Number of parallel tasks to use")
    parser.add_argument("--api_key", help="Claude API key (or set ANTHROPIC_API_KEY environment variable)")
    parser.add_argument("--skip_playwright", action="store_true", help="Skip Playwright PDF rendering")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose output including table test verification")
    parser.add_argument("--filter", action="store_true", help="Apply PDF filtering to remove forms, spam, and non-English content")
    parser.add_argument("--name", default="synthetic", help="Name for the output JSONL file and subfolder (default: synthetic)")
    args = parser.parse_args()

    # Ensure output and temp directories exist
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.temp_dir, exist_ok=True)

    # Get API key
    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: API key not provided. Use --api_key or set ANTHROPIC_API_KEY environment variable.")
        return

    # Initialize async Claude client
    client = AsyncAnthropic(api_key=api_key)

    # Initialize PDF filter if enabled
    pdf_filter = None
    if args.filter:
        pdf_filter = PdfFilter(
            languages_to_keep={Language.ENGLISH, None},  # None means the language could not be detected; keep it, since it may be a scan that needs OCR
            apply_download_spam_check=True,
            apply_form_check=True,
        )
        print("PDF filtering enabled")

    # Reservoir sampling implementation
    random_gen = random.Random(42)
    pdf_paths = []

    if os.path.isdir(args.input_list):
        pdf_paths = list(glob.glob(os.path.join(args.input_list, "*.pdf"), recursive=True))
    else:
        with open(args.input_list, "r") as f:
            for i, line in enumerate(tqdm(f)):
                line = line.strip()
                if not line:
                    continue

                if i < 100000:
                    pdf_paths.append(line)
                else:
                    # Randomly replace elements with decreasing probability
                    j = random_gen.randint(0, i)
                    if j < 100000:
                        pdf_paths[j] = line

    print(f"Found {len(pdf_paths)} PDF paths in input list")

    # Shuffle and limit to max_tests
    random_gen.shuffle(pdf_paths)
    pdf_paths = pdf_paths[: args.max_tests]

    # Initialize the JSONL file in bench_data folder with the specified name
    bench_data_dir = os.path.join(args.output_dir, "bench_data")
    os.makedirs(bench_data_dir, exist_ok=True)
    synthetic_json_path = os.path.join(bench_data_dir, f"{args.name}.jsonl")
    open(synthetic_json_path, "w").close()  # Create empty file

    # Initialize the metadata JSONL file
    metadata_dir = os.path.join(args.output_dir, "metadata")
    os.makedirs(metadata_dir, exist_ok=True)
    metadata_json_path = os.path.join(metadata_dir, f"{args.name}.jsonl")
    open(metadata_json_path, "w").close()  # Create empty file

    # Counters for test statistics
    test_counter = 0
    test_types = defaultdict(int)  # Automatically handles any test type
    results = []

    # Initialize an asyncio lock for file access
    file_lock = asyncio.Lock()

    # Process PDFs in parallel using asyncio
    async def process_with_progress(pdf_info):
        pdf_path = pdf_info[0]
        try:
            result = await process_pdf(pdf_info, args, client, pdf_filter)
            if result and result.get("tests"):
                # Append tests to the output JSONL as they're created
                async with file_lock:
                    # Append each test as a separate JSON line
                    with open(synthetic_json_path, "a") as f:
                        for test in result["tests"]:
                            f.write(json.dumps(test) + "\n")

                    # Write metadata mapping (pdf_id to source URL)
                    with open(metadata_json_path, "a") as f:
                        metadata = {"pdf_id": result["pdf_id"], "source_url": result["pdf_path"], "page_number": result["page_number"]}
                        f.write(json.dumps(metadata) + "\n")

                    # Update counters
                    nonlocal test_counter
                    test_counter += len(result["tests"])
                    for test in result["tests"]:
                        test_type = test.get("type", "unknown")
                        test_types[test_type] += 1

                print(f"Added {len(result['tests'])} tests from {result['pdf_id']}, total: {test_counter}")

            return result
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
            return None

    # Create tasks for all PDFs
    tasks = []
    for i, pdf_path in enumerate(pdf_paths):
        tasks.append(process_with_progress((pdf_path, i)))

    # Run tasks with limited concurrency
    semaphore = asyncio.Semaphore(args.parallel)

    async def bounded_task(task_coro):
        async with semaphore:
            return await task_coro

    bounded_tasks = [bounded_task(task) for task in tasks]

    # Process all tasks with progress bar
    pbar = tqdm(asyncio.as_completed(bounded_tasks), total=len(bounded_tasks), desc="Processing PDFs")
    for coro in pbar:
        result = await coro
        if result:
            results.append(result)
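
        # Pricing sketch (using the $3/MTok input and $15/MTok output rates assumed below):
        # e.g., 2,000,000 input tokens plus 200,000 output tokens would cost 2 * $3 + 0.2 * $15 = $9.00.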
        # Update progress bar with cost information
        cost_input = (total_input_tokens / 1_000_000) * 3.0  # $3 per million input tokens
        cost_output = (total_output_tokens / 1_000_000) * 15.0  # $15 per million output tokens
        total_cost = cost_input + cost_output
        pbar.set_postfix({"in_tokens": f"{total_input_tokens:,}", "out_tokens": f"{total_output_tokens:,}", "cost": f"${total_cost:.2f}"})

    print(f"Generated {len(results)} HTML templates")

    # Print summary of Playwright rendering results
    playwright_success = sum(1 for r in results if r and r.get("playwright_pdf_path"))
    if not args.skip_playwright:
        print(f"Playwright PDF rendering: {playwright_success}/{len(results)} successful")

    print(f"Saved {test_counter} tests to {synthetic_json_path}")

    # Print summary of generated tests
    print(f"Generated a total of {test_counter} tests across {len(results)} templates")

    # Print test type distribution
    if test_counter > 0:
        print("Test type distribution:")
        for test_type, count in test_types.items():
            print(f"  - {test_type}: {count} tests")

    # Print final Claude API cost summary
    print("\nClaude Sonnet API Usage Summary:")
    print(f"  Total input tokens: {total_input_tokens:,}")
    print(f"  Total output tokens: {total_output_tokens:,}")
    cost_input = (total_input_tokens / 1_000_000) * 3.0
    cost_output = (total_output_tokens / 1_000_000) * 15.0
    total_cost = cost_input + cost_output
    print(f"  Input cost: ${cost_input:.2f} ($3/MTok)")
    print(f"  Output cost: ${cost_output:.2f} ($15/MTok)")
    print(f"  Total cost: ${total_cost:.2f}")
if __name__ == "__main__":
|
|
asyncio.run(main())
|