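"""Mine HTML templates and benchmark tests from PDFs.

For each input PDF (given as an S3 path), this script downloads the file, picks a
random page, renders it to a PNG, asks Claude to reconstruct the page as semantic
HTML, re-renders that HTML to a single-page PDF with Playwright, and emits JSONL
test cases (text presence/absence, reading order, and table-structure tests) for
the olmocr benchmark.

Example invocation (paths are illustrative):

    python <this script> --input_list pdf_paths.txt --output_dir out --parallel 4
"""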
import argparse
import asyncio
import concurrent.futures
import json
import os
import random
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List

import pypdf
from anthropic import Anthropic
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from syntok.segmenter import process
from tqdm import tqdm

from olmocr.bench.tests import (
    TestType,
)
from olmocr.data.renderpdf import (
    get_png_dimensions_from_base64,
    render_pdf_to_base64png,
)


def download_s3_pdf(s3_path, local_path):
    """Download a PDF from S3 to a local path."""
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    result = subprocess.run(["aws", "s3", "cp", s3_path, local_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return result.returncode == 0


def generate_html_from_image(client, image_base64):
    """Call Claude API to generate HTML from an image using a multi-step prompting strategy."""
    png_width, png_height = get_png_dimensions_from_base64(image_base64)

    try:
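        # Two-pass prompting: a low-temperature analysis pass first describes the page
        # layout (especially the column count), and its output is pasted into the
        # HTML-generation prompt below so the second pass preserves that structure.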
        # Step 1: Initial analysis and column detection
        analysis_response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=2000,
            temperature=0.1,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {
                            "type": "text",
                            "text": "Analyze this document and provide a detailed assessment of its structure. Focus specifically on:\n"
                            "1. How many columns does the document have? Is it single-column, two-column, three-column, or a mixed layout?\n"
                            "2. What are the main sections and content types (headings, paragraphs, lists, tables, images, etc.)?\n"
                            "3. Does it have headers, footers, page numbers, or other special elements?\n"
                            "4. Is there any complex formatting that would be challenging to reproduce in HTML?\n\n"
                            "Please be very precise about the number of columns and how they're arranged.",
                        },
                    ],
                }
            ],
        )

        analysis_text = ""
        for content in analysis_response.content:
            if content.type == "text":
                analysis_text += content.text

        # Step 2: Initial HTML generation with detailed layout instructions
        initial_response = client.messages.create(
            model="claude-3-7-sonnet-20250219",
            max_tokens=6000,
            temperature=0.2,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                        {
                            "type": "text",
                            "text": "Render this document as clean, semantic HTML. Here's my analysis of the document structure:\n\n"
                            f"{analysis_text}\n\n"
                            "Important requirements:\n"
                            "1. Use appropriate HTML tags for elements like headings, paragraphs, lists, tables, etc.\n"
                            "2. Use the <header> and <footer> tags to represent content at the top/bottom which would not normally be part of the main content, such as page numbers, etc.\n"
                            "3. Use a placeholder <div> tag with class 'image' which will render as a grey box with black outline to make sure images have their original size, shape, and position on the page.\n"
                            "4. CRITICAL: If the document has a multi-column layout, you MUST preserve the exact same number of columns in your HTML. Use CSS flexbox or grid to create the columns.\n"
                            "5. Focus on creating valid, accessible HTML that preserves the appearance and formatting of the original page as closely as possible.\n"
                            f"6. The webpage will be viewed with a fixed viewport size of {png_width // 2} pixels wide by {png_height // 2} pixels tall.\n\n"
                            "7. For multi-column layouts, use explicit CSS. The most important aspect is preserving the column structure of the original document - this is critical.\n\n"
                            "Enclose your HTML in a ```html code block.",
                        },
                    ],
                }
            ],
        )

        # Extract initial HTML
        initial_html = ""
        for content in initial_response.content:
            if content.type == "text":
                initial_html += content.text

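        # The model was asked to wrap its HTML in a ```html fence; strip the fence
        # here. rfind locates the last closing fence, and if the model omitted it,
        # everything after the opening fence is kept as a fallback.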
        # Extract code block
        if "```html" in initial_html:
            start = initial_html.find("```html") + 7
            end = initial_html.rfind("```")
            if end > start:
                initial_html = initial_html[start:end].strip()
            else:
                initial_html = initial_html[start:].strip()
        elif "```" in initial_html:
            start = initial_html.find("```") + 3
            end = initial_html.rfind("```")
            if end > start:
                initial_html = initial_html[start:end].strip()
            else:
                initial_html = initial_html[start:].strip()

        return initial_html
    except Exception as e:
        print(f"Error calling Claude API: {e}")
        return None


def extract_page_from_pdf(input_path, output_path, page_num):
    """
    Extract a specific page from a PDF and save it as a new PDF.

    Args:
        input_path: Path to the input PDF
        output_path: Path to save the extracted page
        page_num: The page number to extract (1-indexed, converted to 0-indexed for pypdf)

    Returns:
        bool: True if extraction was successful, False otherwise
    """
    try:
        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Read the input PDF
        reader = pypdf.PdfReader(input_path)

        # Convert to 0-indexed for pypdf
        zero_idx_page = page_num - 1

        # Check if page number is valid
        if zero_idx_page >= len(reader.pages) or zero_idx_page < 0:
            print(f"Page number {page_num} out of range for {input_path} with {len(reader.pages)} pages")
            return False

        # Create a new PDF with just the selected page
        writer = pypdf.PdfWriter()
        writer.add_page(reader.pages[zero_idx_page])

        # Write the output PDF
        with open(output_path, "wb") as output_file:
            writer.write(output_file)

        return True
    except Exception as e:
        print(f"Error extracting page {page_num} from {input_path}: {str(e)}")
        return False


async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, png_height):
    """
    Render HTML content using Playwright and save it as PDF.
    Try different scale factors if needed to ensure the output is exactly one page.

    Args:
        html_content: HTML content to render
        output_pdf_path: Path to save the rendered PDF
        png_width: Width of the viewport
        png_height: Height of the viewport

    Returns:
        bool: True if rendering was successful with exactly one page, False otherwise
    """
    scale_factors = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]  # Try these scale factors in order
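    # page.pdf() paginates any content that overflows the page, so if the output has
    # more than one page we shrink both the viewport and the print scale and retry.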
    for scale in scale_factors:
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
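                # The page was rasterized at up to 2048 px on its longest side, and the
                # generation prompt promised a viewport of half those pixel dimensions;
                # the same halving (times the current scale factor) is applied here.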
                page = await browser.new_page(viewport={"width": int(png_width // 2 * scale), "height": int(png_height // 2 * scale)})

                # Set the HTML content
                await page.set_content(html_content)

                # Save as PDF with formatting options
                await page.pdf(
                    path=output_pdf_path,
                    scale=scale,
                    print_background=True,
                )

                await browser.close()

            # Check if the output PDF has exactly one page
            try:
                reader = pypdf.PdfReader(output_pdf_path)
                if len(reader.pages) == 1:
                    print(f"Successfully rendered as a single page PDF with scale factor {scale}")
                    return True
                else:
                    print(f"PDF has {len(reader.pages)} pages with scale factor {scale}, trying a smaller scale...")
                    # Continue to the next scale factor
            except Exception as pdf_check_error:
                print(f"Error checking PDF page count: {pdf_check_error}")
                return False

        except Exception as e:
            print(f"Error rendering PDF with Playwright at scale {scale}: {str(e)}")
            # Try the next scale factor

    print("Failed to render PDF as a single page with any scale factor")
    return False


def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> List[Dict]:
    """
    Generate tests from HTML content parsed from the PDF.

    Args:
        html_content: The HTML content of the page
        pdf_id: The unique identifier for the PDF
        page_num: The page number

    Returns:
        A list of test dictionaries that can be saved as JSONL
    """
    tests = []
    pdf_filename = f"{pdf_id}_page{page_num}.pdf"
    soup = BeautifulSoup(html_content, "html.parser")

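    # Text found in <header>/<footer> tags and page numbers becomes ABSENT tests: per
    # the generation prompt, those tags hold content outside the main body, which a
    # clean transcription of the page should omit.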
    # Step 1: Process headers, footers, and page numbers for TextAbsenceTests
    headers = soup.find_all("header")
    footers = soup.find_all("footer")
    page_numbers = soup.find_all("div", class_="page-number")

    # Function to create absence tests from text elements
    def create_absence_tests_from_elements(parent_element, element_type):
        # Find all text-containing elements within the parent
        text_elements = []

        # First get direct text nodes within spans, divs, p, and heading tags
        for tag in parent_element.find_all(["span", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]):
            text = tag.get_text().strip()
            if text:
                text_elements.append(text)

        # If no elements found, use the parent's text as a fallback
        if not text_elements:
            parent_text = parent_element.get_text().strip()
            if parent_text:
                text_elements.append(parent_text)

        # Create tests for each text element
        for text in text_elements:
            if len(text) > 3:  # Only create tests for meaningful text
                tests.append(
                    {
                        "pdf": pdf_filename,
                        "page": page_num,
                        "id": f"{pdf_id}_{element_type}_{uuid.uuid4().hex[:8]}",
                        "type": TestType.ABSENT.value,
                        "text": text,
                        "max_diffs": 5,
                    }
                )

    # Create TextAbsenceTests for headers
    for header in headers:
        create_absence_tests_from_elements(header, "header")

    # Create TextAbsenceTests for footers
    for footer in footers:
        create_absence_tests_from_elements(footer, "footer")

    # Create TextAbsenceTests for page numbers
    for page_number in page_numbers:
        page_number_text = page_number.get_text().strip()
        if page_number_text:
            tests.append(
                {
                    "pdf": pdf_filename,
                    "page": page_num,
                    "id": f"{pdf_id}_page_number_{uuid.uuid4().hex[:8]}",
                    "type": TestType.ABSENT.value,
                    "text": page_number_text,
                    "max_diffs": 5,
                }
            )

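    # Each table test records a sampled cell's text plus the text of its neighbours
    # (up/down/left/right) and its row/column headings, so a checker can verify that
    # the table's structure, not just its contents, survives conversion.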
    # Step 2: Generate tests from tables
    tables = soup.find_all("table")
    for table_idx, table in enumerate(tables):
        # Get all cells in the table
        cells = table.find_all(["td", "th"])

        # Skip empty tables or tables with very few cells
        if len(cells) < 4:
            continue

        # Generate tests for some randomly selected cells
        sampled_cells = random.sample(cells, min(3, len(cells)))

        for cell in sampled_cells:
            cell_text = cell.get_text().strip()
            if not cell_text or len(cell_text) < 3:
                continue

            # Find position of this cell in the table
            row = cell.find_parent("tr")
            rows = table.find_all("tr")
            row_idx = rows.index(row)

            # Find cells in this row
            row_cells = row.find_all(["td", "th"])
            col_idx = row_cells.index(cell)

            # Create a TableTest with relevant relationships
            test_data = {
                "pdf": pdf_filename,
                "page": page_num,
                "id": f"{pdf_id}_table{table_idx}_{uuid.uuid4().hex[:8]}",
                "type": TestType.TABLE.value,
                "cell": cell_text,
                "max_diffs": 5,
            }

            # Check cell up
            if row_idx > 0:
                prev_row = rows[row_idx - 1]
                prev_row_cells = prev_row.find_all(["td", "th"])
                if col_idx < len(prev_row_cells):
                    up_text = prev_row_cells[col_idx].get_text().strip()
                    if up_text:
                        test_data["up"] = up_text

            # Check cell down
            if row_idx < len(rows) - 1:
                next_row = rows[row_idx + 1]
                next_row_cells = next_row.find_all(["td", "th"])
                if col_idx < len(next_row_cells):
                    down_text = next_row_cells[col_idx].get_text().strip()
                    if down_text:
                        test_data["down"] = down_text

            # Check cell left
            if col_idx > 0:
                left_text = row_cells[col_idx - 1].get_text().strip()
                if left_text:
                    test_data["left"] = left_text

            # Check cell right
            if col_idx < len(row_cells) - 1:
                right_text = row_cells[col_idx + 1].get_text().strip()
                if right_text:
                    test_data["right"] = right_text

            # Check top heading (first row in the table or a header row)
            if row_idx > 0:
                header_row = rows[0]
                header_cells = header_row.find_all(["td", "th"])
                if col_idx < len(header_cells):
                    top_heading = header_cells[col_idx].get_text().strip()
                    if top_heading:
                        test_data["top_heading"] = top_heading

            # Check left heading (first column in the table)
            if col_idx > 0:
                left_heading = row_cells[0].get_text().strip()
                if left_heading:
                    test_data["left_heading"] = left_heading

            # Only add the test if we have at least one relation
            if len(test_data) > 6:  # 6 is the number of required fields
                tests.append(test_data)

    # Step 3: Generate TextPresenceTests for main body content
    # Make a copy of the soup for the main content
    main_soup = BeautifulSoup(str(soup), "html.parser")

    # Remove headers, footers, and tables from the main_soup
    for element in main_soup.find_all(["header", "footer", "table"]):
        element.extract()

    # Get all paragraphs and headings in the main content
    paragraphs = main_soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])

    # Sample a few paragraphs to use for presence tests
    if paragraphs:
        sampled_paragraphs = random.sample(paragraphs, min(5, len(paragraphs)))

        for paragraph in sampled_paragraphs:
            text = paragraph.get_text().strip()
            # Only create tests for paragraphs with sufficient content
            if text and len(text) > 20:
                tests.append(
                    {
                        "pdf": pdf_filename,
                        "page": page_num,
                        "id": f"{pdf_id}_text_{uuid.uuid4().hex[:8]}",
                        "type": TestType.PRESENT.value,
                        "text": text[:200],  # Limit to 200 chars to keep tests manageable
                        "max_diffs": 10,
                    }
                )

    # Generate some TextOrderTests for content that should appear in a specific order
    if len(paragraphs) >= 2:
        # Extract all text from the main content
        all_text = " ".join([p.get_text().strip() for p in paragraphs])

        # Use syntok to segment the text into sentences
        sentences = []
        for paragraph in process(all_text):
            for sentence in paragraph:
                # Convert token sequence to string and clean it
                sentence_text = " ".join([token.value for token in sentence]).strip()
                if sentence_text and len(sentence_text) > 10 and len(sentence_text) < 100:
                    sentences.append(sentence_text)

        # Create TextOrderTests from pairs of sentences that are at least 3 sentences apart
        # to ensure they're from different parts of the document
        if len(sentences) >= 5:
            num_tests = min(3, len(sentences) // 5)
            for _ in range(num_tests):
                # Get two random indices with sufficient distance between them
                i = random.randint(0, len(sentences) - 4)
                j = random.randint(i + 3, min(i + 10, len(sentences) - 1))

                first_sentence = sentences[i]
                second_sentence = sentences[j]

                tests.append(
                    {
                        "pdf": pdf_filename,
                        "page": page_num,
                        "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}",
                        "type": TestType.ORDER.value,
                        "before": first_sentence,
                        "after": second_sentence,
                        "max_diffs": 10,
                    }
                )

    return tests


def process_pdf(pdf_info, args, client):
    """Process a single PDF, render a random page, and create an HTML template."""
    s3_path, index = pdf_info

    # Create a unique folder for each PDF in the temp directory
    pdf_id = f"pdf_{index:05d}"
    temp_pdf_dir = os.path.join(args.temp_dir, pdf_id)
    os.makedirs(temp_pdf_dir, exist_ok=True)

    # Download PDF to local temp directory
    local_pdf_path = os.path.join(temp_pdf_dir, "document.pdf")
    if not download_s3_pdf(s3_path, local_pdf_path):
        print(f"Failed to download PDF from {s3_path}")
        return None

    try:
        # Get page count using pypdf
        reader = pypdf.PdfReader(local_pdf_path)
        num_pages = len(reader.pages)

        if num_pages == 0:
            print(f"PDF has no pages: {s3_path}")
            return None

        # Select a random page
        page_num = random.randint(1, num_pages)

        # Render the page as a base64 PNG
        image_base64 = render_pdf_to_base64png(local_pdf_path, page_num, target_longest_image_dim=2048)

        # Generate HTML from the image
        html_content = generate_html_from_image(client, image_base64)
        if not html_content:
            print(f"Failed to generate HTML for {s3_path}, page {page_num}")
            return None

        # Create output directory
        templates_dir = os.path.join(args.output_dir, "templates")
        os.makedirs(templates_dir, exist_ok=True)

        # Save HTML to output directory
        html_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.html")
        with open(html_path, "w") as f:
            f.write(html_content)

        # Generate tests from the HTML content
        tests = generate_tests_from_html(html_content, pdf_id, page_num)

        # Save tests to a JSONL file
        tests_dir = os.path.join(args.output_dir, "tests")
        os.makedirs(tests_dir, exist_ok=True)
        tests_path = os.path.join(tests_dir, f"{pdf_id}_page{page_num}_tests.jsonl")
        with open(tests_path, "w") as f:
            for test in tests:
                f.write(json.dumps(test) + "\n")
        print(f"Generated {len(tests)} tests for {pdf_id}, page {page_num}")

        # Extract the page and save as PDF
        pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.pdf")
        if not extract_page_from_pdf(local_pdf_path, pdf_path, page_num):
            print(f"Failed to extract page {page_num} from {local_pdf_path}")

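        # If the HTML cannot be re-rendered as a single-page PDF, the tests saved
        # above are deleted below: only templates that render cleanly to one page
        # are kept in the benchmark.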
        # Render PDF using Playwright if not skipped
        playwright_pdf_path = None
        render_success = False

        if not args.skip_playwright:
            playwright_pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}_playwright.pdf")

            try:
                # Get PNG dimensions
                png_width, png_height = get_png_dimensions_from_base64(image_base64)

                # Run the async function in the synchronous context
                render_success = asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))

                if render_success:
                    print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
                else:
                    print(f"Failed to render as a single page PDF: {playwright_pdf_path}")
                    # Remove the tests if we couldn't render a proper single-page PDF
                    if os.path.exists(tests_path):
                        os.remove(tests_path)
                        print(f"Removed tests for {pdf_id} due to rendering failure")
                    playwright_pdf_path = None
            except Exception as e:
                print(f"Failed to render with Playwright: {e}")
                playwright_pdf_path = None
                render_success = False

        # If playwright rendering failed and was required, return None to skip this test
        if not args.skip_playwright and not render_success:
            return None

        return {
            "pdf_id": pdf_id,
            "s3_path": s3_path,
            "page_number": page_num,
            "html_path": html_path,
            "pdf_path": pdf_path,
            "playwright_pdf_path": playwright_pdf_path,
            "tests_path": tests_path,
            "num_tests": len(tests),
        }
    except Exception as e:
        print(f"Error processing {s3_path}: {e}")
        return None
    finally:
        # Clean up temp directory for this PDF
        if os.path.exists(temp_pdf_dir):
            subprocess.run(["rm", "-rf", temp_pdf_dir])


def main():
    parser = argparse.ArgumentParser(description="Convert PDFs to HTML templates and render with Playwright")
    parser.add_argument("--input_list", required=True, help="Path to a file containing S3 paths to PDFs")
    parser.add_argument("--output_dir", required=True, help="Directory to store extracted pages and tests")
    parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
    parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of tests to generate")
    parser.add_argument("--parallel", type=int, default=1, help="Number of parallel threads to use")
    parser.add_argument("--api_key", help="Claude API key (or set ANTHROPIC_API_KEY environment variable)")
    parser.add_argument("--skip_playwright", action="store_true", help="Skip Playwright PDF rendering")
    args = parser.parse_args()

    # Ensure output and temp directories exist
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.temp_dir, exist_ok=True)

    # Get API key
    api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: API key not provided. Use --api_key or set ANTHROPIC_API_KEY environment variable.")
        return

    # Initialize Claude client
    client = Anthropic(api_key=api_key)

    # Reservoir sampling implementation
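    # This is Algorithm R with a reservoir of 100,000 paths: the first 100,000 lines
    # are kept, and each later line at index i replaces a random reservoir slot with
    # probability 100,000 / (i + 1), giving every line a roughly uniform chance of
    # being sampled without reading the whole list into memory.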
    s3_paths = []
    with open(args.input_list, "r") as f:
        for i, line in enumerate(tqdm(f)):
            line = line.strip()
            if not line:
                continue

            if i < 100000:
                s3_paths.append(line)
            else:
                # Randomly replace elements with decreasing probability
                j = random.randint(0, i)
                if j < 100000:
                    s3_paths[j] = line

    print(f"Found {len(s3_paths)} PDF paths in input list")

    # Shuffle and limit to max_tests
    random.shuffle(s3_paths)
    s3_paths = s3_paths[: args.max_tests]

    # Process PDFs in parallel
    results = []
    with ThreadPoolExecutor(max_workers=args.parallel) as executor:
        # Submit all tasks
        futures = {executor.submit(process_pdf, (s3_path, i), args, client): s3_path for i, s3_path in enumerate(s3_paths)}

        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing PDFs"):
            s3_path = futures[future]
            try:
                result = future.result()
                if result:
                    results.append(result)
            except Exception as e:
                print(f"Error processing {s3_path}: {e}")

    print(f"Generated {len(results)} HTML templates")

    # Print summary of Playwright rendering results
    playwright_success = sum(1 for r in results if r and r.get("playwright_pdf_path"))
    if not args.skip_playwright:
        print(f"Playwright PDF rendering: {playwright_success}/{len(results)} successful")

    # Print summary of generated tests
    total_tests = sum(r.get("num_tests", 0) for r in results if r)
    print(f"Generated a total of {total_tests} tests across {len(results)} templates")

    # Optional: Collect and display test type statistics
    if total_tests > 0:
        # Count the tests by type from a sample of result files
        test_types = {"present": 0, "absent": 0, "table": 0, "order": 0}
        for r in results[: min(10, len(results))]:
            if r and r.get("tests_path"):
                try:
                    with open(r.get("tests_path"), "r") as f:
                        for line in f:
                            test = json.loads(line)
                            test_type = test.get("type", "")
                            if test_type in test_types:
                                test_types[test_type] += 1
                except Exception as e:
                    print(f"Error reading test file {r.get('tests_path')}: {e}")

        # Print test type distribution for the sample
        print("Test type distribution (from sample):")
        for test_type, count in test_types.items():
            print(f"  - {test_type}: {count} tests")


if __name__ == "__main__":
    main()