diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index 75c57ca..17a4846 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -487,38 +487,29 @@ def process_pdf(pdf_info, args, client):
print(f"Failed to generate HTML for {s3_path}, page {page_num}")
return None
- # Create output directory
- templates_dir = os.path.join(args.output_dir, "templates")
- os.makedirs(templates_dir, exist_ok=True)
+ # Create output directories
+ html_dir = os.path.join(args.output_dir, "html")
+ pdfs_dir = os.path.join(args.output_dir, "pdfs")
+ os.makedirs(html_dir, exist_ok=True)
+ os.makedirs(pdfs_dir, exist_ok=True)
# Save HTML to output directory
- html_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.html")
+ html_path = os.path.join(html_dir, f"{pdf_id}_page{page_num}.html")
with open(html_path, "w") as f:
f.write(html_content)
- # Generate tests from the HTML content
- tests = generate_tests_from_html(html_content, pdf_id, page_num)
-
- # Save tests to a JSONL file
- tests_dir = os.path.join(args.output_dir, "tests")
- os.makedirs(tests_dir, exist_ok=True)
- tests_path = os.path.join(tests_dir, f"{pdf_id}_page{page_num}_tests.jsonl")
- with open(tests_path, "w") as f:
- for test in tests:
- f.write(json.dumps(test) + "\n")
- print(f"Generated {len(tests)} tests for {pdf_id}, page {page_num}")
-
# Extract the page and save as PDF
- pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.pdf")
- if not extract_page_from_pdf(local_pdf_path, pdf_path, page_num):
+ original_pdf_path = os.path.join(pdfs_dir, f"{pdf_id}_page{page_num}_original.pdf")
+ if not extract_page_from_pdf(local_pdf_path, original_pdf_path, page_num):
print(f"Failed to extract page {page_num} from {local_pdf_path}")
# Render PDF using Playwright if not skipped
playwright_pdf_path = None
render_success = False
+ playwright_pdf_filename = f"{pdf_id}_page{page_num}.pdf" # This will be used in the tests
if not args.skip_playwright:
- playwright_pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}_playwright.pdf")
+ playwright_pdf_path = os.path.join(pdfs_dir, playwright_pdf_filename)
try:
# Get PNG dimensions
@@ -531,10 +522,6 @@ def process_pdf(pdf_info, args, client):
print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
else:
print(f"Failed to render as a single page PDF: {playwright_pdf_path}")
- # Remove the tests if we couldn't render a proper single-page PDF
- if os.path.exists(tests_path):
- os.remove(tests_path)
- print(f"Removed tests for {pdf_id} due to rendering failure")
playwright_pdf_path = None
except Exception as e:
print(f"Failed to render with Playwright: {e}")
@@ -544,15 +531,23 @@ def process_pdf(pdf_info, args, client):
# If playwright rendering failed and was required, return None to skip this test
if not args.skip_playwright and not render_success:
return None
-
+
+ # Generate tests from the HTML content
+ # Use the playwright rendered PDF path for tests
+ tests = generate_tests_from_html(html_content, pdf_id, page_num)
+
+ # Update the PDF path in all tests to use the playwright rendered PDF
+ for test in tests:
+ test["pdf"] = playwright_pdf_filename
+
return {
"pdf_id": pdf_id,
"s3_path": s3_path,
"page_number": page_num,
"html_path": html_path,
- "pdf_path": pdf_path,
+ "original_pdf_path": original_pdf_path,
"playwright_pdf_path": playwright_pdf_path,
- "tests_path": tests_path,
+ "tests": tests,
"num_tests": len(tests),
}
except Exception as e:
@@ -609,9 +604,21 @@ def main():
# Shuffle and limit to max_tests
random.shuffle(s3_paths)
s3_paths = s3_paths[: args.max_tests]
+
+    # Initialize synthetic.jsonl as a JSONL file (empty initially)
+ synthetic_json_path = os.path.join(args.output_dir, "synthetic.jsonl")
+ open(synthetic_json_path, "w").close() # Create empty file
+
+ # Counter for test statistics
+ test_counter = 0
+ test_types = {"present": 0, "absent": 0, "table": 0, "order": 0}
+ results = []
+
+ # Initialize a threading lock for file access
+ import threading
+ file_lock = threading.Lock()
# Process PDFs in parallel
- results = []
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
# Submit all tasks
futures = {executor.submit(process_pdf, (s3_path, i), args, client): s3_path for i, s3_path in enumerate(s3_paths)}
@@ -621,8 +628,24 @@ def main():
s3_path = futures[future]
try:
result = future.result()
- if result:
+ if result and result.get("tests"):
results.append(result)
+
+                        # Append tests to synthetic.jsonl as they're created (JSONL format)
+ with file_lock:
+ # Append each test as a separate JSON line
+ with open(synthetic_json_path, "a") as f:
+ for test in result["tests"]:
+ f.write(json.dumps(test) + "\n")
+
+ # Update counters
+ test_counter += len(result["tests"])
+ for test in result["tests"]:
+ test_type = test.get("type", "")
+ if test_type in test_types:
+ test_types[test_type] += 1
+
+ print(f"Added {len(result['tests'])} tests from {result['pdf_id']}, total: {test_counter}")
except Exception as e:
print(f"Error processing {s3_path}: {e}")
@@ -632,29 +655,15 @@ def main():
playwright_success = sum(1 for r in results if r and r.get("playwright_pdf_path"))
if not args.skip_playwright:
print(f"Playwright PDF rendering: {playwright_success}/{len(results)} successful")
-
+
+ print(f"Saved {test_counter} tests to {synthetic_json_path}")
+
# Print summary of generated tests
- total_tests = sum(r.get("num_tests", 0) for r in results if r)
- print(f"Generated a total of {total_tests} tests across {len(results)} templates")
+ print(f"Generated a total of {test_counter} tests across {len(results)} templates")
- # Optional: Collect and display test type statistics
- if total_tests > 0:
- # Count the tests by type from a sample of result files
- test_types = {"present": 0, "absent": 0, "table": 0, "order": 0}
- for r in results[: min(10, len(results))]:
- if r and r.get("tests_path"):
- try:
- with open(r.get("tests_path"), "r") as f:
- for line in f:
- test = json.loads(line)
- test_type = test.get("type", "")
- if test_type in test_types:
- test_types[test_type] += 1
- except Exception as e:
- print(f"Error reading test file {r.get('tests_path')}: {e}")
-
- # Print test type distribution for the sample
- print("Test type distribution (from sample):")
+ # Print test type distribution
+ if test_counter > 0:
+ print("Test type distribution:")
for test_type, count in test_types.items():
print(f" - {test_type}: {count} tests")