diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index 75c57ca..17a4846 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -487,38 +487,29 @@ def process_pdf(pdf_info, args, client):
             print(f"Failed to generate HTML for {s3_path}, page {page_num}")
             return None
 
-        # Create output directory
-        templates_dir = os.path.join(args.output_dir, "templates")
-        os.makedirs(templates_dir, exist_ok=True)
+        # Create output directories
+        html_dir = os.path.join(args.output_dir, "html")
+        pdfs_dir = os.path.join(args.output_dir, "pdfs")
+        os.makedirs(html_dir, exist_ok=True)
+        os.makedirs(pdfs_dir, exist_ok=True)
 
         # Save HTML to output directory
-        html_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.html")
+        html_path = os.path.join(html_dir, f"{pdf_id}_page{page_num}.html")
         with open(html_path, "w") as f:
             f.write(html_content)
 
-        # Generate tests from the HTML content
-        tests = generate_tests_from_html(html_content, pdf_id, page_num)
-
-        # Save tests to a JSONL file
-        tests_dir = os.path.join(args.output_dir, "tests")
-        os.makedirs(tests_dir, exist_ok=True)
-        tests_path = os.path.join(tests_dir, f"{pdf_id}_page{page_num}_tests.jsonl")
-        with open(tests_path, "w") as f:
-            for test in tests:
-                f.write(json.dumps(test) + "\n")
-        print(f"Generated {len(tests)} tests for {pdf_id}, page {page_num}")
-
         # Extract the page and save as PDF
-        pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}.pdf")
-        if not extract_page_from_pdf(local_pdf_path, pdf_path, page_num):
+        original_pdf_path = os.path.join(pdfs_dir, f"{pdf_id}_page{page_num}_original.pdf")
+        if not extract_page_from_pdf(local_pdf_path, original_pdf_path, page_num):
             print(f"Failed to extract page {page_num} from {local_pdf_path}")
 
         # Render PDF using Playwright if not skipped
         playwright_pdf_path = None
         render_success = False
+        playwright_pdf_filename = f"{pdf_id}_page{page_num}.pdf"  # This will be used in the tests
 
         if not args.skip_playwright:
-            playwright_pdf_path = os.path.join(templates_dir, f"{pdf_id}_page{page_num}_playwright.pdf")
+            playwright_pdf_path = os.path.join(pdfs_dir, playwright_pdf_filename)
 
             try:
                 # Get PNG dimensions
@@ -531,10 +522,6 @@ def process_pdf(pdf_info, args, client):
                     print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
                 else:
                     print(f"Failed to render as a single page PDF: {playwright_pdf_path}")
-                    # Remove the tests if we couldn't render a proper single-page PDF
-                    if os.path.exists(tests_path):
-                        os.remove(tests_path)
-                        print(f"Removed tests for {pdf_id} due to rendering failure")
                     playwright_pdf_path = None
             except Exception as e:
                 print(f"Failed to render with Playwright: {e}")
@@ -544,15 +531,23 @@ def process_pdf(pdf_info, args, client):
         # If playwright rendering failed and was required, return None to skip this test
         if not args.skip_playwright and not render_success:
             return None
-
+
+        # Generate tests from the HTML content
+        # Use the playwright rendered PDF path for tests
+        tests = generate_tests_from_html(html_content, pdf_id, page_num)
+
+        # Update the PDF path in all tests to use the playwright rendered PDF
+        for test in tests:
+            test["pdf"] = playwright_pdf_filename
+
         return {
             "pdf_id": pdf_id,
             "s3_path": s3_path,
             "page_number": page_num,
             "html_path": html_path,
-            "pdf_path": pdf_path,
+            "original_pdf_path": original_pdf_path,
             "playwright_pdf_path": playwright_pdf_path,
-            "tests_path": tests_path,
+            "tests": tests,
             "num_tests": len(tests),
         }
     except Exception as e:
@@ -609,9 +604,21 @@ def main():
     # Shuffle and limit to max_tests
     random.shuffle(s3_paths)
     s3_paths = s3_paths[: args.max_tests]
+
+    # Initialize synthetic.jsonl as a JSONL file (empty initially)
+    synthetic_json_path = os.path.join(args.output_dir, "synthetic.jsonl")
+    open(synthetic_json_path, "w").close()  # Create empty file
+
+    # Counter for test statistics
+    test_counter = 0
+    test_types = {"present": 0, "absent": 0, "table": 0, "order": 0}
+    results = []
+
+    # Initialize a threading lock for file access
+    import threading
+    file_lock = threading.Lock()
 
     # Process PDFs in parallel
-    results = []
     with ThreadPoolExecutor(max_workers=args.parallel) as executor:
         # Submit all tasks
         futures = {executor.submit(process_pdf, (s3_path, i), args, client): s3_path for i, s3_path in enumerate(s3_paths)}
@@ -621,8 +628,24 @@ def main():
             s3_path = futures[future]
             try:
                 result = future.result()
-                if result:
+                if result and result.get("tests"):
                     results.append(result)
+
+                    # Append tests to synthetic.jsonl as they're created (JSONL format)
+                    with file_lock:
+                        # Append each test as a separate JSON line
+                        with open(synthetic_json_path, "a") as f:
+                            for test in result["tests"]:
+                                f.write(json.dumps(test) + "\n")
+
+                        # Update counters
+                        test_counter += len(result["tests"])
+                        for test in result["tests"]:
+                            test_type = test.get("type", "")
+                            if test_type in test_types:
+                                test_types[test_type] += 1
+
+                        print(f"Added {len(result['tests'])} tests from {result['pdf_id']}, total: {test_counter}")
             except Exception as e:
                 print(f"Error processing {s3_path}: {e}")
 
@@ -632,29 +655,15 @@ def main():
 
     playwright_success = sum(1 for r in results if r and r.get("playwright_pdf_path"))
    if not args.skip_playwright:
        print(f"Playwright PDF rendering: {playwright_success}/{len(results)} successful")
-
+
+    print(f"Saved {test_counter} tests to {synthetic_json_path}")
+
     # Print summary of generated tests
-    total_tests = sum(r.get("num_tests", 0) for r in results if r)
-    print(f"Generated a total of {total_tests} tests across {len(results)} templates")
+    print(f"Generated a total of {test_counter} tests across {len(results)} templates")
 
-    # Optional: Collect and display test type statistics
-    if total_tests > 0:
-        # Count the tests by type from a sample of result files
-        test_types = {"present": 0, "absent": 0, "table": 0, "order": 0}
-        for r in results[: min(10, len(results))]:
-            if r and r.get("tests_path"):
-                try:
-                    with open(r.get("tests_path"), "r") as f:
-                        for line in f:
-                            test = json.loads(line)
-                            test_type = test.get("type", "")
-                            if test_type in test_types:
-                                test_types[test_type] += 1
-                except Exception as e:
-                    print(f"Error reading test file {r.get('tests_path')}: {e}")
-
-        # Print test type distribution for the sample
-        print("Test type distribution (from sample):")
+    # Print test type distribution
+    if test_counter > 0:
+        print("Test type distribution:")
         for test_type, count in test_types.items():
             print(f"  - {test_type}: {count} tests")