Better quality synth data from both sides

2025-10-13 09:12:18 +00:00 · 2025-08-22 18:37:59 +00:00 · 2025-08-22 18:37:59 +00:00 · c1c83fd86c
commit c1c83fd86c
parent d9789947d5
1 changed file with 32 additions and 2 deletions
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -2,6 +2,7 @@ import argparse
 import asyncio
 import concurrent.futures
 import json
 import logging
 import os
 import random
 import re
@@ -18,7 +19,7 @@ from playwright.async_api import async_playwright
 from syntok.segmenter import process
 from tqdm import tqdm
-from olmocr.bench.tests import TableTest, TestType, parse_html_tables
+from olmocr.bench.tests import TableTest, TestType, parse_html_tables, load_single_test
 from olmocr.data.renderpdf import (
    get_png_dimensions_from_base64,
    render_pdf_to_base64png,
@@ -969,7 +970,36 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
            test_signatures.add(test_signature)
            unique_tests.append(test)
-    return unique_tests
+    # Validate each test against the markdown content
    validated_tests = []
    failed_test_count = 0
    # Get the markdown content for validation
    validation_markdown = markdown_content
    for test in unique_tests:
        try:
            # Create test object from the dictionary
            test_obj = load_single_test(test)
            # Run the test on the markdown content
            passed, error_msg = test_obj.run(validation_markdown)
            if passed:
                validated_tests.append(test)
            else:
                failed_test_count += 1
                if verbose_table_testing:
                    print(f"Test {test['id']} (type: {test['type']}) failed validation: {error_msg}")
        except Exception as e:
            failed_test_count += 1
            if verbose_table_testing:
                print(f"Test {test['id']} (type: {test['type']}) errored during validation: {str(e)}")
    if failed_test_count > 0:
        print(f"Filtered out {failed_test_count} tests that failed validation against markdown content for {pdf_id}")
    return validated_tests
 def process_pdf(pdf_info, args, client, pdf_filter=None):