diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index e23c5d2..2318812 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -2,6 +2,7 @@ import argparse import asyncio import concurrent.futures import json +import logging import os import random import re @@ -18,7 +19,7 @@ from playwright.async_api import async_playwright from syntok.segmenter import process from tqdm import tqdm -from olmocr.bench.tests import TableTest, TestType, parse_html_tables +from olmocr.bench.tests import TableTest, TestType, parse_html_tables, load_single_test from olmocr.data.renderpdf import ( get_png_dimensions_from_base64, render_pdf_to_base64png, @@ -969,7 +970,36 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb test_signatures.add(test_signature) unique_tests.append(test) - return unique_tests + # Validate each test against the markdown content + validated_tests = [] + failed_test_count = 0 + + # Get the markdown content for validation + validation_markdown = markdown_content + + for test in unique_tests: + try: + # Create test object from the dictionary + test_obj = load_single_test(test) + + # Run the test on the markdown content + passed, error_msg = test_obj.run(validation_markdown) + + if passed: + validated_tests.append(test) + else: + failed_test_count += 1 + if verbose_table_testing: + print(f"Test {test['id']} (type: {test['type']}) failed validation: {error_msg}") + except Exception as e: + failed_test_count += 1 + if verbose_table_testing: + print(f"Test {test['id']} (type: {test['type']}) errored during validation: {str(e)}") + + if failed_test_count > 0: + print(f"Filtered out {failed_test_count} tests that failed validation against markdown content for {pdf_id}") + + return validated_tests def process_pdf(pdf_info, args, client, pdf_filter=None):