Baseline tests for blanks

2025-10-11 16:22:29 +00:00 · 2025-08-25 22:01:24 +00:00 · 2025-08-25 22:01:24 +00:00 · 6be12c2e06
commit 6be12c2e06
parent ad33672781
1 changed files with 56 additions and 19 deletions
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@ -5,7 +5,7 @@ import unicodedata
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import asdict, dataclass, field
 from enum import Enum
-from typing import List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple, Union
 import numpy as np
 from bs4 import BeautifulSoup
@ -130,7 +130,7 @@ def normalize_text(md_content: str) -> str:
    md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
    md_content = re.sub(r"_(.*?)_", r"\1", md_content)
-    # Convert down to a consistent unicode form, so é == e + accent, unicode forms
+    # Convert down to a consistent unicode form, so é == e + accent, unicode forms
    md_content = unicodedata.normalize("NFC", md_content)
    # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
@ -867,11 +867,22 @@ class BaselineTest(BasePDFTest):
    """
    max_length: Optional[int] = None  # Used to implement blank page checks
    max_repeats: int = 30
    check_disallowed_characters: bool = True
    def run(self, content: str) -> Tuple[bool, str]:
-        if len("".join(c for c in content if c.isalnum()).strip()) == 0:
+        base_content_len = len("".join(c for c in content if c.isalnum()).strip())
        # If this a blank page check, then it short circuits the rest of the checks
        if self.max_length is not None:
            if base_content_len > self.max_length:
                return False, f"{base_content_len} characters were output for a page we expected to be blank"
            else:
                return True, ""
        if base_content_len == 0:
            return False, "The text contains no alpha numeric characters"
        # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
@ -965,29 +976,28 @@ class MathTest(BasePDFTest):
        return False, f"No match found for {self.math} anywhere in content"
-def load_tests(jsonl_file: str) -> List[BasePDFTest]:
+def load_single_test(data: Union[str, Dict]) -> BasePDFTest:
    """
-    Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.
+    Load a single test from a JSON line string or JSON object.
    Args:
-        jsonl_file: Path to the JSONL file containing test definitions.
+        data: Either a JSON string to parse or a dictionary containing test data.
    Returns:
-        A list of test objects.
+        A test object of the appropriate type.
    """
-    def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
+    Raises:
        ValidationError: If the test type is unknown or data is invalid.
        json.JSONDecodeError: If the string cannot be parsed as JSON.
    """
-        Process a single line from the JSONL file and return a tuple of (line_number, test object).
+    # Handle JSON string input
-        Returns None for empty lines.
+    if isinstance(data, str):
-        """
+        data = data.strip()
-        line_number, line = line_tuple
+        if not data:
-        line = line.strip()
+            raise ValueError("Empty string provided")
-        if not line:
+        data = json.loads(data)
            return None
-        try:
+    # Process the test data
            data = json.loads(line)
    test_type = data.get("type")
    if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
        test = TextPresenceTest(**data)
@ -1001,6 +1011,33 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
        test = BaselineTest(**data)
    else:
        raise ValidationError(f"Unknown test type: {test_type}")
    return test
 def load_tests(jsonl_file: str) -> List[BasePDFTest]:
    """
    Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.
    Args:
        jsonl_file: Path to the JSONL file containing test definitions.
    Returns:
        A list of test objects.
    """
    def process_line_with_number(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
        """
        Process a single line from the JSONL file and return a tuple of (line_number, test object).
        Returns None for empty lines.
        """
        line_number, line = line_tuple
        line = line.strip()
        if not line:
            return None
        try:
            test = load_single_test(line)
            return (line_number, test)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON on line {line_number}: {e}")
@ -1021,7 +1058,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
    # Use a ThreadPoolExecutor to process each line in parallel.
    with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor:
        # Submit all tasks concurrently.
-        futures = {executor.submit(process_line, item): item[0] for item in lines}
+        futures = {executor.submit(process_line_with_number, item): item[0] for item in lines}
        # Use tqdm to show progress as futures complete.
        for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"):
            result = future.result()