From 6be12c2e06fe6036a968bfd05daca2ca29e4d3ed Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 25 Aug 2025 22:01:24 +0000 Subject: [PATCH] Baseline tests for blanks --- olmocr/bench/tests.py | 75 ++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py index 320d31a..dc461f0 100644 --- a/olmocr/bench/tests.py +++ b/olmocr/bench/tests.py @@ -5,7 +5,7 @@ import unicodedata from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import asdict, dataclass, field from enum import Enum -from typing import List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple, Union import numpy as np from bs4 import BeautifulSoup @@ -130,7 +130,7 @@ def normalize_text(md_content: str) -> str: md_content = re.sub(r"\*(.*?)\*", r"\1", md_content) md_content = re.sub(r"_(.*?)_", r"\1", md_content) - # Convert down to a consistent unicode form, so é == e + accent, unicode forms + # Convert down to a consistent unicode form, so é == e + accent, unicode forms md_content = unicodedata.normalize("NFC", md_content) # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too @@ -867,11 +867,22 @@ class BaselineTest(BasePDFTest): """ + max_length: Optional[int] = None # Used to implement blank page checks + max_repeats: int = 30 check_disallowed_characters: bool = True def run(self, content: str) -> Tuple[bool, str]: - if len("".join(c for c in content if c.isalnum()).strip()) == 0: + base_content_len = len("".join(c for c in content if c.isalnum()).strip()) + + # If this a blank page check, then it short circuits the rest of the checks + if self.max_length is not None: + if base_content_len > self.max_length: + return False, f"{base_content_len} characters were output for a page we expected to be blank" + else: + return True, "" + + if base_content_len == 0: return False, "The text contains no alpha numeric characters" # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality @@ -965,6 +976,45 @@ class MathTest(BasePDFTest): return False, f"No match found for {self.math} anywhere in content" +def load_single_test(data: Union[str, Dict]) -> BasePDFTest: + """ + Load a single test from a JSON line string or JSON object. + + Args: + data: Either a JSON string to parse or a dictionary containing test data. + + Returns: + A test object of the appropriate type. + + Raises: + ValidationError: If the test type is unknown or data is invalid. + json.JSONDecodeError: If the string cannot be parsed as JSON. + """ + # Handle JSON string input + if isinstance(data, str): + data = data.strip() + if not data: + raise ValueError("Empty string provided") + data = json.loads(data) + + # Process the test data + test_type = data.get("type") + if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}: + test = TextPresenceTest(**data) + elif test_type == TestType.ORDER.value: + test = TextOrderTest(**data) + elif test_type == TestType.TABLE.value: + test = TableTest(**data) + elif test_type == TestType.MATH.value: + test = MathTest(**data) + elif test_type == TestType.BASELINE.value: + test = BaselineTest(**data) + else: + raise ValidationError(f"Unknown test type: {test_type}") + + return test + + def load_tests(jsonl_file: str) -> List[BasePDFTest]: """ Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor. @@ -976,7 +1026,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: A list of test objects. """ - def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]: + def process_line_with_number(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]: """ Process a single line from the JSONL file and return a tuple of (line_number, test object). Returns None for empty lines. @@ -987,20 +1037,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: return None try: - data = json.loads(line) - test_type = data.get("type") - if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}: - test = TextPresenceTest(**data) - elif test_type == TestType.ORDER.value: - test = TextOrderTest(**data) - elif test_type == TestType.TABLE.value: - test = TableTest(**data) - elif test_type == TestType.MATH.value: - test = MathTest(**data) - elif test_type == TestType.BASELINE.value: - test = BaselineTest(**data) - else: - raise ValidationError(f"Unknown test type: {test_type}") + test = load_single_test(line) return (line_number, test) except json.JSONDecodeError as e: print(f"Error parsing JSON on line {line_number}: {e}") @@ -1021,7 +1058,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: # Use a ThreadPoolExecutor to process each line in parallel. with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor: # Submit all tasks concurrently. - futures = {executor.submit(process_line, item): item[0] for item in lines} + futures = {executor.submit(process_line_with_number, item): item[0] for item in lines} # Use tqdm to show progress as futures complete. for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"): result = future.result()