Baseline tests for blanks

This commit is contained in:
Jake Poznanski 2025-08-25 22:01:24 +00:00
parent ad33672781
commit 6be12c2e06

View File

@ -5,7 +5,7 @@ import unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import List, Optional, Set, Tuple
from typing import Dict, List, Optional, Set, Tuple, Union
import numpy as np
from bs4 import BeautifulSoup
@ -130,7 +130,7 @@ def normalize_text(md_content: str) -> str:
md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
md_content = re.sub(r"_(.*?)_", r"\1", md_content)
# Convert down to a consistent unicode form, so == e + accent, unicode forms
# Convert down to a consistent unicode form, so é == e + accent, unicode forms
md_content = unicodedata.normalize("NFC", md_content)
# Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
@ -867,11 +867,22 @@ class BaselineTest(BasePDFTest):
"""
max_length: Optional[int] = None # Used to implement blank page checks
max_repeats: int = 30
check_disallowed_characters: bool = True
def run(self, content: str) -> Tuple[bool, str]:
if len("".join(c for c in content if c.isalnum()).strip()) == 0:
base_content_len = len("".join(c for c in content if c.isalnum()).strip())
# If this a blank page check, then it short circuits the rest of the checks
if self.max_length is not None:
if base_content_len > self.max_length:
return False, f"{base_content_len} characters were output for a page we expected to be blank"
else:
return True, ""
if base_content_len == 0:
return False, "The text contains no alpha numeric characters"
# Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
@ -965,6 +976,45 @@ class MathTest(BasePDFTest):
return False, f"No match found for {self.math} anywhere in content"
def load_single_test(data: Union[str, Dict]) -> BasePDFTest:
"""
Load a single test from a JSON line string or JSON object.
Args:
data: Either a JSON string to parse or a dictionary containing test data.
Returns:
A test object of the appropriate type.
Raises:
ValidationError: If the test type is unknown or data is invalid.
json.JSONDecodeError: If the string cannot be parsed as JSON.
"""
# Handle JSON string input
if isinstance(data, str):
data = data.strip()
if not data:
raise ValueError("Empty string provided")
data = json.loads(data)
# Process the test data
test_type = data.get("type")
if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
test = TextPresenceTest(**data)
elif test_type == TestType.ORDER.value:
test = TextOrderTest(**data)
elif test_type == TestType.TABLE.value:
test = TableTest(**data)
elif test_type == TestType.MATH.value:
test = MathTest(**data)
elif test_type == TestType.BASELINE.value:
test = BaselineTest(**data)
else:
raise ValidationError(f"Unknown test type: {test_type}")
return test
def load_tests(jsonl_file: str) -> List[BasePDFTest]:
"""
Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.
@ -976,7 +1026,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
A list of test objects.
"""
def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
def process_line_with_number(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
"""
Process a single line from the JSONL file and return a tuple of (line_number, test object).
Returns None for empty lines.
@ -987,20 +1037,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
return None
try:
data = json.loads(line)
test_type = data.get("type")
if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
test = TextPresenceTest(**data)
elif test_type == TestType.ORDER.value:
test = TextOrderTest(**data)
elif test_type == TestType.TABLE.value:
test = TableTest(**data)
elif test_type == TestType.MATH.value:
test = MathTest(**data)
elif test_type == TestType.BASELINE.value:
test = BaselineTest(**data)
else:
raise ValidationError(f"Unknown test type: {test_type}")
test = load_single_test(line)
return (line_number, test)
except json.JSONDecodeError as e:
print(f"Error parsing JSON on line {line_number}: {e}")
@ -1021,7 +1058,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
# Use a ThreadPoolExecutor to process each line in parallel.
with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor:
# Submit all tasks concurrently.
futures = {executor.submit(process_line, item): item[0] for item in lines}
futures = {executor.submit(process_line_with_number, item): item[0] for item in lines}
# Use tqdm to show progress as futures complete.
for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"):
result = future.result()