Baseline tests for blanks

This commit is contained in:
Jake Poznanski 2025-08-25 22:01:24 +00:00
parent ad33672781
commit 6be12c2e06

View File

@ -5,7 +5,7 @@ import unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import asdict, dataclass, field from dataclasses import asdict, dataclass, field
from enum import Enum from enum import Enum
from typing import List, Optional, Set, Tuple from typing import Dict, List, Optional, Set, Tuple, Union
import numpy as np import numpy as np
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -130,7 +130,7 @@ def normalize_text(md_content: str) -> str:
md_content = re.sub(r"\*(.*?)\*", r"\1", md_content) md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
md_content = re.sub(r"_(.*?)_", r"\1", md_content) md_content = re.sub(r"_(.*?)_", r"\1", md_content)
# Convert down to a consistent unicode form, so == e + accent, unicode forms # Convert down to a consistent unicode form, so é == e + accent, unicode forms
md_content = unicodedata.normalize("NFC", md_content) md_content = unicodedata.normalize("NFC", md_content)
# Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too # Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
@ -867,11 +867,22 @@ class BaselineTest(BasePDFTest):
""" """
max_length: Optional[int] = None # Used to implement blank page checks
max_repeats: int = 30 max_repeats: int = 30
check_disallowed_characters: bool = True check_disallowed_characters: bool = True
def run(self, content: str) -> Tuple[bool, str]: def run(self, content: str) -> Tuple[bool, str]:
if len("".join(c for c in content if c.isalnum()).strip()) == 0: base_content_len = len("".join(c for c in content if c.isalnum()).strip())
# If this is a blank page check, then it short circuits the rest of the checks
if self.max_length is not None:
if base_content_len > self.max_length:
return False, f"{base_content_len} characters were output for a page we expected to be blank"
else:
return True, ""
if base_content_len == 0:
return False, "The text contains no alpha numeric characters" return False, "The text contains no alpha numeric characters"
# Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality # Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
@ -965,29 +976,28 @@ class MathTest(BasePDFTest):
return False, f"No match found for {self.math} anywhere in content" return False, f"No match found for {self.math} anywhere in content"
def load_tests(jsonl_file: str) -> List[BasePDFTest]: def load_single_test(data: Union[str, Dict]) -> BasePDFTest:
""" """
Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor. Load a single test from a JSON line string or JSON object.
Args: Args:
jsonl_file: Path to the JSONL file containing test definitions. data: Either a JSON string to parse or a dictionary containing test data.
Returns: Returns:
A list of test objects. A test object of the appropriate type.
"""
def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]: Raises:
ValidationError: If the test type is unknown or data is invalid.
json.JSONDecodeError: If the string cannot be parsed as JSON.
ValueError: If an empty (or whitespace-only) string is provided.
""" """
Process a single line from the JSONL file and return a tuple of (line_number, test object). # Handle JSON string input
Returns None for empty lines. if isinstance(data, str):
""" data = data.strip()
line_number, line = line_tuple if not data:
line = line.strip() raise ValueError("Empty string provided")
if not line: data = json.loads(data)
return None
try: # Process the test data
data = json.loads(line)
test_type = data.get("type") test_type = data.get("type")
if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}: if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
test = TextPresenceTest(**data) test = TextPresenceTest(**data)
@ -1001,6 +1011,33 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
test = BaselineTest(**data) test = BaselineTest(**data)
else: else:
raise ValidationError(f"Unknown test type: {test_type}") raise ValidationError(f"Unknown test type: {test_type}")
return test
def load_tests(jsonl_file: str) -> List[BasePDFTest]:
"""
Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.
Args:
jsonl_file: Path to the JSONL file containing test definitions.
Returns:
A list of test objects.
"""
def process_line_with_number(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
"""
Process a single line from the JSONL file and return a tuple of (line_number, test object).
Returns None for empty lines.
"""
line_number, line = line_tuple
line = line.strip()
if not line:
return None
try:
test = load_single_test(line)
return (line_number, test) return (line_number, test)
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
print(f"Error parsing JSON on line {line_number}: {e}") print(f"Error parsing JSON on line {line_number}: {e}")
@ -1021,7 +1058,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
# Use a ThreadPoolExecutor to process each line in parallel. # Use a ThreadPoolExecutor to process each line in parallel.
with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor: with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor:
# Submit all tasks concurrently. # Submit all tasks concurrently.
futures = {executor.submit(process_line, item): item[0] for item in lines} futures = {executor.submit(process_line_with_number, item): item[0] for item in lines}
# Use tqdm to show progress as futures complete. # Use tqdm to show progress as futures complete.
for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"): for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"):
result = future.result() result = future.result()