mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 16:22:29 +00:00
Baseline tests for blanks
This commit is contained in:
parent
ad33672781
commit
6be12c2e06
@ -5,7 +5,7 @@ import unicodedata
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from dataclasses import asdict, dataclass, field
|
from dataclasses import asdict, dataclass, field
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import List, Optional, Set, Tuple
|
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@ -130,7 +130,7 @@ def normalize_text(md_content: str) -> str:
|
|||||||
md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
|
md_content = re.sub(r"\*(.*?)\*", r"\1", md_content)
|
||||||
md_content = re.sub(r"_(.*?)_", r"\1", md_content)
|
md_content = re.sub(r"_(.*?)_", r"\1", md_content)
|
||||||
|
|
||||||
# Convert down to a consistent unicode form, so é == e + accent, unicode forms
|
# Convert down to a consistent unicode form, so é == e + accent, unicode forms
|
||||||
md_content = unicodedata.normalize("NFC", md_content)
|
md_content = unicodedata.normalize("NFC", md_content)
|
||||||
|
|
||||||
# Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
|
# Dictionary of characters to replace: keys are fancy characters, values are ASCII equivalents, unicode micro with greek mu comes up often enough too
|
||||||
@ -867,11 +867,22 @@ class BaselineTest(BasePDFTest):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
max_length: Optional[int] = None # Used to implement blank page checks
|
||||||
|
|
||||||
max_repeats: int = 30
|
max_repeats: int = 30
|
||||||
check_disallowed_characters: bool = True
|
check_disallowed_characters: bool = True
|
||||||
|
|
||||||
def run(self, content: str) -> Tuple[bool, str]:
|
def run(self, content: str) -> Tuple[bool, str]:
|
||||||
if len("".join(c for c in content if c.isalnum()).strip()) == 0:
|
base_content_len = len("".join(c for c in content if c.isalnum()).strip())
|
||||||
|
|
||||||
|
# If this a blank page check, then it short circuits the rest of the checks
|
||||||
|
if self.max_length is not None:
|
||||||
|
if base_content_len > self.max_length:
|
||||||
|
return False, f"{base_content_len} characters were output for a page we expected to be blank"
|
||||||
|
else:
|
||||||
|
return True, ""
|
||||||
|
|
||||||
|
if base_content_len == 0:
|
||||||
return False, "The text contains no alpha numeric characters"
|
return False, "The text contains no alpha numeric characters"
|
||||||
|
|
||||||
# Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
|
# Makes sure that the content has no egregious repeated ngrams at the end, which indicate a degradation of quality
|
||||||
@ -965,6 +976,45 @@ class MathTest(BasePDFTest):
|
|||||||
return False, f"No match found for {self.math} anywhere in content"
|
return False, f"No match found for {self.math} anywhere in content"
|
||||||
|
|
||||||
|
|
||||||
|
def load_single_test(data: Union[str, Dict]) -> BasePDFTest:
|
||||||
|
"""
|
||||||
|
Load a single test from a JSON line string or JSON object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: Either a JSON string to parse or a dictionary containing test data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A test object of the appropriate type.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValidationError: If the test type is unknown or data is invalid.
|
||||||
|
json.JSONDecodeError: If the string cannot be parsed as JSON.
|
||||||
|
"""
|
||||||
|
# Handle JSON string input
|
||||||
|
if isinstance(data, str):
|
||||||
|
data = data.strip()
|
||||||
|
if not data:
|
||||||
|
raise ValueError("Empty string provided")
|
||||||
|
data = json.loads(data)
|
||||||
|
|
||||||
|
# Process the test data
|
||||||
|
test_type = data.get("type")
|
||||||
|
if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
|
||||||
|
test = TextPresenceTest(**data)
|
||||||
|
elif test_type == TestType.ORDER.value:
|
||||||
|
test = TextOrderTest(**data)
|
||||||
|
elif test_type == TestType.TABLE.value:
|
||||||
|
test = TableTest(**data)
|
||||||
|
elif test_type == TestType.MATH.value:
|
||||||
|
test = MathTest(**data)
|
||||||
|
elif test_type == TestType.BASELINE.value:
|
||||||
|
test = BaselineTest(**data)
|
||||||
|
else:
|
||||||
|
raise ValidationError(f"Unknown test type: {test_type}")
|
||||||
|
|
||||||
|
return test
|
||||||
|
|
||||||
|
|
||||||
def load_tests(jsonl_file: str) -> List[BasePDFTest]:
|
def load_tests(jsonl_file: str) -> List[BasePDFTest]:
|
||||||
"""
|
"""
|
||||||
Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.
|
Load tests from a JSONL file using parallel processing with a ThreadPoolExecutor.
|
||||||
@ -976,7 +1026,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
|
|||||||
A list of test objects.
|
A list of test objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def process_line(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
|
def process_line_with_number(line_tuple: Tuple[int, str]) -> Optional[Tuple[int, BasePDFTest]]:
|
||||||
"""
|
"""
|
||||||
Process a single line from the JSONL file and return a tuple of (line_number, test object).
|
Process a single line from the JSONL file and return a tuple of (line_number, test object).
|
||||||
Returns None for empty lines.
|
Returns None for empty lines.
|
||||||
@ -987,20 +1037,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(line)
|
test = load_single_test(line)
|
||||||
test_type = data.get("type")
|
|
||||||
if test_type in {TestType.PRESENT.value, TestType.ABSENT.value}:
|
|
||||||
test = TextPresenceTest(**data)
|
|
||||||
elif test_type == TestType.ORDER.value:
|
|
||||||
test = TextOrderTest(**data)
|
|
||||||
elif test_type == TestType.TABLE.value:
|
|
||||||
test = TableTest(**data)
|
|
||||||
elif test_type == TestType.MATH.value:
|
|
||||||
test = MathTest(**data)
|
|
||||||
elif test_type == TestType.BASELINE.value:
|
|
||||||
test = BaselineTest(**data)
|
|
||||||
else:
|
|
||||||
raise ValidationError(f"Unknown test type: {test_type}")
|
|
||||||
return (line_number, test)
|
return (line_number, test)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
print(f"Error parsing JSON on line {line_number}: {e}")
|
print(f"Error parsing JSON on line {line_number}: {e}")
|
||||||
@ -1021,7 +1058,7 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
|
|||||||
# Use a ThreadPoolExecutor to process each line in parallel.
|
# Use a ThreadPoolExecutor to process each line in parallel.
|
||||||
with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor:
|
with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 1, 64)) as executor:
|
||||||
# Submit all tasks concurrently.
|
# Submit all tasks concurrently.
|
||||||
futures = {executor.submit(process_line, item): item[0] for item in lines}
|
futures = {executor.submit(process_line_with_number, item): item[0] for item in lines}
|
||||||
# Use tqdm to show progress as futures complete.
|
# Use tqdm to show progress as futures complete.
|
||||||
for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"):
|
for future in tqdm(as_completed(futures), total=len(futures), desc="Loading tests"):
|
||||||
result = future.result()
|
result = future.result()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user