mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-12 00:32:45 +00:00
Verifying bench loading
This commit is contained in:
parent
14f19e5d58
commit
09036b07d9
@ -14,6 +14,7 @@ import glob
|
|||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from rapidfuzz import distance
|
from rapidfuzz import distance
|
||||||
|
import sys
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -108,7 +109,21 @@ class OlmOCRBenchDataset(Dataset):
|
|||||||
if os.path.exists(claude_file_path):
|
if os.path.exists(claude_file_path):
|
||||||
try:
|
try:
|
||||||
with open(claude_file_path, 'r', encoding='utf-8') as f:
|
with open(claude_file_path, 'r', encoding='utf-8') as f:
|
||||||
return f.read()
|
content = f.read()
|
||||||
|
|
||||||
|
# Parse the frontmatter to validate the content
|
||||||
|
parser = FrontMatterParser(front_matter_class=PageResponse)
|
||||||
|
try:
|
||||||
|
front_matter, text = parser._extract_front_matter_and_text(content)
|
||||||
|
page_response = parser._parse_front_matter(front_matter, text)
|
||||||
|
# Parsing succeeded, return the original content
|
||||||
|
return content
|
||||||
|
except Exception as parse_error:
|
||||||
|
logger.error(f"CRITICAL: Failed to parse frontmatter from claude_original file {claude_file_path}")
|
||||||
|
logger.error(f"Parse error: {type(parse_error).__name__}: {str(parse_error)}")
|
||||||
|
logger.error("Aborting run due to invalid claude_original file format")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to read claude_original file {claude_file_path}: {e}")
|
logger.warning(f"Failed to read claude_original file {claude_file_path}: {e}")
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user