Verifying bench loading

2025-12-19 19:24:24 +00:00 · 2025-08-27 19:00:29 +00:00 · 2025-08-27 19:00:29 +00:00 · 09036b07d9
commit 09036b07d9
parent 14f19e5d58
1 changed file with 16 additions and 1 deletion
--- a/olmocr/train/grpo_train.py
+++ b/olmocr/train/grpo_train.py
@@ -14,6 +14,7 @@ import glob
 from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor
 from rapidfuzz import distance
+import sys

 import torch
 import numpy as np
@@ -108,7 +109,21 @@ class OlmOCRBenchDataset(Dataset):
        if os.path.exists(claude_file_path):
            try:
                with open(claude_file_path, 'r', encoding='utf-8') as f:
-                    return f.read()
+                    content = f.read()
+                    
+                # Parse the frontmatter to validate the content
+                parser = FrontMatterParser(front_matter_class=PageResponse)
+                try:
+                    front_matter, text = parser._extract_front_matter_and_text(content)
+                    page_response = parser._parse_front_matter(front_matter, text)
+                    # Parsing succeeded, return the original content
+                    return content
+                except Exception as parse_error:
+                    logger.error(f"CRITICAL: Failed to parse frontmatter from claude_original file {claude_file_path}")
+                    logger.error(f"Parse error: {type(parse_error).__name__}: {str(parse_error)}")
+                    logger.error("Aborting run due to invalid claude_original file format")
+                    sys.exit(1)
+                    
            except Exception as e:
                logger.warning(f"Failed to read claude_original file {claude_file_path}: {e}")
        else: