From 4c21e15d0e348e7c88d340f00980b186959ec1ec Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 9 Oct 2025 21:52:05 +0000
Subject: [PATCH] Packaging and repackaging test works

---
 olmocr/data/repackage_olmocrmix.py            |  45 +++---------------
 tests/sample_dataset/{ => 0000}/blanktext.md  |   0
 tests/sample_dataset/{ => 0000}/blanktext.pdf | Bin
 tests/sample_dataset/{ => 0000}/edgar.md      |   2 +-
 tests/sample_dataset/{ => 0000}/edgar.pdf     | Bin
 5 files changed, 7 insertions(+), 40 deletions(-)
 rename tests/sample_dataset/{ => 0000}/blanktext.md (100%)
 rename tests/sample_dataset/{ => 0000}/blanktext.pdf (100%)
 rename tests/sample_dataset/{ => 0000}/edgar.md (99%)
 rename tests/sample_dataset/{ => 0000}/edgar.pdf (100%)

diff --git a/olmocr/data/repackage_olmocrmix.py b/olmocr/data/repackage_olmocrmix.py
index 77d105c..86ebadb 100644
--- a/olmocr/data/repackage_olmocrmix.py
+++ b/olmocr/data/repackage_olmocrmix.py
@@ -17,6 +17,8 @@ import tarfile
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Tuple
+from olmocr.prompts import PageResponse
+from olmocr.train.dataloader import FrontMatterParser
 
 import pandas as pd
 import yaml
@@ -44,27 +46,7 @@ class DocumentRecord:
     pdf_relpath: Optional[str] = None
 
 
-def parse_front_matter(markdown_text: str) -> Tuple[Dict[str, object], str]:
-    """
-    Parse YAML front matter from a markdown string.
 
-    Returns a tuple of (front_matter_dict, body_text).
-    """
-    if not markdown_text.startswith("---"):
-        return {}, markdown_text.strip()
-
-    closing_idx = markdown_text.find("\n---", 3)
-    if closing_idx == -1:
-        return {}, markdown_text.strip()
-
-    fm_block = markdown_text[3:closing_idx]
-    remainder = markdown_text[closing_idx + 4 :]
-
-    front_matter = yaml.safe_load(fm_block) or {}
-    # Preserve internal spacing but trim a single leading newline if present.
-    if remainder.startswith("\n"):
-        remainder = remainder[1:]
-    return front_matter, remainder
 
 
 def infer_doc_id(md_path: Path, processed_root: Path) -> str:
@@ -108,24 +90,7 @@ def normalize_response_payload(front_matter: Dict[str, object], body_text: str)
 
 
 def guess_url(front_matter: Dict[str, object], doc_id: str, source_url_template: Optional[str]) -> Optional[str]:
-    """
-    Infer a URL for the document.
-
-    Priority:
-      1. Front matter fields named url/source_url/pdf_url
-      2. Provided template with placeholders {doc_id}, {prefix}, {base_id}
-    """
-    for key in ("url", "source_url", "pdf_url", "uri"):
-        value = front_matter.get(key)
-        if isinstance(value, str) and value:
-            return value
-
-    if source_url_template:
-        prefix = doc_id[:4]
-        base_id = doc_id[4:]
-        base_pdf = base_id.rsplit("-", 1)[0] if "-" in base_id else base_id
-        return source_url_template.format(doc_id=doc_id, prefix=prefix, base_id=base_id, base_pdf=base_pdf)
-
+    # TODO: URL inference is not implemented yet; all arguments are ignored and None is always returned until better support is added.
     return None
 
 
@@ -165,12 +130,14 @@ def collect_documents(
         "natural_text",
     }
 
+    parser = FrontMatterParser(front_matter_class=PageResponse)
+
     for md_path in tqdm(md_files, desc="Scanning markdown files"):
         try:
             doc_id = infer_doc_id(md_path, processed_dir)
             pdf_path = infer_pdf_path(md_path, doc_id, pdf_root)
             markdown_text = md_path.read_text(encoding="utf-8")
-            front_matter, body_text = parse_front_matter(markdown_text)
+            front_matter, body_text = parser._extract_front_matter_and_text(markdown_text)
             response_payload = normalize_response_payload(front_matter, body_text)
             pdf_size = pdf_path.stat().st_size
             page_number = parse_page_number(doc_id, front_matter)
diff --git a/tests/sample_dataset/blanktext.md b/tests/sample_dataset/0000/blanktext.md
similarity index 100%
rename from tests/sample_dataset/blanktext.md
rename to tests/sample_dataset/0000/blanktext.md
diff --git a/tests/sample_dataset/blanktext.pdf b/tests/sample_dataset/0000/blanktext.pdf
similarity index 100%
rename from tests/sample_dataset/blanktext.pdf
rename to tests/sample_dataset/0000/blanktext.pdf
diff --git a/tests/sample_dataset/edgar.md b/tests/sample_dataset/0000/edgar.md
similarity index 99%
rename from tests/sample_dataset/edgar.md
rename to tests/sample_dataset/0000/edgar.md
index 8dee232..d83c5ec 100644
--- a/tests/sample_dataset/edgar.md
+++ b/tests/sample_dataset/0000/edgar.md
@@ -2,7 +2,7 @@
 primary_language: en
 is_rotation_valid: True
 rotation_correction: 0
-is_table: False
+is_table: True
 is_diagram: False
 ---
 Edgar (or Eadgar;[1] c. 944 – 8 July 975) was King of the English from 959 until his death in 975. He became king of all England on his brother's death. He was the younger son of King Edmund I and his first wife Ælfgifu. A detailed account of Edgar's reign is not possible, because only a few events were recorded by chroniclers and monastic writers were more interested in recording the activities of the leaders of the church.
diff --git a/tests/sample_dataset/edgar.pdf b/tests/sample_dataset/0000/edgar.pdf
similarity index 100%
rename from tests/sample_dataset/edgar.pdf
rename to tests/sample_dataset/0000/edgar.pdf