From 4c21e15d0e348e7c88d340f00980b186959ec1ec Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 9 Oct 2025 21:52:05 +0000 Subject: [PATCH] Packaging and repackaging test works --- olmocr/data/repackage_olmocrmix.py | 45 +++--------------- tests/sample_dataset/{ => 0000}/blanktext.md | 0 tests/sample_dataset/{ => 0000}/blanktext.pdf | Bin tests/sample_dataset/{ => 0000}/edgar.md | 2 +- tests/sample_dataset/{ => 0000}/edgar.pdf | Bin 5 files changed, 7 insertions(+), 40 deletions(-) rename tests/sample_dataset/{ => 0000}/blanktext.md (100%) rename tests/sample_dataset/{ => 0000}/blanktext.pdf (100%) rename tests/sample_dataset/{ => 0000}/edgar.md (99%) rename tests/sample_dataset/{ => 0000}/edgar.pdf (100%) diff --git a/olmocr/data/repackage_olmocrmix.py b/olmocr/data/repackage_olmocrmix.py index 77d105c..86ebadb 100644 --- a/olmocr/data/repackage_olmocrmix.py +++ b/olmocr/data/repackage_olmocrmix.py @@ -17,6 +17,8 @@ import tarfile from dataclasses import dataclass from pathlib import Path from typing import Dict, Iterator, List, Optional, Tuple +from olmocr.prompts import PageResponse +from olmocr.train.dataloader import FrontMatterParser import pandas as pd import yaml @@ -44,27 +46,7 @@ class DocumentRecord: pdf_relpath: Optional[str] = None -def parse_front_matter(markdown_text: str) -> Tuple[Dict[str, object], str]: - """ - Parse YAML front matter from a markdown string. - Returns a tuple of (front_matter_dict, body_text). - """ - if not markdown_text.startswith("---"): - return {}, markdown_text.strip() - - closing_idx = markdown_text.find("\n---", 3) - if closing_idx == -1: - return {}, markdown_text.strip() - - fm_block = markdown_text[3:closing_idx] - remainder = markdown_text[closing_idx + 4 :] - - front_matter = yaml.safe_load(fm_block) or {} - # Preserve internal spacing but trim a single leading newline if present. - if remainder.startswith("\n"): - remainder = remainder[1:] - return front_matter, remainder def infer_doc_id(md_path: Path, processed_root: Path) -> str: @@ -108,24 +90,7 @@ def normalize_response_payload(front_matter: Dict[str, object], body_text: str) def guess_url(front_matter: Dict[str, object], doc_id: str, source_url_template: Optional[str]) -> Optional[str]: - """ - Infer a URL for the document. - - Priority: - 1. Front matter fields named url/source_url/pdf_url - 2. Provided template with placeholders {doc_id}, {prefix}, {base_id} - """ - for key in ("url", "source_url", "pdf_url", "uri"): - value = front_matter.get(key) - if isinstance(value, str) and value: - return value - - if source_url_template: - prefix = doc_id[:4] - base_id = doc_id[4:] - base_pdf = base_id.rsplit("-", 1)[0] if "-" in base_id else base_id - return source_url_template.format(doc_id=doc_id, prefix=prefix, base_id=base_id, base_pdf=base_pdf) - + # TODO, we will have to add some better support for this return None @@ -165,12 +130,14 @@ def collect_documents( "natural_text", } + parser = FrontMatterParser(front_matter_class=PageResponse) + for md_path in tqdm(md_files, desc="Scanning markdown files"): try: doc_id = infer_doc_id(md_path, processed_dir) pdf_path = infer_pdf_path(md_path, doc_id, pdf_root) markdown_text = md_path.read_text(encoding="utf-8") - front_matter, body_text = parse_front_matter(markdown_text) + front_matter, body_text = parser._extract_front_matter_and_text(markdown_text) response_payload = normalize_response_payload(front_matter, body_text) pdf_size = pdf_path.stat().st_size page_number = parse_page_number(doc_id, front_matter) diff --git a/tests/sample_dataset/blanktext.md b/tests/sample_dataset/0000/blanktext.md similarity index 100% rename from tests/sample_dataset/blanktext.md rename to tests/sample_dataset/0000/blanktext.md diff --git a/tests/sample_dataset/blanktext.pdf b/tests/sample_dataset/0000/blanktext.pdf similarity index 100% rename from tests/sample_dataset/blanktext.pdf rename to tests/sample_dataset/0000/blanktext.pdf diff --git a/tests/sample_dataset/edgar.md b/tests/sample_dataset/0000/edgar.md similarity index 99% rename from tests/sample_dataset/edgar.md rename to tests/sample_dataset/0000/edgar.md index 8dee232..d83c5ec 100644 --- a/tests/sample_dataset/edgar.md +++ b/tests/sample_dataset/0000/edgar.md @@ -2,7 +2,7 @@ primary_language: en is_rotation_valid: True rotation_correction: 0 -is_table: False +is_table: True is_diagram: False --- Edgar (or Eadgar;[1] c. 944 – 8 July 975) was King of the English from 959 until his death in 975. He became king of all England on his brother's death. He was the younger son of King Edmund I and his first wife Ælfgifu. A detailed account of Edgar's reign is not possible, because only a few events were recorded by chroniclers and monastic writers were more interested in recording the activities of the leaders of the church. diff --git a/tests/sample_dataset/edgar.pdf b/tests/sample_dataset/0000/edgar.pdf similarity index 100% rename from tests/sample_dataset/edgar.pdf rename to tests/sample_dataset/0000/edgar.pdf