mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-29 17:05:18 +00:00
Packaging and repackaging test works
This commit is contained in:
parent
9f4a2d4177
commit
4c21e15d0e
@ -17,6 +17,8 @@ import tarfile
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Optional, Tuple
|
||||
from olmocr.prompts import PageResponse
|
||||
from olmocr.train.dataloader import FrontMatterParser
|
||||
|
||||
import pandas as pd
|
||||
import yaml
|
||||
@ -44,27 +46,7 @@ class DocumentRecord:
|
||||
pdf_relpath: Optional[str] = None
|
||||
|
||||
|
||||
def parse_front_matter(markdown_text: str) -> Tuple[Dict[str, object], str]:
    """
    Parse YAML front matter from a markdown string.

    Front matter is recognised only when the text opens with a line that is
    exactly ``---`` (trailing whitespace / ``\\r`` tolerated) and a later line
    that is exactly ``---`` closes it. Lines that merely *begin* with three
    dashes (``-----`` rules, ``---|---`` table rows) are not fences.

    Returns a tuple of (front_matter_dict, body_text):
      * no (or unterminated) front matter -> ``({}, markdown_text.strip())``
      * otherwise -> the parsed mapping and everything after the closing
        fence line, with internal spacing preserved.

    A front matter block that parses to something other than a mapping
    (a bare scalar or a list) is reported as ``{}`` so callers can rely on
    the declared return type.

    Raises whatever ``yaml.safe_load`` raises for malformed YAML.
    """
    opening_end = markdown_text.find("\n")
    opening_line = markdown_text if opening_end == -1 else markdown_text[:opening_end]
    # Without a newline there cannot be a closing fence; without an exact
    # "---" opening line this is ordinary markdown.
    if opening_end == -1 or opening_line.rstrip() != "---":
        return {}, markdown_text.strip()

    # Look for the first later line consisting solely of "---". The newline
    # that ends the opening line may itself start the closing fence (empty
    # front matter), so the search begins on it.
    search_from = opening_end
    while True:
        closing_idx = markdown_text.find("\n---", search_from)
        if closing_idx == -1:
            return {}, markdown_text.strip()
        fence_end = closing_idx + 4  # index just past "\n---"
        line_end = markdown_text.find("\n", fence_end)
        if line_end == -1:
            line_end = len(markdown_text)
        if not markdown_text[fence_end:line_end].strip():
            break
        # e.g. "\n----" or "\n---foo": not a fence, keep looking.
        search_from = fence_end

    fm_block = markdown_text[opening_end:closing_idx]
    # Preserve internal spacing; drop only the remainder of the fence line
    # and the single newline that terminates it.
    remainder = markdown_text[line_end + 1 :]

    # An empty block parses to None anyway, so skip the parser for it.
    front_matter = yaml.safe_load(fm_block) if fm_block.strip() else None
    if not isinstance(front_matter, dict):
        front_matter = {}
    return front_matter, remainder
|
||||
|
||||
|
||||
def infer_doc_id(md_path: Path, processed_root: Path) -> str:
|
||||
@ -108,24 +90,7 @@ def normalize_response_payload(front_matter: Dict[str, object], body_text: str)
|
||||
|
||||
|
||||
def guess_url(front_matter: Dict[str, object], doc_id: str, source_url_template: Optional[str]) -> Optional[str]:
    """
    Infer a URL for the document.

    Priority:
    1. The first non-blank string among the front matter fields
       url / source_url / pdf_url / uri (checked in that order); it is
       returned with surrounding whitespace removed.
    2. ``source_url_template`` formatted with these placeholders:
         {doc_id}   - the full document id
         {prefix}   - the first four characters of ``doc_id``
         {base_id}  - ``doc_id`` without those four characters
         {base_pdf} - ``base_id`` up to (not including) its last ``-``,
                      or all of ``base_id`` when it contains no ``-``

    Returns None when neither source yields a URL. ``str.format`` raises
    (e.g. ``KeyError``) if the template names any other placeholder.
    """
    for key in ("url", "source_url", "pdf_url", "uri"):
        value = front_matter.get(key)
        # Non-string and blank values are skipped so a later key can still match.
        if isinstance(value, str) and value.strip():
            return value.strip()

    if source_url_template:
        prefix = doc_id[:4]
        base_id = doc_id[4:]
        # rsplit returns the whole string as element 0 when there is no "-".
        base_pdf = base_id.rsplit("-", 1)[0]
        return source_url_template.format(doc_id=doc_id, prefix=prefix, base_id=base_id, base_pdf=base_pdf)

    # TODO, we will have to add some better support for this
    return None
|
||||
|
||||
|
||||
@ -165,12 +130,14 @@ def collect_documents(
|
||||
"natural_text",
|
||||
}
|
||||
|
||||
parser = FrontMatterParser(front_matter_class=PageResponse)
|
||||
|
||||
for md_path in tqdm(md_files, desc="Scanning markdown files"):
|
||||
try:
|
||||
doc_id = infer_doc_id(md_path, processed_dir)
|
||||
pdf_path = infer_pdf_path(md_path, doc_id, pdf_root)
|
||||
markdown_text = md_path.read_text(encoding="utf-8")
|
||||
front_matter, body_text = parse_front_matter(markdown_text)
|
||||
front_matter, body_text = parser._extract_front_matter_and_text(markdown_text)
|
||||
response_payload = normalize_response_payload(front_matter, body_text)
|
||||
pdf_size = pdf_path.stat().st_size
|
||||
page_number = parse_page_number(doc_id, front_matter)
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
primary_language: en
|
||||
is_rotation_valid: True
|
||||
rotation_correction: 0
|
||||
is_table: False
|
||||
is_table: True
|
||||
is_diagram: False
|
||||
---
|
||||
Edgar (or Eadgar;[1] c. 944 – 8 July 975) was King of the English from 959 until his death in 975. He became king of all England on his brother's death. He was the younger son of King Edmund I and his first wife Ælfgifu. A detailed account of Edgar's reign is not possible, because only a few events were recorded by chroniclers and monastic writers were more interested in recording the activities of the leaders of the church.
|
||||
Loading…
x
Reference in New Issue
Block a user