Merge branch 'main' into jakep/olmocr_v03

This commit is contained in:
Jake Poznanski 2025-08-04 22:28:29 +00:00
commit 3ae173bd72

View File

@@ -155,11 +155,15 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
f.write("---\n")
for k, v in front_matter.items():
f.write(f"{k}: {v}\n")
f.write("---\n")
# Write natural text
f.write(natural_text)
if natural_text is not None and len(natural_text.strip()) > 0:
f.write("---\n")
# Write natural text
f.write(natural_text)
else:
f.write("---")
# Look for matching PDF in extracted directory and create symlinks
extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"