From be1f845da44b7f671781df3e06813053fc92460d Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 4 Aug 2025 21:50:54 +0000 Subject: [PATCH] Fixing issue with blank documents --- olmocr/train/prepare_olmocrmix.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/olmocr/train/prepare_olmocrmix.py b/olmocr/train/prepare_olmocrmix.py index ea5b6ca..1092c51 100644 --- a/olmocr/train/prepare_olmocrmix.py +++ b/olmocr/train/prepare_olmocrmix.py @@ -155,11 +155,15 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination: f.write("---\n") for k, v in front_matter.items(): f.write(f"{k}: {v}\n") - f.write("---\n") - # Write natural text - f.write(natural_text) + if natural_text is not None and len(natural_text.strip()) > 0: + f.write("---\n") + # Write natural text + f.write(natural_text) + else: + f.write("---") + # Look for matching PDF in extracted directory and create symlinks extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"