mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-01 10:33:57 +00:00
Merge branch 'main' into jakep/olmocr_v03
This commit is contained in:
commit
3ae173bd72
@ -155,11 +155,15 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
|
||||
f.write("---\n")
|
||||
for k, v in front_matter.items():
|
||||
f.write(f"{k}: {v}\n")
|
||||
f.write("---\n")
|
||||
|
||||
# Write natural text
|
||||
f.write(natural_text)
|
||||
if natural_text is not None and len(natural_text.strip()) > 0:
|
||||
f.write("---\n")
|
||||
|
||||
# Write natural text
|
||||
f.write(natural_text)
|
||||
else:
|
||||
f.write("---")
|
||||
|
||||
# Look for matching PDF in extracted directory and create symlinks
|
||||
extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user