mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-12 00:20:13 +00:00
Merge branch 'main' into jakep/olmocr_v03
This commit is contained in:
commit
3ae173bd72
@ -155,10 +155,14 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
|
|||||||
f.write("---\n")
|
f.write("---\n")
|
||||||
for k, v in front_matter.items():
|
for k, v in front_matter.items():
|
||||||
f.write(f"{k}: {v}\n")
|
f.write(f"{k}: {v}\n")
|
||||||
|
|
||||||
|
if natural_text is not None and len(natural_text.strip()) > 0:
|
||||||
f.write("---\n")
|
f.write("---\n")
|
||||||
|
|
||||||
# Write natural text
|
# Write natural text
|
||||||
f.write(natural_text)
|
f.write(natural_text)
|
||||||
|
else:
|
||||||
|
f.write("---")
|
||||||
|
|
||||||
# Look for matching PDF in extracted directory and create symlinks
|
# Look for matching PDF in extracted directory and create symlinks
|
||||||
extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"
|
extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user