mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-03 20:34:09 +00:00
36 lines
1.0 KiB
Python
36 lines
1.0 KiB
Python
import asyncio
|
|
import glob
|
|
import json
|
|
import os
|
|
import shutil
|
|
import sys
|
|
|
|
import olmocr.pipeline
|
|
|
|
# Runs the olmocr pipeline over the bundled sample PDFs, then unpacks every
# JSONL record the pipeline produced into its own Markdown file.

# Set sys.argv as if you were running the script from the command line.
workspace_dir = "olmocr/bench/sample_data/olmocr/workspace"
output_dir = "olmocr/bench/sample_data/olmocr"  # Destination of the unpacked .md files

sys.argv = [
    "pipeline.py",  # The script name (can be arbitrary)
    workspace_dir,  # Positional argument: workspace (the same directory that is read and removed below)
    "--pdfs",
    *glob.glob("olmocr/bench/sample_data/pdfs/*.pdf"),  # PDF paths (glob.glob already returns a list)
]

# Call the async main() function.
asyncio.run(olmocr.pipeline.main())

# Now take the produced JSONL files and unpack them into Markdown files:
# one output file per JSON record, named after the record's source file.
for jsonl_path in glob.glob(workspace_dir + "/results/*.jsonl"):
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # default encoding.
    with open(jsonl_path, "r", encoding="utf-8") as jsonl_f:
        for line in jsonl_f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue

            data = json.loads(line)

            name = os.path.basename(data["metadata"]["Source-File"])
            # Swap only the final extension; a plain str.replace would also
            # rewrite ".pdf" occurring in the middle of the file name.
            stem = os.path.splitext(name)[0]

            with open(f"{output_dir}/{stem}.md", "w", encoding="utf-8") as out_f:
                out_f.write(data["text"])

# The workspace only holds intermediate results; remove it once unpacked.
shutil.rmtree(workspace_dir)
|