import argparse
import json
from os import PathLike
from pathlib import Path
from typing import Optional

import pandas as pd

from huggingface_hub import snapshot_download


def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination: str | PathLike, max_examples: Optional[int] = None) -> str:
    """
    Prepare the OLMoCR mix dataset by downloading it from HuggingFace and organizing it into a folder structure.

    Args:
        dataset_path: HuggingFace dataset path (e.g. "allenai/olmOCR-mix-0225")
        subset: Dataset subset name
        split: Dataset split (e.g. "train_s2pdf" or "eval_s2pdf")
        destination: Destination directory path
        max_examples: Maximum number of examples to process (None for all)

    Returns:
        Path to the directory of processed markdown files, as a string.
    """
    # Step 1: Download the dataset with huggingface_hub's snapshot_download into destination/hugging_face
    dest_path = Path(destination)
    hugging_face_dir = dest_path / "hugging_face"
    hugging_face_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading dataset {dataset_path} to {hugging_face_dir}...")

    # Download the entire repository, including PDFs and parquet files
    local_dir = snapshot_download(
        repo_id=dataset_path,
        repo_type="dataset",
        local_dir=hugging_face_dir,
    )

    print(f"Downloaded to: {local_dir}")

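    # Note: snapshot_download also accepts an allow_patterns filter (e.g.
    # allow_patterns=["*.parquet"]) if only the parquet files were needed; the full
    # snapshot is pulled here because the PDFs are part of the mix as well.
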
    # Step 2: Create the destination folder structure for the processed markdown files
    processed_dir = dest_path / f"processed_{subset}_{split}"
    processed_dir.mkdir(exist_ok=True)

    # Manual map from subset/split to parquet files for now
    assert dataset_path == "allenai/olmOCR-mix-0225", "Only allenai/olmOCR-mix-0225 is supported for now; other training sets will be added later"
    if subset == "00_documents" and split == "train_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "train-s2pdf.parquet"]
    elif subset == "00_documents" and split == "eval_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "eval-s2pdf.parquet"]
    elif subset == "01_books" and split == "train_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "train-iabooks.parquet"]
    elif subset == "01_books" and split == "eval_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "eval-iabooks.parquet"]
    else:
        raise NotImplementedError(f"Unsupported subset/split combination: {subset}/{split}")

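    # This assumes the snapshot places train-s2pdf.parquet, eval-s2pdf.parquet,
    # train-iabooks.parquet, and eval-iabooks.parquet at the top level of the
    # downloaded repository; adjust the mapping above if the layout changes.
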
    # Step 3: Process the parquet files
    total_processed = 0
    total_errors = 0

    for parquet_file in parquet_files:
        print(f"Processing {parquet_file.name}...")
        df = pd.read_parquet(parquet_file)

        # Process each row
        for idx, row in df.iterrows():
            if max_examples and total_processed >= max_examples:
                break

            try:
                # Extract fields from the row
                # Each row in the parquet looks like: url, page_number, response (JSON string), and id
                response = row.get('response', '')
                doc_id = str(row.get('id', idx))

                # The bucketed folder layout below needs ids longer than 4 characters
                assert len(doc_id) > 4

                # Parse the response, which is stored as a JSON string
                response = json.loads(response)

                # Create a folder structure using the first 4 characters of the id, to
                # avoid a huge number of files in one folder, e.g. id[:4]/id[4:].md
                folder_name = doc_id[:4]
                file_name = f"{doc_id[4:]}.md"

                # Create the bucket directory
                output_dir = processed_dir / folder_name
                output_dir.mkdir(exist_ok=True)

                # Write the markdown file with front matter and natural text
                output_file = output_dir / file_name
                with open(output_file, 'w', encoding='utf-8') as f:
                    # Extract natural_text (it may be null in the JSON, so fall back to '')
                    natural_text = response.get('natural_text') or ''
                    # All remaining fields become front matter
                    front_matter = {k: v for k, v in response.items() if k != 'natural_text'}

                    # Write the front matter
                    f.write("---\n")
                    for k, v in front_matter.items():
                        f.write(f"{k}: {v}\n")
                    f.write("---\n")

                    # Write the natural text
                    f.write(natural_text)

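                # The resulting file looks roughly like the sketch below; the front
                # matter keys come from whatever fields the response JSON carries
                # besides natural_text (the key names here are illustrative):
                #
                #   ---
                #   primary_language: en
                #   is_table: False
                #   ---
                #   <natural_text content>
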
                total_processed += 1
                if total_processed % 1000 == 0:
                    print(f"Processed {total_processed} examples...")
            except Exception as ex:
                print(f"Error processing row {idx}: {ex}")
                total_errors += 1

        if max_examples and total_processed >= max_examples:
            break

    print(f"Completed! Processed {total_processed} examples to {processed_dir}")
    print(f"Total errors: {total_errors}")

    return str(processed_dir)


def main():
    parser = argparse.ArgumentParser(description="Prepare OLMoCR mix dataset")
    parser.add_argument(
        "--dataset-path",
        type=str,
        default="allenai/olmOCR-mix-0225",
        help="HuggingFace dataset path (e.g. 'allenai/olmOCR-mix-0225')"
    )
    parser.add_argument(
        "--subset",
        type=str,
        default="00_documents",
        help="Dataset subset name (e.g. '00_documents')"
    )
    parser.add_argument(
        "--split",
        type=str,
        default="eval_s2pdf",
        help="Dataset split (e.g. 'eval_s2pdf')"
    )
    parser.add_argument(
        "--destination",
        type=str,
        required=True,
        help="Destination directory path"
    )
    parser.add_argument(
        "--max-examples",
        type=int,
        default=None,
        help="Maximum number of examples to process (default: all)"
    )

    args = parser.parse_args()

    prepare_olmocr_mix(
        dataset_path=args.dataset_path,
        subset=args.subset,
        split=args.split,
        destination=args.destination,
        max_examples=args.max_examples
    )


if __name__ == "__main__":
    main()
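
# A typical invocation might look like the sketch below; the script filename and
# destination path are illustrative:
#
#   python prepare_olmocrmix.py \
#       --subset 00_documents \
#       --split eval_s2pdf \
#       --destination /data/olmocr-mix \
#       --max-examples 100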