olmocr/olmocr/train/prepare_olmocrmix.py

import argparse
import json
from os import PathLike
from pathlib import Path
from typing import Optional
import pandas as pd

from huggingface_hub import snapshot_download


def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination: str | PathLike, max_examples: Optional[int] = None) -> str:
    """
    Prepare OLMoCR mix dataset by downloading from HuggingFace and organizing into a folder structure.

    Args:
        dataset_path: HuggingFace dataset path
        subset: Dataset subset name
        split: Dataset split (train/validation/test)
        destination: Destination directory path
        max_examples: Maximum number of examples to process (None for all)
    """
    # Step 1: Download dataset using hugging face hub snapshot_download to destination/hugging_face folder
    dest_path = Path(destination)
    hugging_face_dir = dest_path / "hugging_face"
    hugging_face_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading dataset {dataset_path} to {hugging_face_dir}...")

    # Download the entire repository including PDFs and parquet files
    local_dir = snapshot_download(
        repo_id=dataset_path,
        repo_type="dataset",
        local_dir=hugging_face_dir,
    )

    print(f"Downloaded to: {local_dir}")

    # Step 2: Create destination folder structure for processed markdown files
    processed_dir = dest_path / f"processed_{subset}_{split}"
    processed_dir.mkdir(exist_ok=True)

    # Manual map to parquet files for now
    assert dataset_path == "allenai/olmOCR-mix-0225", "Only supporting the olmocr-mix for now, later will support other training sets"
    if subset == "00_documents" and split == "train_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "train-s2pdf.parquet"]
    elif subset == "00_documents" and split == "eval_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "eval-s2pdf.parquet"]
    elif subset == "01_books" and split == "train_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "train-iabooks.parquet"]
    elif subset == "01_books" and split == "train_s2pdf":
        parquet_files = [dest_path / "hugging_face" / "eval-iabooks.parquet"]
    else:
        raise NotImplementedError()


    # Step 3: Process parquet files
    total_processed = 0
    total_errors = 0

    for parquet_file in parquet_files:
        print(f"Processing {parquet_file.name}...")
        df = pd.read_parquet(parquet_file)

        # Process each row
        for idx, row in df.iterrows():
            if max_examples and total_processed >= max_examples:
                break

            try:

                # Extract fields from the row
                # The rows in the parquet will look like url, page_number, response (json format), and id
                response = row.get('response', '')
                doc_id = str(idx)

                assert len(doc_id) > 4

                # Parse response if it's a JSON string
                response_data = json.loads(response)
                response = response_data

                # Create folder structure using first 4 digits of id
                # Make a folder structure, to prevent a huge amount of files in one folder, using the first 4 digits of the id, ex. id[:4]/id[4:].md
                folder_name = doc_id[:4]
                file_name = f"{doc_id[4:]}.md"

                # Create directory
                output_dir = processed_dir / folder_name
                output_dir.mkdir(exist_ok=True)

                # Write markdown file with front matter and natural text
                output_file = output_dir / file_name
                with open(output_file, 'w', encoding='utf-8') as f:
                    # Extract natural_text and other fields for front matter
                    natural_text = response.get('natural_text', '')
                    # Create front matter from other fields
                    front_matter = {k: v for k, v in response.items() if k != 'natural_text'}

                    # Write front matter
                    f.write("---\n")
                    for k, v in front_matter.items():
                        f.write(f"{k}: {v}\n")
                    f.write("---\n")

                    # Write natural text
                    f.write(natural_text)

                total_processed += 1
                if total_processed % 1000 == 0:
                    print(f"Processed {total_processed} examples...")
            except Exception as ex:
                print(f"Error processing line: {ex}")
                total_errors += 1

        if max_examples and total_processed >= max_examples:
            break

    print(f"Completed! Processed {total_processed} examples to {processed_dir}")
    print(f"Total errors: {total_errors}")
    return str(processed_dir)


def main():
    parser = argparse.ArgumentParser(description="Prepare OLMoCR mix dataset")
    parser.add_argument(
        "--dataset-path",
        type=str,
        default="allenai/olmOCR-mix-0225",
        help="HuggingFace dataset path (e.g., 'allenai/olmocr-mix')"
    )
    parser.add_argument(
        "--subset",
        type=str,
        default="00_documents",
        required=True,
        help="Dataset subset name"
    )
    parser.add_argument(
        "--split",
        type=str,
        default="eval_s2pdf",
        required=True,
        help="Dataset split ex eval_s2pdf"
    )
    parser.add_argument(
        "--destination",
        type=str,
        required=True,
        help="Destination directory path"
    )
    parser.add_argument(
        "--max-examples",
        type=int,
        default=None,
        help="Maximum number of examples to process (default: all)"
    )

    args = parser.parse_args()

    prepare_olmocr_mix(
        dataset_path=args.dataset_path,
        subset=args.subset,
        split=args.split,
        destination=args.destination,
        max_examples=args.max_examples
    )


if __name__ == "__main__":
    main()