# olmocr/scripts/movedolmadocs_to_md.py

#!/usr/bin/env python3
import argparse
import json
import os
from urllib.parse import urlparse

import boto3


def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace: ``s3_prefix`` is the S3 location holding the JSONL
        files and ``output_dir`` is the local directory for the ``.md`` files.
        Both fall back to their defaults when the flag is not given.
    """
    cli = argparse.ArgumentParser(description="Read JSONL files from an S3 prefix, extract text, and write to local .md files.")

    # Each entry is (flag, keyword arguments for add_argument).
    option_table = (
        (
            "--s3-prefix",
            {
                "default": "s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/",
                "help": "S3 prefix containing the JSONL files (default: s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/)",
            },
        ),
        (
            "--output-dir",
            {
                "default": "output_md",
                "help": "Local directory to store output .md files (default: output_md)",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def _md_filename_for(source_file):
    """Return the markdown filename for a record's ``Source-File`` value.

    Only the last path component is used, a trailing ``.pdf`` (any case) is
    removed and ``_pdelf.md`` is appended, e.g.
    ``s3://bucket/dir/abc_page8.pdf`` -> ``abc_page8_pdelf.md``.

    Returns ``None`` when *source_file* is not a string or yields no filename,
    so callers can skip the record instead of writing to a bare ``_pdelf.md``.
    """
    if not isinstance(source_file, str):
        return None

    stem = os.path.basename(source_file)
    if stem.lower().endswith(".pdf"):
        stem = stem[: -len(".pdf")]

    if not stem:
        return None
    return f"{stem}_pdelf.md"


def _export_jsonl_lines(lines, origin, output_dir):
    """Append the ``text`` of every usable JSONL record to its markdown file.

    Args:
        lines: Iterable of raw JSONL lines (``bytes`` or ``str``).
        origin: Name of where the lines came from; only used in warnings.
        output_dir: Existing local directory that receives the ``.md`` files.

    Blank lines and records without non-blank string ``text`` are skipped
    silently. Undecodable lines, non-object records and records without a
    usable ``metadata["Source-File"]`` are skipped with a warning.
    """
    for line_no, line in enumerate(lines, start=1):
        if not line.strip():
            continue

        try:
            record = json.loads(line)
        except ValueError:
            # ValueError covers json.JSONDecodeError and the UnicodeDecodeError
            # raised for byte lines that are not valid UTF-8.
            print(f"Warning: Failed to decode JSON line {line_no} of {origin}.")
            continue

        if not isinstance(record, dict):
            print(f"Warning: Skipping non-object JSON record on line {line_no} of {origin}.")
            continue

        # "text" may be absent, null or a non-string; none of those are exportable.
        text_content = record.get("text")
        if not isinstance(text_content, str) or not text_content.strip():
            continue

        metadata = record.get("metadata")
        source_file = metadata.get("Source-File") if isinstance(metadata, dict) else None
        output_filename = _md_filename_for(source_file)
        if output_filename is None:
            # Without a source name, unrelated records would all be merged
            # into one shared "_pdelf.md" file.
            print(f"Warning: Skipping record without a usable Source-File on line {line_no} of {origin}.")
            continue

        # Append so that several records for the same source end up in one file.
        # If you want to overwrite instead, change mode to 'w'.
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, "a", encoding="utf-8") as f:
            f.write(text_content + "\n")


def main():
    """Export the text of every JSONL record under an S3 prefix to local files.

    Reads ``--s3-prefix`` and ``--output-dir`` via ``parse_args()``, lists every
    object under the prefix, and for each key ending in ``.jsonl`` appends the
    ``text`` of each record to ``<output-dir>/<source name>_pdelf.md``, where
    the source name comes from the record's ``metadata["Source-File"]``.

    Raises:
        ValueError: If ``--s3-prefix`` does not contain a bucket name.
    """
    args = parse_args()

    # e.g. netloc = 'ai2-oe-data', path = '/jakep/pdfworkspaces/pdelfin_testset/results/'
    parsed_s3 = urlparse(args.s3_prefix)
    bucket_name = parsed_s3.netloc
    prefix = parsed_s3.path.lstrip("/")  # S3 keys have no leading '/'
    if not bucket_name:
        raise ValueError(f"--s3-prefix must look like s3://bucket/prefix, got {args.s3_prefix!r}")

    os.makedirs(args.output_dir, exist_ok=True)

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without matching objects carry no "Contents" key.
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".jsonl"):
                continue

            origin = f"s3://{bucket_name}/{key}"
            print(f"Processing S3 object: {origin}")

            # The body is a stream, so it is consumed line by line and closed
            # explicitly even when processing fails part-way.
            body = s3.get_object(Bucket=bucket_name, Key=key)["Body"]
            try:
                _export_jsonl_lines(body.iter_lines(), origin, args.output_dir)
            finally:
                body.close()

    print("Done processing all JSONL files.")


# Run the export only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()