#!/usr/bin/env python3
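"""Read Dolma-style JSONL result files from an S3 prefix, extract the text of
each record, and write it to a local Markdown (.md) file named after the
record's source PDF."""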

import argparse
import json
import os
from urllib.parse import urlparse

import boto3


def parse_args():
    parser = argparse.ArgumentParser(description="Read JSONL files from an S3 prefix, extract text, and write to local .md files.")
    parser.add_argument(
        "--s3-prefix",
        default="s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/",
        help="S3 prefix containing the JSONL files (default: s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/)",
    )
    parser.add_argument("--output-dir", default="output_md", help="Local directory to store output .md files (default: output_md)")
    return parser.parse_args()


def main():
    args = parse_args()

    # Parse the s3-prefix into bucket and prefix
    parsed_s3 = urlparse(args.s3_prefix)
    # e.g. netloc = 'ai2-oe-data', path = '/jakep/pdfworkspaces/pdelfin_testset/results/'
    bucket_name = parsed_s3.netloc
    # Remove leading '/' from parsed_s3.path
    prefix = parsed_s3.path.lstrip("/")

    # Ensure local output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    # Initialize S3 client
    s3 = boto3.client("s3")

    # List all objects under the prefix
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

    for page in pages:
        if "Contents" not in page:
            continue

        for obj in page["Contents"]:
            key = obj["Key"]
            # Skip non-jsonl files
            if not key.endswith(".jsonl"):
                continue

            print(f"Processing S3 object: s3://{bucket_name}/{key}")

            # Read the S3 object
            s3_object = s3.get_object(Bucket=bucket_name, Key=key)
            # s3_object['Body'] is a StreamingBody, so we can read it line-by-line
            body_stream = s3_object["Body"].iter_lines()

            for line in body_stream:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    print("Warning: Failed to decode JSON line.")
                    continue

                # Extract text
                text_content = record.get("text", "")
                if not text_content.strip():
                    # If there's no text, skip
                    continue

                # Derive the output filename based on the "Source-File" metadata
                metadata = record.get("metadata", {})
                source_file = metadata.get("Source-File", "")
                # Example: source_file = 's3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
                # We want to end up with: 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8_pdelf.md'
                # 1) Extract just the filename from the path
                # 2) Remove '.pdf'
                # 3) Append '_pdelf.md'
                source_filename = os.path.basename(source_file)  # e.g. 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
                if source_filename.lower().endswith(".pdf"):
                    source_filename = source_filename[:-4]  # remove .pdf
                output_filename = f"{source_filename}_pdelf.md"
                output_path = os.path.join(args.output_dir, output_filename)

                # Append the text to the corresponding file
                # If you want to overwrite instead, change mode to 'w'
                with open(output_path, "a", encoding="utf-8") as f:
                    f.write(text_content + "\n")

                # Optional: Print or log what you've written
                # print(f"Appended text to {output_path}")

    print("Done processing all JSONL files.")


if __name__ == "__main__":
    main()
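
# Example invocation (these values are just the argparse defaults above; any
# S3 prefix containing Dolma-format .jsonl files should work):
#
#   python movedolmadocs_to_md.py \
#       --s3-prefix s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/ \
#       --output-dir output_md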