2025-01-14 19:36:18 +00:00
|
|
|
#!/usr/bin/env python3
|
2025-01-29 15:25:10 -08:00
|
|
|
import argparse
|
2025-01-14 19:36:18 +00:00
|
|
|
import json
|
2025-01-29 15:25:10 -08:00
|
|
|
import os
|
2025-01-14 19:36:18 +00:00
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
2025-01-29 15:25:10 -08:00
|
|
|
import boto3
|
|
|
|
|
|
|
|
|
2025-01-14 19:36:18 +00:00
|
|
|
def parse_args(argv=None):
    """Parse the command-line options for the JSONL-to-Markdown export.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so existing
            callers that pass nothing behave exactly as before.

    Returns:
        An ``argparse.Namespace`` with two attributes:
        ``s3_prefix`` (the ``s3://bucket/prefix`` to scan) and
        ``output_dir`` (the local directory that receives the ``.md`` files).
    """
    parser = argparse.ArgumentParser(description="Read JSONL files from an S3 prefix, extract text, and write to local .md files.")
    parser.add_argument(
        "--s3-prefix",
        default="s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/",
        help="S3 prefix containing the JSONL files (default: s3://ai2-oe-data/jakep/pdfworkspaces/pdelfin_testset/results/)",
    )
    parser.add_argument("--output-dir", default="output_md", help="Local directory to store output .md files (default: output_md)")
    # Passing argv through lets tests and other tools drive the parser without
    # mutating sys.argv.
    return parser.parse_args(argv)
|
|
|
|
|
2025-01-29 15:30:39 -08:00
|
|
|
|
2025-01-14 19:36:18 +00:00
|
|
|
def _text_from_record(record):
    """Return the non-blank ``"text"`` string of *record*, or ``None`` if it should be skipped."""
    # A JSONL line may decode to something other than an object (list, number, ...).
    if not isinstance(record, dict):
        return None
    text = record.get("text")
    # "text" may be missing, JSON null, or a non-string; none of those can be written out.
    if not isinstance(text, str) or not text.strip():
        return None
    return text


def _output_name_for(record):
    """Return the ``<stem>_pdelf.md`` filename derived from the record's Source-File, or ``None``."""
    metadata = record.get("metadata")
    source_file = metadata.get("Source-File") if isinstance(metadata, dict) else None
    if not isinstance(source_file, str):
        return None

    # Example: 's3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/fcffd2dd327d4e58d3c6d1d22ba62531c863_page8.pdf'
    # becomes 'fcffd2dd327d4e58d3c6d1d22ba62531c863_page8_pdelf.md':
    #   1) keep only the final path component, 2) drop '.pdf', 3) append '_pdelf.md'.
    stem = os.path.basename(source_file)
    if stem.lower().endswith(".pdf"):
        stem = stem[:-4]
    if not stem:
        # Without a stem every such record would be lumped into a single '_pdelf.md'.
        return None
    return f"{stem}_pdelf.md"


def main():
    """Export the text of every JSONL record under an S3 prefix to local Markdown files.

    Reads the options from ``parse_args()``, lists every object under the
    given ``s3://bucket/prefix`` whose key ends in ``.jsonl``, and for each
    JSON line appends the record's ``"text"`` to
    ``<output_dir>/<source stem>_pdelf.md``, where the stem comes from the
    record's ``metadata["Source-File"]``.

    Lines that are blank, fail to decode, carry no usable text, or have no
    usable Source-File are skipped. Files are opened in append mode, so
    re-running into the same output directory adds to existing files.

    Raises:
        SystemExit: If ``--s3-prefix`` does not contain a bucket name.
    """
    args = parse_args()

    # Split the s3-prefix into bucket and key prefix,
    # e.g. netloc = 'ai2-oe-data', path = '/jakep/pdfworkspaces/pdelfin_testset/results/'
    parsed_s3 = urlparse(args.s3_prefix)
    bucket_name = parsed_s3.netloc
    prefix = parsed_s3.path.lstrip("/")
    if not bucket_name:
        # Fail early with a readable message instead of an opaque client-side validation error.
        raise SystemExit(f"Error: --s3-prefix must look like s3://bucket/prefix, got {args.s3_prefix!r}")

    # Ensure local output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    s3 = boto3.client("s3")

    # List all objects under the prefix, one page at a time
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages for an empty listing carry no "Contents" key.
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if not key.endswith(".jsonl"):
                continue

            print(f"Processing S3 object: s3://{bucket_name}/{key}")

            # The body is a streaming object, so it is consumed line by line
            # and explicitly closed to release the underlying connection.
            body = s3.get_object(Bucket=bucket_name, Key=key)["Body"]
            try:
                for line in body.iter_lines():
                    if not line.strip():
                        continue

                    try:
                        record = json.loads(line)
                    except ValueError:
                        # ValueError covers both json.JSONDecodeError and the
                        # UnicodeDecodeError raised for bytes that are not valid UTF-8.
                        print("Warning: Failed to decode JSON line.")
                        continue

                    text_content = _text_from_record(record)
                    if text_content is None:
                        # If there's no text, skip
                        continue

                    output_filename = _output_name_for(record)
                    if output_filename is None:
                        print(f"Warning: Record without a usable Source-File in s3://{bucket_name}/{key}; skipping.")
                        continue

                    output_path = os.path.join(args.output_dir, output_filename)

                    # Append the text to the corresponding file.
                    # If you want to overwrite instead, change mode to 'w'
                    with open(output_path, "a", encoding="utf-8") as f:
                        f.write(text_content + "\n")
            finally:
                body.close()

    print("Done processing all JSONL files.")
|
|
|
|
|
2025-01-29 15:30:39 -08:00
|
|
|
|
2025-01-14 19:36:18 +00:00
|
|
|
# Script entry point: run the export only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    raise SystemExit(main())
|