Ad-hoc code to evaluate GPT-4o in the ELO rankings for the technical report

This commit is contained in:
Jake Poznanski 2025-02-10 22:00:07 +00:00
parent c74d47a553
commit 7bfc285f12
6 changed files with 163 additions and 126 deletions

2
.gitignore vendored
View File

@ -11,6 +11,8 @@ sample200_vllm/*
sample200_sglang/*
pdelfin_testset/*
localworkspace/*
gpt4otestset/*
gpt4otestset_output/*
/*.html
scoreelo.csv
debug.log

View File

@ -0,0 +1,75 @@
import os
import json
import tempfile
import boto3
import concurrent.futures
from tqdm import tqdm
from olmocr.s3_utils import get_s3_bytes, expand_s3_glob
from olmocr.data.buildsilver import build_page_query
# Initialize the boto3 S3 client (shared by all worker threads below;
# boto3 clients are generally safe to share across threads).
s3 = boto3.client("s3")
# Expand the S3 glob to get a list of all PDF S3 URIs in the test set.
# NOTE: this runs at import time and performs network I/O.
all_pdfs = expand_s3_glob(s3, "s3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/*.pdf")
print(f"Found {len(all_pdfs)} PDFs.")
def process_pdf(pdf_s3_uri):
    """
    Download a PDF from S3, build a GPT-4o page query for its first page,
    and return the extracted natural text.

    Args:
        pdf_s3_uri: Full s3:// URI of the PDF to process.

    Returns:
        The "natural_text" string parsed from the model response, or None
        on any failure (errors are printed, not raised, so one bad PDF
        does not abort the whole batch).
    """
    local_pdf_path = None
    try:
        # Download the PDF as bytes from S3.
        pdf_bytes = get_s3_bytes(s3, pdf_s3_uri)
        # build_page_query needs a real file path, so stage the bytes in a
        # temporary file (delete=False so the path survives past the `with`;
        # cleanup happens in the `finally` block below).
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file.write(pdf_bytes)
            tmp_file.flush()
            local_pdf_path = tmp_file.name
        # Query page 1 of the PDF and parse the JSON body of the response.
        # (Removed a leftover debug print that dumped the full raw model
        # response for every PDF.)
        example = build_page_query(local_pdf_path, pdf_s3_uri, 1)
        data = json.loads(example.choices[0].message.content)
        return data["natural_text"]
    except Exception as e:
        # Best-effort: report and skip this PDF rather than failing the batch.
        print(f"Error processing {pdf_s3_uri}: {e}")
        return None
    finally:
        # Remove the temporary file if it exists.
        if local_pdf_path and os.path.exists(local_pdf_path):
            os.remove(local_pdf_path)
# Use ThreadPoolExecutor to process PDFs concurrently (the work is
# I/O-bound: S3 downloads plus OpenAI API calls).
max_workers = 4  # Adjust the number of threads as needed.

# Create the output directory once, up front — previously this was
# redone inside the results loop for every completed PDF.
output_dir = "gpt4otestset_output"
os.makedirs(output_dir, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all the PDF processing tasks.
    future_to_pdf = {executor.submit(process_pdf, pdf_uri): pdf_uri for pdf_uri in all_pdfs}
    # Use tqdm to display a progress bar as tasks complete.
    for future in tqdm(concurrent.futures.as_completed(future_to_pdf),
                       total=len(future_to_pdf), desc="Processing PDFs"):
        pdf_s3_uri = future_to_pdf[future]
        result = future.result()
        if result is None:
            # process_pdf already printed the error; skip this PDF.
            continue
        # Map "document.pdf" -> "document_gpt4o.md" in the output directory.
        pdf_basename = os.path.basename(pdf_s3_uri)
        output_filename = pdf_basename.replace(".pdf", "_gpt4o.md")
        output_filepath = os.path.join(output_dir, output_filename)
        with open(output_filepath, "w", encoding="utf-8") as outfile:
            outfile.write(result)
        print(f"Wrote result to {output_filepath}")

print("Processing complete.")

View File

@ -30,27 +30,28 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
# from openai import OpenAI
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# response = client.chat.completions.create(
# model="gpt-4o-2024-08-06",
# messages= [
# {
# "role": "user",
# "content": [
# {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
# {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
# ],
# }
# ],
# temperature=0.1,
# max_tokens=3000,
# logprobs=True,
# top_logprobs=5,
# response_format=openai_response_format_schema()
# )
# print(response)
response = client.chat.completions.create(
model="gpt-4o-2024-08-06",
messages= [
{
"role": "user",
"content": [
{"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
],
}
],
temperature=0.1,
max_tokens=3000,
logprobs=True,
top_logprobs=5,
response_format=openai_response_format_schema()
)
return response
# Construct OpenAI Batch API request format#
# There are a few tricks to know when doing data processing with OpenAI's apis

View File

@ -34,7 +34,7 @@ class Comparison:
return re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_b_path).group(1)
def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"):
def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy", force_comparison=None):
"""Process a single PDF and return its comparisons."""
# Create resources inside the worker process
s3_client = boto3.client("s3")
@ -42,21 +42,24 @@ def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"):
aligner = HirschbergAligner(match_score=1, mismatch_score=-1, indel_score=-1)
comparer = DocumentEditSimilarity(segmenter=segmenter, aligner=aligner)
pdf_comps = []
result_comps = []
# Get all comparison files for this PDF
# Original behavior: collect all comparison files for this PDF.
pdf_comps = []
for comp in comparisons:
comp_path = pdf_path.replace(".pdf", f"_{comp}.md")
if comp_path in all_mds:
pdf_comps.append(comp_path)
# Generate all possible combinations
# Generate all possible combinations (randomizing order occasionally)
for compa, compb in combinations(pdf_comps, 2):
if random.choice([True, False]):
compa, compb = compb, compa
# Get the text content
if force_comparison:
if not compa.endswith(f"_{force_comparison}.md") and not compb.endswith(f"_{force_comparison}.md"):
continue
text_a = get_s3_bytes(s3_client, compa).decode("utf-8")
text_b = get_s3_bytes(s3_client, compb).decode("utf-8")
@ -96,7 +99,9 @@ def build_review_page(args, comparisons, index=0):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generates comparison voting pages between different pairs of parses for a PDF.")
parser = argparse.ArgumentParser(
description="Generates comparison voting pages between different pairs of parses for a PDF."
)
parser.add_argument("--name", default="review_page", help="What name to give to this evaluation/comparison")
parser.add_argument(
"--review_size",
@ -110,7 +115,11 @@ if __name__ == "__main__":
default=None,
help="Maximum number of worker processes to use for parallel processing",
)
parser.add_argument("--comparisons", default=["pdelf", "marker", "gotocr_format", "mineru"], help="Different variants to compare against")
parser.add_argument(
"--comparisons",
default=["pdelf", "marker", "gotocr_format", "mineru", "gpt4o"],
help="Different variants to compare against",
)
parser.add_argument(
"--num_copies",
default=1,
@ -118,7 +127,15 @@ if __name__ == "__main__":
help="Number of reports to generate, labeled _0, _1, etc. if greater than 1",
)
parser.add_argument(
"s3_path", type=str, help="Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf"
"--force_comparison",
type=str,
default=None,
help="Force one method to be included in every comparison (e.g., 'mineru')."
)
parser.add_argument(
"s3_path",
type=str,
help="Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf",
)
args = parser.parse_args()
@ -132,8 +149,13 @@ if __name__ == "__main__":
all_comps = []
# Create a partial function with all the common arguments
process_pdf = functools.partial(process_single_pdf, all_mds=all_mds, comparisons=args.comparisons)
# Create a partial function with the common arguments, including the forced method (if any)
process_pdf = functools.partial(
process_single_pdf,
all_mds=all_mds,
comparisons=args.comparisons,
force_comparison=args.force_comparison,
)
# Use ProcessPoolExecutor for parallel processing
with ProcessPoolExecutor(max_workers=args.max_workers) as executor:

View File

@ -286,26 +286,39 @@ def make_report(urls):
if __name__ == "__main__":
# Example usage
urls = [
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-ff70abb8f517.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NarEyyCfvusCh%2FHdB47VfHOnnBs%3D&Expires=1738359221",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-0800f9af46cf.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ncTWAu5rSndBJJsU26HRYDaK6i8%3D&Expires=1738359222",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_10-f7081f6ca6f9.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=gYX8yjGyYshRqXGgdsX17%2Fdi9Ig%3D&Expires=1738359223",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_11-355dc69335bc.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=7%2Bc5qoa8Tbk06z0VcvJiIIVAz9M%3D&Expires=1738359224",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_12-95fce9bf0c18.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=fw4PBo0LnxikmLZ8xH%2BGD%2F%2BhXMU%3D&Expires=1738359225",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_13-f88f7d7482bf.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=yXkQp9oFDtroKgiO50EwpYdGLcA%3D&Expires=1738359226",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_14-8ac0b974bfd5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=EgZTpj1%2FdzMBUgd%2BX4pVZ1Sp%2FrA%3D&Expires=1738359226",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_15-e3136188de5c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=YKhAv4unNIlRcerQAaHN4kjc4qI%3D&Expires=1738359227",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_16-2c5abde50d49.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=Mj8%2BK5ISKzAYQFeYvmzTgCPcRwA%3D&Expires=1738359228",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_17-f13132a4cdcc.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=%2FHuzw2cjJ4oFm91UXojPnGzYi8Q%3D&Expires=1738359229",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_18-25070f2aa05e.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ctd%2BUIM%2FxryJm%2FcwA%2BRZ%2FbRzBp8%3D&Expires=1738359230",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_19-d436ee434162.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jVdFKobIoHlbTQ7zziG%2BXiIQ0Fo%3D&Expires=1738359230",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-a5ece743fd31.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=K8hIrjWtvo4SLVQrOB8TiXLgNJk%3D&Expires=1738359231",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-9ce03af05f51.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=T0fLGSH%2Bv%2F19veqbxnLxoSf7gVA%3D&Expires=1738359232",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-94eec18f8027.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=u2R1LundKpfnAUCcD%2BdGHA6uIR0%3D&Expires=1738359233",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-377d0a7d8f5a.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=5R38ZQAR9ew5x%2BRmMVQbTqbfVh0%3D&Expires=1738359234",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-537b22646a26.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=PLOELum1qzOXW8Cm5rfZphlFeMw%3D&Expires=1738359235",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-a4a7dcb08f20.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DxPHukGXEpPrEPL6TF9QBKPE1Xg%3D&Expires=1738359236",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-48a71c829863.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=TjEINKj69HdmXsKY59k4f3PieeM%3D&Expires=1738359237",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-8557438928c3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=F7sQxw5A%2FDOcOaa%2FQSeqepH0PQc%3D&Expires=1738359238",
# With GPT 4o comparisons
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-7c659f0a21b8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=hhwK5u98Rr3%2B%2BhZPnfERG5J6lCQ%3D&Expires=1739810142",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-ec88d9678783.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=e5Hz2mPqvWF6UOIAnM2TIsIzlqE%3D&Expires=1739810143",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-0a136799c857.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=AnVvCG6vzI8DVFkjWH1gu5WK5Vs%3D&Expires=1739810144",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-0fcd55d81dca.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=I4aW6Eg%2BhaI9NGsWX1%2BCL%2BRi5I8%3D&Expires=1739810144",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-16170071ec9e.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DV9KeGhT2HJhayp7NWdn4xN32BA%3D&Expires=1739810146",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-8b7069ba44e6.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=VEr1NU1h45%2FcbCif0Ah%2FzX7nkqE%3D&Expires=1739810146",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-3ac7c41a8d1f.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ILDl2MEoltkXjMu7FkOUpN%2FeMhI%3D&Expires=1739810148",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-c7afd77206f8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=QtH1rXLr2a2ed8M0Stm1qfAIlCU%3D&Expires=1739810148",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-41cf458b1ccb.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DyNT4FheHsNt7YaW2rOK3dgAbxc%3D&Expires=1739810149",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-a21cd92d0df8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=xbEFm8NkINBW3O2UasG3Nlo51lc%3D&Expires=1739810150",
# Original Evals
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-c15d3c34d10d.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=kqoPu6Ht3sAv11ZAFE4liKu0ho0%3D&Expires=1739812401",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-b9a4cc301fa3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ix5zoGQyQF%2FQCcqDP1gBSlqjM0E%3D&Expires=1739812403",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_10-3e00795c34de.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=WJo1Tqes4ydf4kxCr024g0KbEds%3D&Expires=1739812404",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_11-4ea062af447c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jwNO31I3WqrRIAW8jUhW6FrxV1A%3D&Expires=1739812405",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_12-9f3ea088d8a0.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=8bhpA%2FGoy14%2Fh3La4A0rLFvuPxE%3D&Expires=1739812406",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_13-f7481e41c5d0.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=VJ3CZXMw%2B0OoIB1FKBibGa9E3yk%3D&Expires=1739812407",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_14-cd6624035b58.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=9pxeFOTR9ptqwepa5qtffHrh8Ko%3D&Expires=1739812408",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_15-6fe9491d125b.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=K3i8HpltQoFZaN3NMcPcbG0Vdtc%3D&Expires=1739812409",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_16-21ede1015505.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=Fuj%2BvYc6sCwfRAgvY3QminEZ0jo%3D&Expires=1739812410",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_17-088c7e4e2c24.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=LY2TDT8ypI4QhU2eHEEF7avvrGI%3D&Expires=1739812411",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_18-d525c9d28236.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=t17%2FAJIEwWFox8ZAbNEctQtsuXg%3D&Expires=1739812412",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_19-ef01ff7f17fa.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=QmMlFJ1IUYPfx%2FFOjHA%2FbedzXhA%3D&Expires=1739812413",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-e96cf945ea1c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=aE7A%2F7Klf2UEagT4IZ0vdfMCDRY%3D&Expires=1739812414",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-6ab4be814f65.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=AsODz2eclnNf0qLr3037a1Lo7EQ%3D&Expires=1739812415",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-3b930c7969e5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=mf6IKyI9HPbAVhgy79IrQu28jtY%3D&Expires=1739812416",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-c0ce41895c33.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=wlbiLV4CsIPqcH%2F%2BsCH62HiiDCA%3D&Expires=1739812417",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-8e483112d495.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=IxGBDhb6FcP8V322n%2FLUUS3C5VA%3D&Expires=1739812419",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-cf765eb5df67.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=I8u8xE0CIpOXuTUMbMCXPeGzOwY%3D&Expires=1739812420",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-07d44a6d53ac.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=nE0z2tGfJoaKM6m2vDdNTFbd2M4%3D&Expires=1739812421",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-26dc69a5ffe9.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=pPJmms04gIfPhuencLUYHLv%2F%2FP4%3D&Expires=1739812423",
]
# import tinyhost

View File

@ -1,76 +0,0 @@
import argparse
import logging
import os
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from datasets import Dataset
from olmocr.train.dataloader import build_batch_query_response_vision_dataset
def save_dataset_in_parquet(dataset: Dataset, output_dir: str, rows_per_file: int = 10000, s3_endpoint_url: str = None):
    """
    Save `dataset` as one or more Parquet shards of at most `rows_per_file` rows.

    Args:
        dataset: HuggingFace Dataset to serialize.
        output_dir: Local directory or "s3://bucket/prefix" destination.
        rows_per_file: Maximum number of rows per Parquet shard.
        s3_endpoint_url: Optional custom S3 endpoint URL (for S3-compatible stores).

    Raises:
        NoCredentialsError, PartialCredentialsError: if an S3 upload cannot
            authenticate (logged, then re-raised).
    """
    # Bug fix: `logger` was previously a module global defined only inside the
    # `__main__` guard, so importing this module and calling this function
    # raised NameError. Bind a proper module logger locally instead.
    logger = logging.getLogger(__name__)
    logger.info("Saving dataset in Parquet files")

    # Check if the output is an S3 path.
    is_s3 = output_dir.startswith("s3://")
    if is_s3:
        s3_client = boto3.client("s3", endpoint_url=s3_endpoint_url) if s3_endpoint_url else boto3.client("s3")
    else:
        os.makedirs(output_dir, exist_ok=True)

    total_rows = len(dataset)
    for start_idx in range(0, total_rows, rows_per_file):
        end_idx = min(start_idx + rows_per_file, total_rows)
        file_name = f"dataset_{start_idx}_{end_idx}.parquet"
        if is_s3:
            # Write the shard to a local temp file first, then upload it to S3.
            bucket_name, key_prefix = parse_s3_path(output_dir)
            output_path = f"{key_prefix}/{file_name}"
            local_temp_file = f"/tmp/{file_name}"
            logger.info(f"Saving rows {start_idx} to {end_idx} locally at {local_temp_file}")
            dataset.select(range(start_idx, end_idx)).to_parquet(local_temp_file)
            try:
                logger.info(f"Uploading {local_temp_file} to s3://{bucket_name}/{output_path}")
                s3_client.upload_file(local_temp_file, bucket_name, output_path)
            except (NoCredentialsError, PartialCredentialsError) as e:
                logger.error(f"Failed to upload to S3: {e}")
                raise
            finally:
                # Remove the local shard whether or not the upload succeeded.
                os.remove(local_temp_file)
        else:
            # Saving locally.
            output_path = os.path.join(output_dir, file_name)
            logger.info(f"Saving rows {start_idx} to {end_idx} in {output_path}")
            dataset.select(range(start_idx, end_idx)).to_parquet(output_path)
def parse_s3_path(s3_path: str):
    """Split an "s3://bucket/key/prefix" path into (bucket, key_prefix)."""
    scheme = "s3://"
    if not s3_path.startswith(scheme):
        raise ValueError("S3 path must start with 's3://'")
    remainder = s3_path[len(scheme):]
    bucket_name, _, key_prefix = remainder.partition("/")
    return bucket_name, key_prefix
if __name__ == "__main__":
    # CLI entry point: build the vision dataset from query/response JSONL
    # globs, then shard it out as Parquet (locally or to S3).
    parser = argparse.ArgumentParser(description="Process and save dataset as Parquet files.")
    parser.add_argument(
        "--query_path",
        type=str,
        required=True,
        help="Path to the query dataset JSONL files.",
    )
    parser.add_argument(
        "--response_path",
        type=str,
        required=True,
        help="Path to the response dataset JSONL files.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Directory or S3 path to save the output Parquet files.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=32,
        help="Number of processes to use for data processing.",
    )
    parser.add_argument(
        "--s3_endpoint_url",
        type=str,
        default=None,
        help="Custom S3 endpoint URL, e.g., for S3-compatible storage.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    # Kept at module scope under this name: save_dataset_in_parquet reads it.
    logger = logging.getLogger(__name__)

    # Build the dataset.
    final_dataset = build_batch_query_response_vision_dataset(
        query_glob_path=args.query_path,
        response_glob_path=args.response_path,
        num_proc=args.num_proc,
    )

    # Save the dataset as Parquet files.
    save_dataset_in_parquet(final_dataset, args.output_dir, s3_endpoint_url=args.s3_endpoint_url)
    logger.info("Dataset processing and saving completed.")