mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Random code to eval gpt4o in the ELO rankings for the technical report
This commit is contained in:
parent
c74d47a553
commit
7bfc285f12
2
.gitignore
vendored
2
.gitignore
vendored
@ -11,6 +11,8 @@ sample200_vllm/*
|
||||
sample200_sglang/*
|
||||
pdelfin_testset/*
|
||||
localworkspace/*
|
||||
gpt4otestset/*
|
||||
gpt4otestset_output/*
|
||||
/*.html
|
||||
scoreelo.csv
|
||||
debug.log
|
||||
|
75
olmocr/data/buildgpt4otest.py
Normal file
75
olmocr/data/buildgpt4otest.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
import boto3
|
||||
import concurrent.futures
|
||||
from tqdm import tqdm
|
||||
|
||||
from olmocr.s3_utils import get_s3_bytes, expand_s3_glob
|
||||
from olmocr.data.buildsilver import build_page_query
|
||||
|
||||
# S3 glob matching every PDF that should be processed by this script.
PDF_GLOB = "s3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/*.pdf"

# A single boto3 S3 client shared by the listing below and by process_pdf.
s3 = boto3.client("s3")

# Expand the glob into the concrete list of PDF S3 URIs and report the count.
all_pdfs = expand_s3_glob(s3, PDF_GLOB)
print(f"Found {len(all_pdfs)} PDFs.")
|
||||
|
||||
def process_pdf(pdf_s3_uri):
    """
    Fetch one PDF from S3 and return the natural text extracted for page 1.

    The PDF bytes are written to a temporary file because build_page_query
    works on a local file path. The JSON in the first choice's message
    content of the build_page_query response is parsed and its
    "natural_text" entry is returned. The temporary file is always removed.

    Args:
        pdf_s3_uri: S3 URI of the PDF to process.

    Returns:
        The value stored under "natural_text" in the parsed response, or
        None if any step raised (the error is printed instead of propagated,
        so callers must check for None).
    """
    local_pdf_path = None
    try:
        # Download the PDF as bytes from S3.
        pdf_bytes = get_s3_bytes(s3, pdf_s3_uri)

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            # Record the path before writing: delete=False means nothing else
            # removes this file, so the finally block must know about it even
            # when the write itself fails.
            local_pdf_path = tmp_file.name
            tmp_file.write(pdf_bytes)
        # Leaving the with-block closes (and therefore flushes) the file, so
        # build_page_query sees the complete PDF on disk.

        example = build_page_query(local_pdf_path, pdf_s3_uri, 1)
        content = example.choices[0].message.content
        print(content)

        return json.loads(content)["natural_text"]

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None.
        print(f"Error processing {pdf_s3_uri}: {e}")
        return None

    finally:
        # Remove the temporary file if it was created.
        if local_pdf_path is not None:
            try:
                os.remove(local_pdf_path)
            except FileNotFoundError:
                pass
|
||||
|
||||
# Process the PDFs concurrently on a small thread pool and write each
# successful result to its own Markdown file.
max_workers = 4  # Adjust the number of threads as needed.

# Directory that receives one "<name>_gpt4o.md" file per processed PDF.
# Created once up front instead of on every loop iteration.
output_dir = "gpt4otestset_output"
os.makedirs(output_dir, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit every PDF and remember which URI each future belongs to.
    future_to_pdf = {executor.submit(process_pdf, pdf_uri): pdf_uri for pdf_uri in all_pdfs}

    # as_completed yields futures in completion order; tqdm shows progress.
    for future in tqdm(
        concurrent.futures.as_completed(future_to_pdf),
        total=len(future_to_pdf),
        desc="Processing PDFs",
    ):
        result = future.result()
        if result is None:
            # process_pdf returns None when it failed; nothing to write.
            continue

        # Derive the output name from the PDF's base name,
        # e.g. "document.pdf" -> "document_gpt4o.md".
        pdf_s3_uri = future_to_pdf[future]
        pdf_basename = os.path.basename(pdf_s3_uri)
        output_filename = pdf_basename.replace(".pdf", "_gpt4o.md")
        output_filepath = os.path.join(output_dir, output_filename)

        with open(output_filepath, "w", encoding="utf-8") as outfile:
            outfile.write(result)
        print(f"Wrote result to {output_filepath}")

print("Processing complete.")
|
@ -30,27 +30,28 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
|
||||
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
|
||||
|
||||
# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
|
||||
# from openai import OpenAI
|
||||
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
from openai import OpenAI
|
||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
# response = client.chat.completions.create(
|
||||
# model="gpt-4o-2024-08-06",
|
||||
# messages= [
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": [
|
||||
# {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
|
||||
# {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
|
||||
# ],
|
||||
# }
|
||||
# ],
|
||||
# temperature=0.1,
|
||||
# max_tokens=3000,
|
||||
# logprobs=True,
|
||||
# top_logprobs=5,
|
||||
# response_format=openai_response_format_schema()
|
||||
# )
|
||||
# print(response)
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o-2024-08-06",
|
||||
messages= [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
|
||||
],
|
||||
}
|
||||
],
|
||||
temperature=0.1,
|
||||
max_tokens=3000,
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
response_format=openai_response_format_schema()
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
# Construct OpenAI Batch API request format
|
||||
# There are a few tricks to know when doing data processing with OpenAI's apis
|
||||
|
@ -34,7 +34,7 @@ class Comparison:
|
||||
return re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_b_path).group(1)
|
||||
|
||||
|
||||
def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"):
|
||||
def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy", force_comparison=None):
|
||||
"""Process a single PDF and return its comparisons."""
|
||||
# Create resources inside the worker process
|
||||
s3_client = boto3.client("s3")
|
||||
@ -42,21 +42,24 @@ def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"):
|
||||
aligner = HirschbergAligner(match_score=1, mismatch_score=-1, indel_score=-1)
|
||||
comparer = DocumentEditSimilarity(segmenter=segmenter, aligner=aligner)
|
||||
|
||||
pdf_comps = []
|
||||
result_comps = []
|
||||
|
||||
# Get all comparison files for this PDF
|
||||
# Original behavior: collect all comparison files for this PDF.
|
||||
pdf_comps = []
|
||||
for comp in comparisons:
|
||||
comp_path = pdf_path.replace(".pdf", f"_{comp}.md")
|
||||
if comp_path in all_mds:
|
||||
pdf_comps.append(comp_path)
|
||||
|
||||
# Generate all possible combinations
|
||||
# Generate all possible combinations (randomizing order occasionally)
|
||||
for compa, compb in combinations(pdf_comps, 2):
|
||||
if random.choice([True, False]):
|
||||
compa, compb = compb, compa
|
||||
|
||||
# Get the text content
|
||||
if force_comparison:
|
||||
if not compa.endswith(f"_{force_comparison}.md") and not compb.endswith(f"_{force_comparison}.md"):
|
||||
continue
|
||||
|
||||
text_a = get_s3_bytes(s3_client, compa).decode("utf-8")
|
||||
text_b = get_s3_bytes(s3_client, compb).decode("utf-8")
|
||||
|
||||
@ -96,7 +99,9 @@ def build_review_page(args, comparisons, index=0):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Generates comparison voting pages between different pairs of parses for a PDF.")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generates comparison voting pages between different pairs of parses for a PDF."
|
||||
)
|
||||
parser.add_argument("--name", default="review_page", help="What name to give to this evaluation/comparison")
|
||||
parser.add_argument(
|
||||
"--review_size",
|
||||
@ -110,7 +115,11 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="Maximum number of worker processes to use for parallel processing",
|
||||
)
|
||||
parser.add_argument("--comparisons", default=["pdelf", "marker", "gotocr_format", "mineru"], help="Different variants to compare against")
|
||||
parser.add_argument(
|
||||
"--comparisons",
|
||||
default=["pdelf", "marker", "gotocr_format", "mineru", "gpt4o"],
|
||||
help="Different variants to compare against",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_copies",
|
||||
default=1,
|
||||
@ -118,7 +127,15 @@ if __name__ == "__main__":
|
||||
help="Number of reports to generate, labeled _0, _1, etc. if greater than 1",
|
||||
)
|
||||
parser.add_argument(
|
||||
"s3_path", type=str, help="Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf"
|
||||
"--force_comparison",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Force one method to be included in every comparison (e.g., 'mineru')."
|
||||
)
|
||||
parser.add_argument(
|
||||
"s3_path",
|
||||
type=str,
|
||||
help="Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@ -132,8 +149,13 @@ if __name__ == "__main__":
|
||||
|
||||
all_comps = []
|
||||
|
||||
# Create a partial function with all the common arguments
|
||||
process_pdf = functools.partial(process_single_pdf, all_mds=all_mds, comparisons=args.comparisons)
|
||||
# Create a partial function with the common arguments, including the forced method (if any)
|
||||
process_pdf = functools.partial(
|
||||
process_single_pdf,
|
||||
all_mds=all_mds,
|
||||
comparisons=args.comparisons,
|
||||
force_comparison=args.force_comparison,
|
||||
)
|
||||
|
||||
# Use ProcessPoolExecutor for parallel processing
|
||||
with ProcessPoolExecutor(max_workers=args.max_workers) as executor:
|
||||
|
@ -286,26 +286,39 @@ def make_report(urls):
|
||||
if __name__ == "__main__":
|
||||
# Example usage
|
||||
urls = [
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-ff70abb8f517.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NarEyyCfvusCh%2FHdB47VfHOnnBs%3D&Expires=1738359221",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-0800f9af46cf.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ncTWAu5rSndBJJsU26HRYDaK6i8%3D&Expires=1738359222",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_10-f7081f6ca6f9.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=gYX8yjGyYshRqXGgdsX17%2Fdi9Ig%3D&Expires=1738359223",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_11-355dc69335bc.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=7%2Bc5qoa8Tbk06z0VcvJiIIVAz9M%3D&Expires=1738359224",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_12-95fce9bf0c18.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=fw4PBo0LnxikmLZ8xH%2BGD%2F%2BhXMU%3D&Expires=1738359225",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_13-f88f7d7482bf.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=yXkQp9oFDtroKgiO50EwpYdGLcA%3D&Expires=1738359226",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_14-8ac0b974bfd5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=EgZTpj1%2FdzMBUgd%2BX4pVZ1Sp%2FrA%3D&Expires=1738359226",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_15-e3136188de5c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=YKhAv4unNIlRcerQAaHN4kjc4qI%3D&Expires=1738359227",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_16-2c5abde50d49.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=Mj8%2BK5ISKzAYQFeYvmzTgCPcRwA%3D&Expires=1738359228",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_17-f13132a4cdcc.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=%2FHuzw2cjJ4oFm91UXojPnGzYi8Q%3D&Expires=1738359229",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_18-25070f2aa05e.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ctd%2BUIM%2FxryJm%2FcwA%2BRZ%2FbRzBp8%3D&Expires=1738359230",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_19-d436ee434162.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jVdFKobIoHlbTQ7zziG%2BXiIQ0Fo%3D&Expires=1738359230",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-a5ece743fd31.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=K8hIrjWtvo4SLVQrOB8TiXLgNJk%3D&Expires=1738359231",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-9ce03af05f51.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=T0fLGSH%2Bv%2F19veqbxnLxoSf7gVA%3D&Expires=1738359232",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-94eec18f8027.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=u2R1LundKpfnAUCcD%2BdGHA6uIR0%3D&Expires=1738359233",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-377d0a7d8f5a.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=5R38ZQAR9ew5x%2BRmMVQbTqbfVh0%3D&Expires=1738359234",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-537b22646a26.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=PLOELum1qzOXW8Cm5rfZphlFeMw%3D&Expires=1738359235",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-a4a7dcb08f20.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DxPHukGXEpPrEPL6TF9QBKPE1Xg%3D&Expires=1738359236",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-48a71c829863.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=TjEINKj69HdmXsKY59k4f3PieeM%3D&Expires=1738359237",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-8557438928c3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=F7sQxw5A%2FDOcOaa%2FQSeqepH0PQc%3D&Expires=1738359238",
|
||||
# With GPT 4o comparisons
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-7c659f0a21b8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=hhwK5u98Rr3%2B%2BhZPnfERG5J6lCQ%3D&Expires=1739810142",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-ec88d9678783.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=e5Hz2mPqvWF6UOIAnM2TIsIzlqE%3D&Expires=1739810143",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-0a136799c857.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=AnVvCG6vzI8DVFkjWH1gu5WK5Vs%3D&Expires=1739810144",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-0fcd55d81dca.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=I4aW6Eg%2BhaI9NGsWX1%2BCL%2BRi5I8%3D&Expires=1739810144",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-16170071ec9e.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DV9KeGhT2HJhayp7NWdn4xN32BA%3D&Expires=1739810146",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-8b7069ba44e6.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=VEr1NU1h45%2FcbCif0Ah%2FzX7nkqE%3D&Expires=1739810146",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-3ac7c41a8d1f.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ILDl2MEoltkXjMu7FkOUpN%2FeMhI%3D&Expires=1739810148",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-c7afd77206f8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=QtH1rXLr2a2ed8M0Stm1qfAIlCU%3D&Expires=1739810148",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-41cf458b1ccb.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DyNT4FheHsNt7YaW2rOK3dgAbxc%3D&Expires=1739810149",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-a21cd92d0df8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=xbEFm8NkINBW3O2UasG3Nlo51lc%3D&Expires=1739810150",
|
||||
|
||||
# Original Evals
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-c15d3c34d10d.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=kqoPu6Ht3sAv11ZAFE4liKu0ho0%3D&Expires=1739812401",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-b9a4cc301fa3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ix5zoGQyQF%2FQCcqDP1gBSlqjM0E%3D&Expires=1739812403",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_10-3e00795c34de.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=WJo1Tqes4ydf4kxCr024g0KbEds%3D&Expires=1739812404",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_11-4ea062af447c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jwNO31I3WqrRIAW8jUhW6FrxV1A%3D&Expires=1739812405",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_12-9f3ea088d8a0.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=8bhpA%2FGoy14%2Fh3La4A0rLFvuPxE%3D&Expires=1739812406",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_13-f7481e41c5d0.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=VJ3CZXMw%2B0OoIB1FKBibGa9E3yk%3D&Expires=1739812407",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_14-cd6624035b58.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=9pxeFOTR9ptqwepa5qtffHrh8Ko%3D&Expires=1739812408",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_15-6fe9491d125b.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=K3i8HpltQoFZaN3NMcPcbG0Vdtc%3D&Expires=1739812409",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_16-21ede1015505.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=Fuj%2BvYc6sCwfRAgvY3QminEZ0jo%3D&Expires=1739812410",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_17-088c7e4e2c24.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=LY2TDT8ypI4QhU2eHEEF7avvrGI%3D&Expires=1739812411",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_18-d525c9d28236.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=t17%2FAJIEwWFox8ZAbNEctQtsuXg%3D&Expires=1739812412",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_19-ef01ff7f17fa.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=QmMlFJ1IUYPfx%2FFOjHA%2FbedzXhA%3D&Expires=1739812413",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-e96cf945ea1c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=aE7A%2F7Klf2UEagT4IZ0vdfMCDRY%3D&Expires=1739812414",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-6ab4be814f65.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=AsODz2eclnNf0qLr3037a1Lo7EQ%3D&Expires=1739812415",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-3b930c7969e5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=mf6IKyI9HPbAVhgy79IrQu28jtY%3D&Expires=1739812416",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-c0ce41895c33.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=wlbiLV4CsIPqcH%2F%2BsCH62HiiDCA%3D&Expires=1739812417",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-8e483112d495.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=IxGBDhb6FcP8V322n%2FLUUS3C5VA%3D&Expires=1739812419",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-cf765eb5df67.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=I8u8xE0CIpOXuTUMbMCXPeGzOwY%3D&Expires=1739812420",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-07d44a6d53ac.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=nE0z2tGfJoaKM6m2vDdNTFbd2M4%3D&Expires=1739812421",
|
||||
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-26dc69a5ffe9.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=pPJmms04gIfPhuencLUYHLv%2F%2FP4%3D&Expires=1739812423",
|
||||
]
|
||||
# import tinyhost
|
||||
|
||||
|
@ -1,76 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
|
||||
from datasets import Dataset
|
||||
|
||||
from olmocr.train.dataloader import build_batch_query_response_vision_dataset
|
||||
|
||||
|
||||
def save_dataset_in_parquet(dataset: "Dataset", output_dir: str, rows_per_file: int = 10000, s3_endpoint_url: str = None):
    """Write ``dataset`` as a series of Parquet files, locally or to S3.

    The dataset is split into consecutive chunks of at most ``rows_per_file``
    rows; each chunk is written to ``dataset_<start>_<end>.parquet``.

    Args:
        dataset: Object supporting ``len()`` and ``select(range)``, whose
            selection supports ``to_parquet(path)``.
        output_dir: Local directory (created if missing) or an ``s3://`` path.
        rows_per_file: Maximum number of rows per Parquet file; must be > 0.
        s3_endpoint_url: Optional custom endpoint passed to the boto3 client
            when ``output_dir`` is an S3 path.

    Raises:
        ValueError: If ``rows_per_file`` is not positive.
        NoCredentialsError, PartialCredentialsError: Logged and re-raised when
            an S3 upload fails for lack of credentials.
    """
    # Local import: the staging directory is only needed by this function.
    import tempfile

    if rows_per_file <= 0:
        raise ValueError("rows_per_file must be a positive integer")

    # Resolve the logger here so the function also works when this module is
    # imported rather than run as a script.
    log = logging.getLogger(__name__)
    log.info("Saving dataset in Parquet files")

    # Check if the output is an S3 path
    is_s3 = output_dir.startswith("s3://")
    if is_s3:
        s3_client = boto3.client("s3", endpoint_url=s3_endpoint_url) if s3_endpoint_url else boto3.client("s3")
        # Parse the destination once; it does not change between chunks.
        bucket_name, key_prefix = parse_s3_path(output_dir)
        # Avoid "prefix//file" keys when the path was given with a trailing slash.
        key_prefix = key_prefix.rstrip("/")
    else:
        os.makedirs(output_dir, exist_ok=True)

    total_rows = len(dataset)
    for start_idx in range(0, total_rows, rows_per_file):
        end_idx = min(start_idx + rows_per_file, total_rows)
        file_name = f"dataset_{start_idx}_{end_idx}.parquet"

        if not is_s3:
            # Saving locally
            output_path = os.path.join(output_dir, file_name)
            log.info(f"Saving rows {start_idx} to {end_idx} in {output_path}")
            dataset.select(range(start_idx, end_idx)).to_parquet(output_path)
            continue

        # Saving to S3: stage the chunk in a private temporary directory,
        # which is removed on exit whether or not writing/uploading succeeds.
        output_path = f"{key_prefix}/{file_name}" if key_prefix else file_name
        with tempfile.TemporaryDirectory() as staging_dir:
            local_temp_file = os.path.join(staging_dir, file_name)
            log.info(f"Saving rows {start_idx} to {end_idx} locally at {local_temp_file}")
            dataset.select(range(start_idx, end_idx)).to_parquet(local_temp_file)
            try:
                log.info(f"Uploading {local_temp_file} to s3://{bucket_name}/{output_path}")
                s3_client.upload_file(local_temp_file, bucket_name, output_path)
            except (NoCredentialsError, PartialCredentialsError) as e:
                log.error(f"Failed to upload to S3: {e}")
                raise
|
||||
|
||||
|
||||
def parse_s3_path(s3_path: str):
    """Split an ``s3://bucket/key/prefix`` path into ``(bucket, key_prefix)``.

    The key prefix is everything after the first ``/`` following the bucket
    name; it is the empty string when the path names only a bucket.

    Raises:
        ValueError: If the path does not start with ``s3://`` or has no
            bucket name.
    """
    scheme = "s3://"
    if not s3_path.startswith(scheme):
        raise ValueError("S3 path must start with 's3://'")
    bucket_name, _, key_prefix = s3_path[len(scheme):].partition("/")
    if not bucket_name:
        # "s3://" or "s3:///key" would otherwise yield an empty bucket name.
        raise ValueError(f"S3 path is missing a bucket name: {s3_path!r}")
    return bucket_name, key_prefix
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs,
    # registered in the order listed.
    cli_options = [
        ("--query_path", dict(type=str, required=True, help="Path to the query dataset JSONL files.")),
        ("--response_path", dict(type=str, required=True, help="Path to the response dataset JSONL files.")),
        ("--output_dir", dict(type=str, required=True, help="Directory or S3 path to save the output Parquet files.")),
        ("--num_proc", dict(type=int, default=32, help="Number of processes to use for data processing.")),
        ("--s3_endpoint_url", dict(type=str, default=None, help="Custom S3 endpoint URL, e.g., for S3-compatible storage.")),
    ]

    parser = argparse.ArgumentParser(description="Process and save dataset as Parquet files.")
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Configure logging and bind the module-level logger for this script.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Step 1: build the dataset from the query and response files.
    final_dataset = build_batch_query_response_vision_dataset(
        query_glob_path=args.query_path,
        response_glob_path=args.response_path,
        num_proc=args.num_proc,
    )

    # Step 2: write it out as Parquet files (local directory or S3).
    save_dataset_in_parquet(final_dataset, args.output_dir, s3_endpoint_url=args.s3_endpoint_url)

    logger.info("Dataset processing and saving completed.")
|
Loading…
x
Reference in New Issue
Block a user