Ad-hoc code to evaluate GPT-4o in the ELO rankings for the technical report

This commit is contained in:
Jake Poznanski 2025-02-10 22:00:07 +00:00
parent c74d47a553
commit 7bfc285f12
6 changed files with 163 additions and 126 deletions

2
.gitignore vendored
View File

@ -11,6 +11,8 @@ sample200_vllm/*
sample200_sglang/*
pdelfin_testset/*
localworkspace/*
gpt4otestset/*
gpt4otestset_output/*
/*.html
scoreelo.csv
debug.log

View File

@ -0,0 +1,75 @@
import os
import json
import tempfile
import boto3
import concurrent.futures
from tqdm import tqdm
from olmocr.s3_utils import get_s3_bytes, expand_s3_glob
from olmocr.data.buildsilver import build_page_query
# Initialize the boto3 S3 client (shared by all worker threads below;
# boto3 clients are generally safe to share across threads).
s3 = boto3.client("s3")
# Expand the S3 glob to get a list of all PDF S3 URIs in the test set.
# NOTE: this runs at import time and performs network I/O.
all_pdfs = expand_s3_glob(s3, "s3://ai2-oe-data/jakep/pdfdata/pdelfin_testset/*.pdf")
print(f"Found {len(all_pdfs)} PDFs.")
def process_pdf(pdf_s3_uri):
    """
    Download a PDF from S3, build a GPT-4o page query for its first page,
    and return the extracted natural text.

    Args:
        pdf_s3_uri: Full s3:// URI of the PDF to process.

    Returns:
        The "natural_text" string parsed from the model response, or None
        on any failure (errors are printed, not raised, so one bad PDF
        does not abort the whole batch).
    """
    local_pdf_path = None
    try:
        # Download the PDF as bytes from S3.
        pdf_bytes = get_s3_bytes(s3, pdf_s3_uri)
        # build_page_query needs a real file path, so stage the bytes in a
        # temporary file (delete=False so the path survives past the `with`;
        # cleanup happens in the `finally` block below).
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file.write(pdf_bytes)
            tmp_file.flush()
            local_pdf_path = tmp_file.name
        # Query page 1 of the PDF and parse the JSON body of the response.
        # (Removed a leftover debug print that dumped the full raw model
        # response for every PDF.)
        example = build_page_query(local_pdf_path, pdf_s3_uri, 1)
        data = json.loads(example.choices[0].message.content)
        return data["natural_text"]
    except Exception as e:
        # Best-effort: report and skip this PDF rather than failing the batch.
        print(f"Error processing {pdf_s3_uri}: {e}")
        return None
    finally:
        # Remove the temporary file if it exists.
        if local_pdf_path and os.path.exists(local_pdf_path):
            os.remove(local_pdf_path)
# Use ThreadPoolExecutor to process PDFs concurrently (the work is
# I/O-bound: S3 downloads plus OpenAI API calls).
max_workers = 4  # Adjust the number of threads as needed.

# Create the output directory once, up front — previously this was
# redone inside the results loop for every completed PDF.
output_dir = "gpt4otestset_output"
os.makedirs(output_dir, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all the PDF processing tasks.
    future_to_pdf = {executor.submit(process_pdf, pdf_uri): pdf_uri for pdf_uri in all_pdfs}
    # Use tqdm to display a progress bar as tasks complete.
    for future in tqdm(concurrent.futures.as_completed(future_to_pdf),
                       total=len(future_to_pdf), desc="Processing PDFs"):
        pdf_s3_uri = future_to_pdf[future]
        result = future.result()
        if result is None:
            # process_pdf already printed the error; skip this PDF.
            continue
        # Map "document.pdf" -> "document_gpt4o.md" in the output directory.
        pdf_basename = os.path.basename(pdf_s3_uri)
        output_filename = pdf_basename.replace(".pdf", "_gpt4o.md")
        output_filepath = os.path.join(output_dir, output_filename)
        with open(output_filepath, "w", encoding="utf-8") as outfile:
            outfile.write(result)
        print(f"Wrote result to {output_filepath}")

print("Processing complete.")

View File

@ -30,27 +30,28 @@ def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> di
anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
# DEBUG crappy temporary code here that does the actual api call live so I can debug it a bit
# from openai import OpenAI
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# response = client.chat.completions.create(
# model="gpt-4o-2024-08-06",
# messages= [
# {
# "role": "user",
# "content": [
# {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
# {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
# ],
# }
# ],
# temperature=0.1,
# max_tokens=3000,
# logprobs=True,
# top_logprobs=5,
# response_format=openai_response_format_schema()
# )
# print(response)
response = client.chat.completions.create(
model="gpt-4o-2024-08-06",
messages= [
{
"role": "user",
"content": [
{"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
],
}
],
temperature=0.1,
max_tokens=3000,
logprobs=True,
top_logprobs=5,
response_format=openai_response_format_schema()
)
return response
# Construct OpenAI Batch API request format#
# There are a few tricks to know when doing data processing with OpenAI's apis

View File

@ -34,7 +34,7 @@ class Comparison:
return re.search(r"page[0-9]+_(\w+)\.md$", self.comparison_b_path).group(1)
def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"):
def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy", force_comparison=None):
"""Process a single PDF and return its comparisons."""
# Create resources inside the worker process
s3_client = boto3.client("s3")
@ -42,21 +42,24 @@ def process_single_pdf(pdf_path, all_mds, comparisons, segmenter_name="spacy"):
aligner = HirschbergAligner(match_score=1, mismatch_score=-1, indel_score=-1)
comparer = DocumentEditSimilarity(segmenter=segmenter, aligner=aligner)
pdf_comps = []
result_comps = []
# Get all comparison files for this PDF
# Original behavior: collect all comparison files for this PDF.
pdf_comps = []
for comp in comparisons:
comp_path = pdf_path.replace(".pdf", f"_{comp}.md")
if comp_path in all_mds:
pdf_comps.append(comp_path)
# Generate all possible combinations
# Generate all possible combinations (randomizing order occasionally)
for compa, compb in combinations(pdf_comps, 2):
if random.choice([True, False]):
compa, compb = compb, compa
# Get the text content
if force_comparison:
if not compa.endswith(f"_{force_comparison}.md") and not compb.endswith(f"_{force_comparison}.md"):
continue
text_a = get_s3_bytes(s3_client, compa).decode("utf-8")
text_b = get_s3_bytes(s3_client, compb).decode("utf-8")
@ -96,7 +99,9 @@ def build_review_page(args, comparisons, index=0):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generates comparison voting pages between different pairs of parses for a PDF.")
parser = argparse.ArgumentParser(
description="Generates comparison voting pages between different pairs of parses for a PDF."
)
parser.add_argument("--name", default="review_page", help="What name to give to this evaluation/comparison")
parser.add_argument(
"--review_size",
@ -110,7 +115,11 @@ if __name__ == "__main__":
default=None,
help="Maximum number of worker processes to use for parallel processing",
)
parser.add_argument("--comparisons", default=["pdelf", "marker", "gotocr_format", "mineru"], help="Different variants to compare against")
parser.add_argument(
"--comparisons",
default=["pdelf", "marker", "gotocr_format", "mineru", "gpt4o"],
help="Different variants to compare against",
)
parser.add_argument(
"--num_copies",
default=1,
@ -118,7 +127,15 @@ if __name__ == "__main__":
help="Number of reports to generate, labeled _0, _1, etc. if greater than 1",
)
parser.add_argument(
"s3_path", type=str, help="Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf"
"--force_comparison",
type=str,
default=None,
help="Force one method to be included in every comparison (e.g., 'mineru')."
)
parser.add_argument(
"s3_path",
type=str,
help="Path to the folder where you keep your data files, expecting to see *.md files in there along with *.png and *.pdf",
)
args = parser.parse_args()
@ -132,8 +149,13 @@ if __name__ == "__main__":
all_comps = []
# Create a partial function with all the common arguments
process_pdf = functools.partial(process_single_pdf, all_mds=all_mds, comparisons=args.comparisons)
# Create a partial function with the common arguments, including the forced method (if any)
process_pdf = functools.partial(
process_single_pdf,
all_mds=all_mds,
comparisons=args.comparisons,
force_comparison=args.force_comparison,
)
# Use ProcessPoolExecutor for parallel processing
with ProcessPoolExecutor(max_workers=args.max_workers) as executor:

View File

@ -286,26 +286,39 @@ def make_report(urls):
if __name__ == "__main__":
# Example usage
urls = [
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-ff70abb8f517.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NarEyyCfvusCh%2FHdB47VfHOnnBs%3D&Expires=1738359221",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-0800f9af46cf.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ncTWAu5rSndBJJsU26HRYDaK6i8%3D&Expires=1738359222",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_10-f7081f6ca6f9.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=gYX8yjGyYshRqXGgdsX17%2Fdi9Ig%3D&Expires=1738359223",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_11-355dc69335bc.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=7%2Bc5qoa8Tbk06z0VcvJiIIVAz9M%3D&Expires=1738359224",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_12-95fce9bf0c18.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=fw4PBo0LnxikmLZ8xH%2BGD%2F%2BhXMU%3D&Expires=1738359225",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_13-f88f7d7482bf.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=yXkQp9oFDtroKgiO50EwpYdGLcA%3D&Expires=1738359226",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_14-8ac0b974bfd5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=EgZTpj1%2FdzMBUgd%2BX4pVZ1Sp%2FrA%3D&Expires=1738359226",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_15-e3136188de5c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=YKhAv4unNIlRcerQAaHN4kjc4qI%3D&Expires=1738359227",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_16-2c5abde50d49.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=Mj8%2BK5ISKzAYQFeYvmzTgCPcRwA%3D&Expires=1738359228",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_17-f13132a4cdcc.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=%2FHuzw2cjJ4oFm91UXojPnGzYi8Q%3D&Expires=1738359229",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_18-25070f2aa05e.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ctd%2BUIM%2FxryJm%2FcwA%2BRZ%2FbRzBp8%3D&Expires=1738359230",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_19-d436ee434162.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jVdFKobIoHlbTQ7zziG%2BXiIQ0Fo%3D&Expires=1738359230",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-a5ece743fd31.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=K8hIrjWtvo4SLVQrOB8TiXLgNJk%3D&Expires=1738359231",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-9ce03af05f51.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=T0fLGSH%2Bv%2F19veqbxnLxoSf7gVA%3D&Expires=1738359232",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-94eec18f8027.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=u2R1LundKpfnAUCcD%2BdGHA6uIR0%3D&Expires=1738359233",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-377d0a7d8f5a.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=5R38ZQAR9ew5x%2BRmMVQbTqbfVh0%3D&Expires=1738359234",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-537b22646a26.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=PLOELum1qzOXW8Cm5rfZphlFeMw%3D&Expires=1738359235",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-a4a7dcb08f20.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DxPHukGXEpPrEPL6TF9QBKPE1Xg%3D&Expires=1738359236",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-48a71c829863.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=TjEINKj69HdmXsKY59k4f3PieeM%3D&Expires=1738359237",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-8557438928c3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=F7sQxw5A%2FDOcOaa%2FQSeqepH0PQc%3D&Expires=1738359238",
# With GPT 4o comparisons
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-7c659f0a21b8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=hhwK5u98Rr3%2B%2BhZPnfERG5J6lCQ%3D&Expires=1739810142",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-ec88d9678783.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=e5Hz2mPqvWF6UOIAnM2TIsIzlqE%3D&Expires=1739810143",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-0a136799c857.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=AnVvCG6vzI8DVFkjWH1gu5WK5Vs%3D&Expires=1739810144",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-0fcd55d81dca.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=I4aW6Eg%2BhaI9NGsWX1%2BCL%2BRi5I8%3D&Expires=1739810144",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-16170071ec9e.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DV9KeGhT2HJhayp7NWdn4xN32BA%3D&Expires=1739810146",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-8b7069ba44e6.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=VEr1NU1h45%2FcbCif0Ah%2FzX7nkqE%3D&Expires=1739810146",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-3ac7c41a8d1f.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ILDl2MEoltkXjMu7FkOUpN%2FeMhI%3D&Expires=1739810148",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-c7afd77206f8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=QtH1rXLr2a2ed8M0Stm1qfAIlCU%3D&Expires=1739810148",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-41cf458b1ccb.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=DyNT4FheHsNt7YaW2rOK3dgAbxc%3D&Expires=1739810149",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-a21cd92d0df8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=xbEFm8NkINBW3O2UasG3Nlo51lc%3D&Expires=1739810150",
# Original Evals
"https://jakep-tinyhost.s3.amazonaws.com/review_page_0-c15d3c34d10d.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=kqoPu6Ht3sAv11ZAFE4liKu0ho0%3D&Expires=1739812401",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_1-b9a4cc301fa3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=ix5zoGQyQF%2FQCcqDP1gBSlqjM0E%3D&Expires=1739812403",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_10-3e00795c34de.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=WJo1Tqes4ydf4kxCr024g0KbEds%3D&Expires=1739812404",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_11-4ea062af447c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jwNO31I3WqrRIAW8jUhW6FrxV1A%3D&Expires=1739812405",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_12-9f3ea088d8a0.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=8bhpA%2FGoy14%2Fh3La4A0rLFvuPxE%3D&Expires=1739812406",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_13-f7481e41c5d0.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=VJ3CZXMw%2B0OoIB1FKBibGa9E3yk%3D&Expires=1739812407",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_14-cd6624035b58.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=9pxeFOTR9ptqwepa5qtffHrh8Ko%3D&Expires=1739812408",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_15-6fe9491d125b.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=K3i8HpltQoFZaN3NMcPcbG0Vdtc%3D&Expires=1739812409",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_16-21ede1015505.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=Fuj%2BvYc6sCwfRAgvY3QminEZ0jo%3D&Expires=1739812410",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_17-088c7e4e2c24.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=LY2TDT8ypI4QhU2eHEEF7avvrGI%3D&Expires=1739812411",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_18-d525c9d28236.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=t17%2FAJIEwWFox8ZAbNEctQtsuXg%3D&Expires=1739812412",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_19-ef01ff7f17fa.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=QmMlFJ1IUYPfx%2FFOjHA%2FbedzXhA%3D&Expires=1739812413",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-e96cf945ea1c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=aE7A%2F7Klf2UEagT4IZ0vdfMCDRY%3D&Expires=1739812414",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_3-6ab4be814f65.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=AsODz2eclnNf0qLr3037a1Lo7EQ%3D&Expires=1739812415",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_4-3b930c7969e5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=mf6IKyI9HPbAVhgy79IrQu28jtY%3D&Expires=1739812416",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_5-c0ce41895c33.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=wlbiLV4CsIPqcH%2F%2BsCH62HiiDCA%3D&Expires=1739812417",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_6-8e483112d495.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=IxGBDhb6FcP8V322n%2FLUUS3C5VA%3D&Expires=1739812419",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_7-cf765eb5df67.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=I8u8xE0CIpOXuTUMbMCXPeGzOwY%3D&Expires=1739812420",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_8-07d44a6d53ac.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=nE0z2tGfJoaKM6m2vDdNTFbd2M4%3D&Expires=1739812421",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_9-26dc69a5ffe9.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=pPJmms04gIfPhuencLUYHLv%2F%2FP4%3D&Expires=1739812423",
]
# import tinyhost

View File

@ -1,76 +0,0 @@
import argparse
import logging
import os
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from datasets import Dataset
from olmocr.train.dataloader import build_batch_query_response_vision_dataset
def save_dataset_in_parquet(dataset: Dataset, output_dir: str, rows_per_file: int = 10000, s3_endpoint_url: str = None):
    """
    Save `dataset` as one or more Parquet shards of at most `rows_per_file` rows.

    Args:
        dataset: HuggingFace Dataset to serialize.
        output_dir: Local directory or "s3://bucket/prefix" destination.
        rows_per_file: Maximum number of rows per Parquet shard.
        s3_endpoint_url: Optional custom S3 endpoint URL (for S3-compatible stores).

    Raises:
        NoCredentialsError, PartialCredentialsError: if an S3 upload cannot
            authenticate (logged, then re-raised).
    """
    # Bug fix: `logger` was previously a module global defined only inside the
    # `__main__` guard, so importing this module and calling this function
    # raised NameError. Bind a proper module logger locally instead.
    logger = logging.getLogger(__name__)
    logger.info("Saving dataset in Parquet files")

    # Check if the output is an S3 path.
    is_s3 = output_dir.startswith("s3://")
    if is_s3:
        s3_client = boto3.client("s3", endpoint_url=s3_endpoint_url) if s3_endpoint_url else boto3.client("s3")
    else:
        os.makedirs(output_dir, exist_ok=True)

    total_rows = len(dataset)
    for start_idx in range(0, total_rows, rows_per_file):
        end_idx = min(start_idx + rows_per_file, total_rows)
        file_name = f"dataset_{start_idx}_{end_idx}.parquet"
        if is_s3:
            # Write the shard to a local temp file first, then upload it to S3.
            bucket_name, key_prefix = parse_s3_path(output_dir)
            output_path = f"{key_prefix}/{file_name}"
            local_temp_file = f"/tmp/{file_name}"
            logger.info(f"Saving rows {start_idx} to {end_idx} locally at {local_temp_file}")
            dataset.select(range(start_idx, end_idx)).to_parquet(local_temp_file)
            try:
                logger.info(f"Uploading {local_temp_file} to s3://{bucket_name}/{output_path}")
                s3_client.upload_file(local_temp_file, bucket_name, output_path)
            except (NoCredentialsError, PartialCredentialsError) as e:
                logger.error(f"Failed to upload to S3: {e}")
                raise
            finally:
                # Remove the local shard whether or not the upload succeeded.
                os.remove(local_temp_file)
        else:
            # Saving locally.
            output_path = os.path.join(output_dir, file_name)
            logger.info(f"Saving rows {start_idx} to {end_idx} in {output_path}")
            dataset.select(range(start_idx, end_idx)).to_parquet(output_path)
def parse_s3_path(s3_path: str):
    """Split an "s3://bucket/key/prefix" path into (bucket, key_prefix)."""
    scheme = "s3://"
    if not s3_path.startswith(scheme):
        raise ValueError("S3 path must start with 's3://'")
    remainder = s3_path[len(scheme):]
    bucket_name, _, key_prefix = remainder.partition("/")
    return bucket_name, key_prefix
if __name__ == "__main__":
    # CLI entry point: build the vision dataset from query/response JSONL
    # globs, then shard it out as Parquet (locally or to S3).
    parser = argparse.ArgumentParser(description="Process and save dataset as Parquet files.")
    parser.add_argument(
        "--query_path",
        type=str,
        required=True,
        help="Path to the query dataset JSONL files.",
    )
    parser.add_argument(
        "--response_path",
        type=str,
        required=True,
        help="Path to the response dataset JSONL files.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Directory or S3 path to save the output Parquet files.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=32,
        help="Number of processes to use for data processing.",
    )
    parser.add_argument(
        "--s3_endpoint_url",
        type=str,
        default=None,
        help="Custom S3 endpoint URL, e.g., for S3-compatible storage.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    # Kept at module scope under this name: save_dataset_in_parquet reads it.
    logger = logging.getLogger(__name__)

    # Build the dataset.
    final_dataset = build_batch_query_response_vision_dataset(
        query_glob_path=args.query_path,
        response_glob_path=args.response_path,
        num_proc=args.num_proc,
    )

    # Save the dataset as Parquet files.
    save_dataset_in_parquet(final_dataset, args.output_dir, s3_endpoint_url=args.s3_endpoint_url)
    logger.info("Dataset processing and saving completed.")