olmocr/tests/test_dataloader.py
2024-09-18 21:42:09 +00:00

26 lines
1.1 KiB
Python

import unittest
from pdelfin.train.dataloader import load_jsonl_from_s3, build_batch_query_response_vision_dataset
from pdelfin.train.dataloader import extract_openai_batch_query
class TestBatchQueryResponseDataset(unittest.TestCase):
def testLoadS3(self):
ds = load_jsonl_from_s3("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
print(f"Loaded {len(ds)} entries")
print(ds)
print(ds["train"])
def testCombinedQueryResponse(self):
ds = build_batch_query_response_vision_dataset(query_glob_path="s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl",
response_glob_path="s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json")
print(ds)
def testExtractBatch(self):
query_data = load_jsonl_from_s3("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
query_data = query_data["train"]
query_data = query_data.map(extract_openai_batch_query, remove_columns=query_data.column_names)
print(query_data)
print(query_data[0]["custom_id"], query_data[0]["input_prompt_text"])