From d8e459c9f35a2d7d0b908bc97f7f956e5ecf62bf Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Mon, 7 Oct 2024 09:04:13 -0700
Subject: [PATCH] Weird issue with surrogate pairs in json

---
 pdelfin/train/dataloader.py | 10 +++++-----
 tests/test_dataloader.py    | 19 +++----------------
 2 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/pdelfin/train/dataloader.py b/pdelfin/train/dataloader.py
index 6d10923..c0caa3d 100644
--- a/pdelfin/train/dataloader.py
+++ b/pdelfin/train/dataloader.py
@@ -211,11 +211,11 @@ def build_batch_query_response_vision_dataset(query_glob_path: str, response_glo
     # Map the datasets down to the core fields that we're going to need to make them easier to process
     logger.info("Mapping query data")
     query_data = query_data["train"]
-    query_data = query_data.map(extract_openai_batch_query, remove_columns=query_data.column_names)
+    query_data = query_data.map(extract_openai_batch_query, remove_columns=query_data.column_names, num_proc=num_proc)
 
     logger.info("Mapping response data")
     response_data = response_data["train"]
-    response_data = response_data.map(extract_openai_batch_response, remove_columns=response_data.column_names)
+    response_data = response_data.map(extract_openai_batch_response, remove_columns=response_data.column_names, num_proc=num_proc)
 
     # What we're going to do, is build an in-memory map for the response data from custom_id to row
     # This will let us do quick lookups when we do a merge step, but it will not scale past a certain point
@@ -231,17 +231,17 @@ def build_batch_query_response_vision_dataset(query_glob_path: str, response_glo
     )
 
     # Don't include data where the model cut off due to a length issue, or moderation issue
-    final_dataset = final_dataset.filter(lambda x: x["finish_reason"] == "stop")
+    final_dataset = final_dataset.filter(lambda x: x["finish_reason"] == "stop", num_proc=num_proc)
 
     # Pick things that have a reasonable image size only
     def pick_image_sizes(x):
         width, height = get_png_dimensions_from_base64(x["input_prompt_image_base64"])
         return 1800 <= max(width, height) <= 2200
 
-    final_dataset = final_dataset.filter(pick_image_sizes)
+    final_dataset = final_dataset.filter(pick_image_sizes, num_proc=num_proc)
 
     # Limit the size of the input text not to explode the context size
-    final_dataset = final_dataset.filter(lambda x: len(x["raw_page_text"]) < 4000)
+    final_dataset = final_dataset.filter(lambda x: len(x["raw_page_text"]) < 4000, num_proc=num_proc)
 
     return final_dataset
 
diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py
index dd79a08..4c53ee5 100644
--- a/tests/test_dataloader.py
+++ b/tests/test_dataloader.py
@@ -9,7 +9,7 @@ from pdelfin.train.dataloader import (
     build_batch_query_response_vision_dataset,
     extract_openai_batch_query,
     extract_openai_batch_response,
-    load_jsonl_into_ds,
+    load_jsonl_into_ds
 )
 
 from pdelfin.train.dataprep import batch_prepare_data_for_qwen2_training, prepare_data_for_qwen2_training
@@ -25,8 +25,8 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
 
     def testCombinedQueryResponse(self):
         ds = build_batch_query_response_vision_dataset(
-            query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl",
-            response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json",
+            query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl",
+            response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_train/*.json",
         )
 
         print(ds)
@@ -37,19 +37,6 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
 
         print(ds[0])
 
-    def testLocalDS(self):
-        ds = build_batch_query_response_vision_dataset(
-            query_glob_path="/root/openai_batch_data_v5_1_train/*.jsonl",
-            response_glob_path="/root/openai_batch_data_v5_1_train_done/*.json",
-        )
-
-        print(ds)
-
-        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-        from pdelfin.train.dataprep import filter_by_max_seq_len
-        ds = ds.filter(partial(filter_by_max_seq_len, processor=processor, max_prompt_len=1000))
-
-        print(ds[0])
 
     def testPlotSequenceLengthHistogram(self):
         import plotly.express as px
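-- 
The num_proc additions above use the Hugging Face datasets built-in
multiprocessing for map/filter; worker processes push rows through
Arrow serialization, which requires valid UTF-8, and that is plausibly
where the "surrogate pairs in json" from the subject line surface.
A minimal sketch of that failure mode, not from the patch itself,
assuming stock CPython json behavior (the string literal below is
illustrative, not taken from pdelfin data):

    import json

    # JSON escapes astral characters (e.g. emoji) as a surrogate pair,
    # "\ud83d\ude00". If upstream tooling truncates text mid-character,
    # one unpaired half can survive. json.loads still accepts it...
    lone = json.loads('"\\ud83d"')

    try:
        lone.encode("utf-8")  # ...but a lone surrogate is not valid UTF-8
    except UnicodeEncodeError as err:
        print(err)  # "'utf-8' codec can't encode character '\ud83d' ..."

    # One common cleanup before handing text to Arrow-backed datasets:
    clean = lone.encode("utf-8", errors="replace").decode("utf-8")
    print(repr(clean))  # '?'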