diff --git a/pdelfin/train/dataprep.py b/pdelfin/train/dataprep.py
index b05ca61..58598cd 100644
--- a/pdelfin/train/dataprep.py
+++ b/pdelfin/train/dataprep.py
@@ -29,7 +29,7 @@ def prepare_data_for_qwen2_training(example, processor):
     # Right now, we are going to downsample to 1024 on the longest dimension, because
     # 2048 as we passed to OpenAI is too large for training
     width, height = main_image.size
-    assert 1800 <= max(width, height) <= 2200
+    assert 1800 <= max(width, height) <= 2200, f"Image size {width}x{height} invalid"
     main_image = main_image.resize((width // 2, height // 2), Image.LANCZOS)
 
diff --git a/sequence_lengths_histogram.png b/sequence_lengths_histogram.png
new file mode 100644
index 0000000..d98be1f
Binary files /dev/null and b/sequence_lengths_histogram.png differ
diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py
index fbf5045..71c1da1 100644
--- a/tests/test_dataloader.py
+++ b/tests/test_dataloader.py
@@ -1,4 +1,9 @@
 import unittest
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from functools import partial
+
+from transformers import AutoProcessor
 
 from pdelfin.train.dataloader import (
     build_batch_query_response_vision_dataset,
@@ -7,6 +12,8 @@ from pdelfin.train.dataloader import (
     load_jsonl_from_s3,
 )
 
+from pdelfin.train.dataprep import batch_prepare_data_for_qwen2_training
+
 
 class TestBatchQueryResponseDataset(unittest.TestCase):
     def testLoadS3(self):
@@ -24,6 +31,44 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
 
         print(ds)
 
+    def testPlotSequenceLengthHistogram(self):
+        import plotly.express as px
+
+        ds = build_batch_query_response_vision_dataset(
+            query_glob_path="s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl",
+            response_glob_path="s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json",
+        )
+        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+
+        formatted_dataset = ds.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor))
+        train_dataloader = DataLoader(formatted_dataset, batch_size=1, num_workers=50, shuffle=False)
+
+        max_seen_len = 0
+        steps = 0
+        sequence_lengths = []  # List to store sequence lengths
+        for entry in tqdm(train_dataloader):
+            num_input_tokens = entry["input_ids"].shape[1]
+            max_seen_len = max(max_seen_len, num_input_tokens)
+            sequence_lengths.append(num_input_tokens)  # Collecting sequence lengths
+
+            if steps % 100 == 0:
+                print(f"Max input len {max_seen_len}")
+
+            steps += 1
+
+            # model.forward(**{k: v.to("cuda:0") for (k,v) in entry.items()})
+        print(f"Max input len {max_seen_len}")
+
+        # Plotting the histogram using Plotly
+        fig = px.histogram(
+            sequence_lengths,
+            nbins=100,
+            title="Distribution of Input Sequence Lengths",
+            labels={'value': 'Sequence Length', 'count': 'Frequency'}
+        )
+
+        fig.write_image("sequence_lengths_histogram.png")
+
     def testExtractBatch(self):
         query_data = load_jsonl_from_s3("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
         query_data = query_data["train"]
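
Not part of the diff itself: a minimal sketch of how the sequence_lengths list gathered by testPlotSequenceLengthHistogram could be turned into a concrete training cutoff, assuming NumPy is available. suggest_max_length and the example values are hypothetical and do not exist in this repository.

# Hypothetical follow-up (not in the diff): pick a training max length from the
# sequence_lengths list gathered by testPlotSequenceLengthHistogram.
import numpy as np

def suggest_max_length(sequence_lengths, percentile=99.0, multiple_of=64):
    # Take the given percentile of observed lengths and round it up to a
    # hardware-friendly multiple so almost all samples fit without truncation.
    cutoff = float(np.percentile(sequence_lengths, percentile))
    return int(np.ceil(cutoff / multiple_of) * multiple_of)

if __name__ == "__main__":
    lengths = [900, 1100, 1250, 1400, 2300]  # stand-in values, not real measurements
    print(suggest_max_length(lengths))       # -> 2304 for these stand-in values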