mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 11:04:25 +00:00
Sampling some sequence lengths
This commit is contained in:
parent
07c0323c91
commit
9cbc128553
@ -29,7 +29,7 @@ def prepare_data_for_qwen2_training(example, processor):
|
||||
# Right now, we are going to downsample to 1024 on the longest dimension, because
|
||||
# 2048 as we passed to OpenAI is too large for training
|
||||
width, height = main_image.size
|
||||
assert 1800 <= max(width, height) <= 2200
|
||||
assert 1800 <= max(width, height) <= 2200, f"Image size {width}x{height} invalid"
|
||||
main_image = main_image.resize((width // 2, height // 2), Image.LANCZOS)
|
||||
|
||||
|
||||
|
||||
BIN
sequence_lengths_histogram.png
Normal file
BIN
sequence_lengths_histogram.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
@ -1,4 +1,9 @@
|
||||
import unittest
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from pdelfin.train.dataloader import (
|
||||
build_batch_query_response_vision_dataset,
|
||||
@ -7,6 +12,8 @@ from pdelfin.train.dataloader import (
|
||||
load_jsonl_from_s3,
|
||||
)
|
||||
|
||||
from pdelfin.train.dataprep import batch_prepare_data_for_qwen2_training
|
||||
|
||||
|
||||
class TestBatchQueryResponseDataset(unittest.TestCase):
|
||||
def testLoadS3(self):
|
||||
@ -24,6 +31,44 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
|
||||
|
||||
print(ds)
|
||||
|
||||
def testPlotSequenceLengthHistogram(self):
    """Plot a histogram of input sequence lengths over the prepared dataset.

    Builds the batch query/response vision dataset from the S3 globs below,
    applies ``batch_prepare_data_for_qwen2_training`` with the
    ``Qwen/Qwen2-VL-2B-Instruct`` processor as an on-the-fly transform, and
    iterates it one example per batch.  For each batch the size of
    ``input_ids`` along dimension 1 is recorded; the running maximum is
    printed every 100 steps and once more at the end.  The collected lengths
    are written as a 100-bin histogram to ``sequence_lengths_histogram.png``
    in the current working directory.
    """
    # Imported locally so the module does not require plotly just to be loaded.
    import plotly.express as px

    ds = build_batch_query_response_vision_dataset(
        query_glob_path="s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl",
        response_glob_path="s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json",
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

    prepare = partial(batch_prepare_data_for_qwen2_training, processor=processor)
    loader = DataLoader(
        ds.with_transform(prepare),
        batch_size=1,
        num_workers=50,
        shuffle=False,
    )

    longest = 0
    lengths = []
    for step, batch in enumerate(tqdm(loader)):
        # With batch_size=1, dimension 1 is presumably the token axis -- TODO confirm.
        token_count = batch["input_ids"].shape[1]
        lengths.append(token_count)
        if token_count > longest:
            longest = token_count

        # Periodic progress report of the longest sequence seen so far.
        if step % 100 == 0:
            print(f"Max input len {longest}")

    print(f"Max input len {longest}")

    fig = px.histogram(
        lengths,
        nbins=100,
        title="Distribution of Input Sequence Lengths",
        labels={"value": "Sequence Length", "count": "Frequency"},
    )
    fig.write_image("sequence_lengths_histogram.png")
|
||||
|
||||
def testExtractBatch(self):
|
||||
query_data = load_jsonl_from_s3("s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl", first_n_files=3)
|
||||
query_data = query_data["train"]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user