import base64
import unittest
from io import BytesIO

from PIL import Image
from transformers import AutoProcessor

from pdelfin.train.dataloader import (
    build_batch_query_response_vision_dataset,
)
from pdelfin.train.dataprep import (
    prepare_data_for_qwen2_training,
)


class TestDataprep(unittest.TestCase):
    def testTokenizationMatches(self):
        """Compare the tokenization produced by prepare_data_for_qwen2_training
        against what the Qwen2-VL processor produces for the equivalent chat
        messages on the inference path."""
        ds = build_batch_query_response_vision_dataset(
            query_glob_path="s3://ai2-oe-data/jakep/openai_batch_data_v2_mini/*.jsonl",
            response_glob_path="s3://ai2-oe-data/jakep/openai_batch_done_v2_mini/*.json",
        )

        example = ds[0]

        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

        # Rebuild the full chat: user prompt (image plus text) followed by the
        # assistant response, mirroring what the training example encodes.
        full_messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": example["input_prompt_image_base64"],  # Placeholder
                    },
                    {"type": "text", "text": example["input_prompt_text"]},
                ],
            },
            {
                "role": "assistant",
                "content": example["response"],
            },
        ]

        text = processor.apply_chat_template(full_messages, tokenize=False, add_generation_prompt=True)

        # Decode image from base64
        main_image = Image.open(BytesIO(base64.b64decode(example["input_prompt_image_base64"])))

        # Process inputs using processor
        inference_inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="np",
        )

        print(inference_inputs)
        print(inference_inputs["input_ids"].shape)

        training_inputs = prepare_data_for_qwen2_training(example, processor=processor)

        print(training_inputs)
        print(training_inputs["input_ids"].shape)
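
# Standard unittest entry point, so this file can also be run directly
# (e.g. `python test_dataprep.py`) rather than only through a test runner.
if __name__ == "__main__":
    unittest.main()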