mirror of
https://github.com/allenai/olmocr.git
synced 2025-08-15 20:32:45 +00:00
Weird issue with surrogate pairs in JSON
This commit is contained in:
parent
98020cabbb
commit
d8e459c9f3
@ -211,11 +211,11 @@ def build_batch_query_response_vision_dataset(query_glob_path: str, response_glo
|
||||
# Map the datasets down to the core fields that we're going to need to make them easier to process
|
||||
logger.info("Mapping query data")
|
||||
query_data = query_data["train"]
|
||||
query_data = query_data.map(extract_openai_batch_query, remove_columns=query_data.column_names)
|
||||
query_data = query_data.map(extract_openai_batch_query, remove_columns=query_data.column_names, num_proc=num_proc)
|
||||
|
||||
logger.info("Mapping response data")
|
||||
response_data = response_data["train"]
|
||||
response_data = response_data.map(extract_openai_batch_response, remove_columns=response_data.column_names)
|
||||
response_data = response_data.map(extract_openai_batch_response, remove_columns=response_data.column_names, num_proc=num_proc)
|
||||
|
||||
# What we're going to do, is build an in-memory map for the response data from custom_id to row
|
||||
# This will let us do quick lookups when we do a merge step, but it will not scale past a certain point
|
||||
@ -231,17 +231,17 @@ def build_batch_query_response_vision_dataset(query_glob_path: str, response_glo
|
||||
)
|
||||
|
||||
# Don't include data where the model cut off due to a length issue, or moderation issue
|
||||
final_dataset = final_dataset.filter(lambda x: x["finish_reason"] == "stop")
|
||||
final_dataset = final_dataset.filter(lambda x: x["finish_reason"] == "stop", num_proc=num_proc)
|
||||
|
||||
# Pick things that have a reasonable image size only
|
||||
def pick_image_sizes(x):
|
||||
width, height = get_png_dimensions_from_base64(x["input_prompt_image_base64"])
|
||||
return 1800 <= max(width, height) <= 2200
|
||||
|
||||
final_dataset = final_dataset.filter(pick_image_sizes)
|
||||
final_dataset = final_dataset.filter(pick_image_sizes, num_proc=num_proc)
|
||||
|
||||
# Limit the size of the input text not to explode the context size
|
||||
final_dataset = final_dataset.filter(lambda x: len(x["raw_page_text"]) < 4000)
|
||||
final_dataset = final_dataset.filter(lambda x: len(x["raw_page_text"]) < 4000, num_proc=num_proc)
|
||||
|
||||
return final_dataset
|
||||
|
||||
|
@ -9,7 +9,7 @@ from pdelfin.train.dataloader import (
|
||||
build_batch_query_response_vision_dataset,
|
||||
extract_openai_batch_query,
|
||||
extract_openai_batch_response,
|
||||
load_jsonl_into_ds,
|
||||
load_jsonl_into_ds
|
||||
)
|
||||
|
||||
from pdelfin.train.dataprep import batch_prepare_data_for_qwen2_training, prepare_data_for_qwen2_training
|
||||
@ -25,8 +25,8 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
|
||||
|
||||
def testCombinedQueryResponse(self):
|
||||
ds = build_batch_query_response_vision_dataset(
|
||||
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl",
|
||||
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json",
|
||||
query_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_train/*.jsonl",
|
||||
response_glob_path="s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_train/*.json",
|
||||
)
|
||||
|
||||
print(ds)
|
||||
@ -37,19 +37,6 @@ class TestBatchQueryResponseDataset(unittest.TestCase):
|
||||
|
||||
print(ds[0])
|
||||
|
||||
def testLocalDS(self):
|
||||
ds = build_batch_query_response_vision_dataset(
|
||||
query_glob_path="/root/openai_batch_data_v5_1_train/*.jsonl",
|
||||
response_glob_path="/root/openai_batch_data_v5_1_train_done/*.json",
|
||||
)
|
||||
|
||||
print(ds)
|
||||
|
||||
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
|
||||
from pdelfin.train.dataprep import filter_by_max_seq_len
|
||||
ds = ds.filter(partial(filter_by_max_seq_len, processor=processor, max_prompt_len=1000))
|
||||
|
||||
print(ds[0])
|
||||
|
||||
def testPlotSequenceLengthHistogram(self):
|
||||
import plotly.express as px
|
||||
|
Loading…
x
Reference in New Issue
Block a user