mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-27 08:04:19 +00:00
Filtering based on cpu count
This commit is contained in:
parent
a90eb94951
commit
ec09408ca9
@@ -59,7 +59,7 @@ def make_dataset(config: TrainConfig, processor: AutoProcessor) -> tuple[Dataset
|
|||||||
for source in config.train_data.sources
|
for source in config.train_data.sources
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
.filter(partial(filter_by_max_seq_len, processor=processor))
|
.filter(partial(filter_by_max_seq_len, processor=processor), num_proc=multiprocessing.cpu_count())
|
||||||
.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor))
|
.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user