Parallelize dataset filtering across all CPU cores (num_proc=multiprocessing.cpu_count())

This commit is contained in:
Jake Poznanski 2024-10-07 15:40:29 -07:00
parent a90eb94951
commit ec09408ca9

View File

@@ -59,7 +59,7 @@ def make_dataset(config: TrainConfig, processor: AutoProcessor) -> tuple[Dataset
for source in config.train_data.sources
]
)
.filter(partial(filter_by_max_seq_len, processor=processor))
.filter(partial(filter_by_max_seq_len, processor=processor), num_proc=multiprocessing.cpu_count())
.with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor))
)