From ec09408ca9cddcc6534ea0a31952730d0b253441 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 7 Oct 2024 15:40:29 -0700 Subject: [PATCH] Filtering based on cpu count --- pdelfin/train/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdelfin/train/utils.py b/pdelfin/train/utils.py index ad56591..c97be31 100644 --- a/pdelfin/train/utils.py +++ b/pdelfin/train/utils.py @@ -59,7 +59,7 @@ def make_dataset(config: TrainConfig, processor: AutoProcessor) -> tuple[Dataset for source in config.train_data.sources ] ) - .filter(partial(filter_by_max_seq_len, processor=processor)) + .filter(partial(filter_by_max_seq_len, processor=processor), num_proc=multiprocessing.cpu_count()) .with_transform(partial(batch_prepare_data_for_qwen2_training, processor=processor)) )