From 47dc069afe7fb42b09e428e683ba01d35c76a073 Mon Sep 17 00:00:00 2001
From: Moshe Berchansky
Date: Fri, 19 Mar 2021 19:11:25 +0200
Subject: [PATCH] Fix for memory allocation exception by specifying max_processes (#910)

---
 haystack/retriever/dense.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/haystack/retriever/dense.py b/haystack/retriever/dense.py
index 728b7f974..5f86e05b5 100644
--- a/haystack/retriever/dense.py
+++ b/haystack/retriever/dense.py
@@ -270,6 +270,7 @@ class DensePassageRetriever(BaseRetriever):
               train_filename: str,
               dev_filename: str = None,
               test_filename: str = None,
+              max_processes: int = 128,
               dev_split: float = 0,
               batch_size: int = 2,
               embed_title: bool = True,
@@ -295,6 +296,8 @@ class DensePassageRetriever(BaseRetriever):
         :param train_filename: training filename
         :param dev_filename: development set filename, file to be used by model in eval step of training
         :param test_filename: test set filename, file to be used by model in test step after training
+        :param max_processes: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
+                              It can be set to 1 to disable the use of multiprocessing or make debugging easier.
         :param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
         :param batch_size: total number of samples in 1 batch of data
         :param embed_title: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage
@@ -333,7 +336,7 @@ class DensePassageRetriever(BaseRetriever):
 
         self.model.connect_heads_with_processor(self.processor.tasks, require_labels=True)
 
-        data_silo = DataSilo(processor=self.processor, batch_size=batch_size, distributed=False)
+        data_silo = DataSilo(processor=self.processor, batch_size=batch_size, distributed=False, max_processes=max_processes)
 
         # 5. Create an optimizer
         self.model, optimizer, lr_schedule = initialize_optimizer(
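
Below is a minimal usage sketch (not part of the patch) showing how the new max_processes argument can be passed to DensePassageRetriever.train to cap the DataSilo preprocessing pool. The document store setup, data_dir, and file names are placeholder assumptions for illustration; module paths follow the pre-1.0 layout seen in this diff and may differ in other versions.

    from haystack.document_store.memory import InMemoryDocumentStore
    from haystack.retriever.dense import DensePassageRetriever

    retriever = DensePassageRetriever(
        document_store=InMemoryDocumentStore(),
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    )

    retriever.train(
        data_dir="data/dpr_training",   # hypothetical directory
        train_filename="train.json",    # hypothetical DPR-format files
        dev_filename="dev.json",
        test_filename="test.json",
        max_processes=1,  # cap the multiprocessing.Pool in DataSilo; 1 disables multiprocessing
        batch_size=2,
        embed_title=True,
    )

Setting max_processes=1 avoids the memory allocation exception this patch addresses on hosts where spawning the default pool of up to 128 worker processes exhausts memory, at the cost of slower data preprocessing.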