Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-12 07:17:41 +00:00
Better default value for mp chunksize (#923)
* Better default value for mp chunksize

* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent cafa1230da
commit 1244d16010
@@ -341,7 +341,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim)
 
 #### train
 
 ```python
- | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
+ | train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, optimizer_name: str = "TransformersAdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder")
 ```
 
 train a DensePassageRetrieval model
@@ -352,6 +352,8 @@ train a DensePassageRetrieval model
 - `train_filename`: training filename
 - `dev_filename`: development set filename, file to be used by the model in the eval step of training
 - `test_filename`: test set filename, file to be used by the model in the test step after training
+- `max_processes`: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo.
+  It can be set to 1 to disable the use of multiprocessing or to make debugging easier.
 - `dev_split`: the proportion of the train set that will be sliced off. Only works if dev_filename is set to None
 - `batch_size`: total number of samples in 1 batch of data
 - `embed_title`: whether to concatenate the passage title with each passage. The default setting in official DPR embeds the passage title with the corresponding passage
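For orientation, here is a minimal sketch of how the `max_processes` parameter documented above could be passed through. The model names, paths, and the `InMemoryDocumentStore` setup are illustrative assumptions based on haystack 0.x tutorials, not part of this commit:

```python
# Hypothetical usage sketch; import paths follow haystack 0.x and may differ in other versions.
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.retriever.dense import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=InMemoryDocumentStore(),
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
)

# max_processes caps the multiprocessing.Pool spawned by DataSilo during preprocessing.
# Set it to 1 to disable multiprocessing entirely, e.g. when debugging.
retriever.train(
    data_dir="data/dpr_training",   # placeholder path
    train_filename="train.json",    # placeholder filename
    dev_filename="dev.json",
    max_processes=4,
    save_dir="../saved_models/dpr",
)
```

Since the effective pool size is also bounded by the machine's CPU count, the default of 128 effectively means "no extra limit" on typical hardware, while smaller values keep process count and memory bounded on shared machines.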
@@ -309,7 +309,7 @@ class FARMReader(BaseReader):
 
         self.inferencer.batch_size = batch_size
         # make predictions on all document-query pairs
         predictions = self.inferencer.inference_from_objects(
-            objects=inputs, return_json=False, multiprocessing_chunksize=1
+            objects=inputs, return_json=False, multiprocessing_chunksize=10
         )
 
         # group predictions together
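Why raising the chunksize from 1 to 10 is a better default: every chunk handed to a `multiprocessing` worker incurs pickling and IPC overhead, so one-item chunks can leave workers starved when the per-item work is small. A self-contained sketch (plain stdlib, not haystack code) that makes the trade-off observable:

```python
import multiprocessing
import time

def work(x):
    # stand-in for cheap per-sample preprocessing/inference work
    return x * x

if __name__ == "__main__":
    items = list(range(200_000))
    with multiprocessing.Pool(processes=4) as pool:
        for chunksize in (1, 10, 100):
            start = time.perf_counter()
            # chunksize = number of items shipped to a worker per dispatch;
            # larger chunks amortize the pickling/IPC round-trip cost
            pool.map(work, items, chunksize=chunksize)
            print(f"chunksize={chunksize}: {time.perf_counter() - start:.3f}s")
```

Very large chunks have their own downside: the end of a run can stall waiting on the last few big chunks, so a modest default like 10 aims between dispatch overhead and load balance.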