From 24d0c4d42d5b8d17317371b80414e7e9ddf50df8 Mon Sep 17 00:00:00 2001 From: Branden Chan <33759007+brandenchan@users.noreply.github.com> Date: Wed, 17 Mar 2021 18:33:59 +0100 Subject: [PATCH] Fix DPR training batch size (#898) * Adjust batch size * Add latest docstring and tutorial changes * Update training results * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/tutorials/tutorials/9.md | 29 ++++++++++++++++++-------- tutorials/Tutorial9_DPR_training.ipynb | 29 ++++++++++++++++++-------- tutorials/Tutorial9_DPR_training.py | 4 ++-- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/docs/_src/tutorials/tutorials/9.md b/docs/_src/tutorials/tutorials/9.md index c8e09594a..efa44a1f6 100644 --- a/docs/_src/tutorials/tutorials/9.md +++ b/docs/_src/tutorials/tutorials/9.md @@ -174,8 +174,8 @@ retriever = DensePassageRetriever( Let's start training and save our trained model! -On a V100 GPU, you can fit up to batch size 4 so we set gradient accumulation steps to 4 in order -to simulate the batch size 16 of the original DPR experiment. +On a V100 GPU, you can fit up to batch size 16 so we set gradient accumulation steps to 8 in order +to simulate the batch size 128 of the original DPR experiment. When `embed_title=True`, the document title is prepended to the input text sequence with a `[SEP]` token between it and document text. @@ -183,11 +183,22 @@ between it and document text. When training from scratch with the above variables, 1 epoch takes around an hour and we reached the following performance: ``` -loss: 0.09334952129693501 -acc: 0.984035000191887 -f1: 0.936147352264006 -acc_and_f1: 0.9600911762279465 -average_rank: 0.07075978511128166 +loss: 0.046580662854042276 +task_name: text_similarity +acc: 0.992524064068483 +f1: 0.8804297774366846 +acc_and_f1: 0.9364769207525838 +average_rank: 0.19631619339984652 +report: + precision recall f1-score support + +hard_negative 0.9961 0.9961 0.9961 201887 + positive 0.8804 0.8804 0.8804 6515 + + accuracy 0.9925 208402 + macro avg 0.9383 0.9383 0.9383 208402 + weighted avg 0.9925 0.9925 0.9925 208402 + ``` @@ -200,8 +211,8 @@ retriever.train( dev_filename=dev_filename, test_filename=dev_filename, n_epochs=1, - batch_size=4, - grad_acc_steps=4, + batch_size=16, + grad_acc_steps=8, save_dir=save_dir, evaluate_every=3000, embed_title=True, diff --git a/tutorials/Tutorial9_DPR_training.ipynb b/tutorials/Tutorial9_DPR_training.ipynb index 2b630dc39..85bbd0c2f 100644 --- a/tutorials/Tutorial9_DPR_training.ipynb +++ b/tutorials/Tutorial9_DPR_training.ipynb @@ -294,8 +294,8 @@ "\n", "Let's start training and save our trained model!\n", "\n", - "On a V100 GPU, you can fit up to batch size 4 so we set gradient accumulation steps to 4 in order\n", - "to simulate the batch size 16 of the original DPR experiment.\n", + "On a V100 GPU, you can fit up to batch size 16 so we set gradient accumulation steps to 8 in order\n", + "to simulate the batch size 128 of the original DPR experiment.\n", "\n", "When `embed_title=True`, the document title is prepended to the input text sequence with a `[SEP]` token\n", "between it and document text." 
@@ -313,11 +313,22 @@ "When training from scratch with the above variables, 1 epoch takes around an hour and we reached the following performance:\n", "\n", "```\n", - "loss: 0.09334952129693501\n", - "acc: 0.984035000191887\n", - "f1: 0.936147352264006\n", - "acc_and_f1: 0.9600911762279465\n", - "average_rank: 0.07075978511128166\n", + "loss: 0.046580662854042276\n", + "task_name: text_similarity\n", + "acc: 0.992524064068483\n", + "f1: 0.8804297774366846\n", + "acc_and_f1: 0.9364769207525838\n", + "average_rank: 0.19631619339984652\n", + "report:\n", + " precision recall f1-score support\n", + "\n", + "hard_negative 0.9961 0.9961 0.9961 201887\n", + " positive 0.8804 0.8804 0.8804 6515\n", + "\n", + " accuracy 0.9925 208402\n", + " macro avg 0.9383 0.9383 0.9383 208402\n", + " weighted avg 0.9925 0.9925 0.9925 208402\n", + "\n", "```" ], "metadata": { @@ -340,8 +351,8 @@ " dev_filename=dev_filename,\n", " test_filename=dev_filename,\n", " n_epochs=1,\n", - " batch_size=4,\n", - " grad_acc_steps=4,\n", + " batch_size=16,\n", + " grad_acc_steps=8,\n", " save_dir=save_dir,\n", " evaluate_every=3000,\n", " embed_title=True,\n", diff --git a/tutorials/Tutorial9_DPR_training.py b/tutorials/Tutorial9_DPR_training.py index 6190c6364..dcc2bdecc 100644 --- a/tutorials/Tutorial9_DPR_training.py +++ b/tutorials/Tutorial9_DPR_training.py @@ -65,8 +65,8 @@ def tutorial9_dpr_training(): dev_filename=dev_filename, test_filename=dev_filename, n_epochs=1, - batch_size=4, - grad_acc_steps=4, + batch_size=16, + grad_acc_steps=8, save_dir=save_dir, evaluate_every=3000, embed_title=True,
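
Note on the mechanism behind this change: gradient accumulation lets the per-step batch of 16 emulate the effective batch of 128 used in the original DPR setup (16 x 8 = 128). The sketch below is a minimal, generic PyTorch illustration of that idea, not Haystack's trainer or the code touched by this patch; the toy model, data, and learning rate are placeholders chosen only to make the example self-contained.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

batch_size = 16          # largest per-step batch that fits on a single V100 in this tutorial
grad_acc_steps = 8       # accumulate gradients over 8 forward/backward passes
effective_batch = batch_size * grad_acc_steps  # 16 * 8 = 128, matching the original DPR experiment

# Toy data and model standing in for the DPR bi-encoder (placeholders, not Haystack code).
data = TensorDataset(torch.randn(512, 32), torch.randn(512, 32))
loader = DataLoader(data, batch_size=batch_size)
model = torch.nn.Linear(32, 32)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

optimizer.zero_grad()
for step, (inputs, targets) in enumerate(loader):
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    # Scale the loss so the gradients summed over 8 mini-batches
    # approximate the gradient of a single 128-example batch.
    (loss / grad_acc_steps).backward()
    if (step + 1) % grad_acc_steps == 0:
        optimizer.step()       # one optimizer update per 8 accumulated mini-batches
        optimizer.zero_grad()
```

The key detail is dividing the loss by `grad_acc_steps` before `backward()` and stepping the optimizer only every eighth mini-batch, which is why `batch_size=16` with `grad_acc_steps=8` reproduces the effective batch size of 128.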