From 24d0c4d42d5b8d17317371b80414e7e9ddf50df8 Mon Sep 17 00:00:00 2001 From: Branden Chan <33759007+brandenchan@users.noreply.github.com> Date: Wed, 17 Mar 2021 18:33:59 +0100 Subject: [PATCH] Fix DPR training batch size (#898) * Adjust batch size * Add latest docstring and tutorial changes * Update training results * Add latest docstring and tutorial changes Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/tutorials/tutorials/9.md | 29 ++++++++++++++++++-------- tutorials/Tutorial9_DPR_training.ipynb | 29 ++++++++++++++++++-------- tutorials/Tutorial9_DPR_training.py | 4 ++-- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/docs/_src/tutorials/tutorials/9.md b/docs/_src/tutorials/tutorials/9.md index c8e09594a..efa44a1f6 100644 --- a/docs/_src/tutorials/tutorials/9.md +++ b/docs/_src/tutorials/tutorials/9.md @@ -174,8 +174,8 @@ retriever = DensePassageRetriever( Let's start training and save our trained model! -On a V100 GPU, you can fit up to batch size 4 so we set gradient accumulation steps to 4 in order -to simulate the batch size 16 of the original DPR experiment. +On a V100 GPU, you can fit up to batch size 16 so we set gradient accumulation steps to 8 in order +to simulate the batch size 128 of the original DPR experiment. When `embed_title=True`, the document title is prepended to the input text sequence with a `[SEP]` token between it and document text. @@ -183,11 +183,22 @@ between it and document text. When training from scratch with the above variables, 1 epoch takes around an hour and we reached the following performance: ``` -loss: 0.09334952129693501 -acc: 0.984035000191887 -f1: 0.936147352264006 -acc_and_f1: 0.9600911762279465 -average_rank: 0.07075978511128166 +loss: 0.046580662854042276 +task_name: text_similarity +acc: 0.992524064068483 +f1: 0.8804297774366846 +acc_and_f1: 0.9364769207525838 +average_rank: 0.19631619339984652 +report: + precision recall f1-score support + +hard_negative 0.9961 0.9961 0.9961 201887 + positive 0.8804 0.8804 0.8804 6515 + + accuracy 0.9925 208402 + macro avg 0.9383 0.9383 0.9383 208402 + weighted avg 0.9925 0.9925 0.9925 208402 + ``` @@ -200,8 +211,8 @@ retriever.train( dev_filename=dev_filename, test_filename=dev_filename, n_epochs=1, - batch_size=4, - grad_acc_steps=4, + batch_size=16, + grad_acc_steps=8, save_dir=save_dir, evaluate_every=3000, embed_title=True, diff --git a/tutorials/Tutorial9_DPR_training.ipynb b/tutorials/Tutorial9_DPR_training.ipynb index 2b630dc39..85bbd0c2f 100644 --- a/tutorials/Tutorial9_DPR_training.ipynb +++ b/tutorials/Tutorial9_DPR_training.ipynb @@ -294,8 +294,8 @@ "\n", "Let's start training and save our trained model!\n", "\n", - "On a V100 GPU, you can fit up to batch size 4 so we set gradient accumulation steps to 4 in order\n", - "to simulate the batch size 16 of the original DPR experiment.\n", + "On a V100 GPU, you can fit up to batch size 16 so we set gradient accumulation steps to 8 in order\n", + "to simulate the batch size 128 of the original DPR experiment.\n", "\n", "When `embed_title=True`, the document title is prepended to the input text sequence with a `[SEP]` token\n", "between it and document text." 
@@ -313,11 +313,22 @@ "When training from scratch with the above variables, 1 epoch takes around an hour and we reached the following performance:\n", "\n", "```\n", - "loss: 0.09334952129693501\n", - "acc: 0.984035000191887\n", - "f1: 0.936147352264006\n", - "acc_and_f1: 0.9600911762279465\n", - "average_rank: 0.07075978511128166\n", + "loss: 0.046580662854042276\n", + "task_name: text_similarity\n", + "acc: 0.992524064068483\n", + "f1: 0.8804297774366846\n", + "acc_and_f1: 0.9364769207525838\n", + "average_rank: 0.19631619339984652\n", + "report:\n", + " precision recall f1-score support\n", + "\n", + "hard_negative 0.9961 0.9961 0.9961 201887\n", + " positive 0.8804 0.8804 0.8804 6515\n", + "\n", + " accuracy 0.9925 208402\n", + " macro avg 0.9383 0.9383 0.9383 208402\n", + " weighted avg 0.9925 0.9925 0.9925 208402\n", + "\n", "```" ], "metadata": { @@ -340,8 +351,8 @@ " dev_filename=dev_filename,\n", " test_filename=dev_filename,\n", " n_epochs=1,\n", - " batch_size=4,\n", - " grad_acc_steps=4,\n", + " batch_size=16,\n", + " grad_acc_steps=8,\n", " save_dir=save_dir,\n", " evaluate_every=3000,\n", " embed_title=True,\n", diff --git a/tutorials/Tutorial9_DPR_training.py b/tutorials/Tutorial9_DPR_training.py index 6190c6364..dcc2bdecc 100644 --- a/tutorials/Tutorial9_DPR_training.py +++ b/tutorials/Tutorial9_DPR_training.py @@ -65,8 +65,8 @@ def tutorial9_dpr_training(): dev_filename=dev_filename, test_filename=dev_filename, n_epochs=1, - batch_size=4, - grad_acc_steps=4, + batch_size=16, + grad_acc_steps=8, save_dir=save_dir, evaluate_every=3000, embed_title=True,
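
Note on the mechanism behind this change: gradient accumulation lets the per-step batch of 16 emulate the effective batch of 128 used in the original DPR setup (16 x 8 = 128). The sketch below is a minimal, generic PyTorch illustration of that idea, not Haystack's trainer or the code touched by this patch; the toy model, data, and learning rate are placeholders chosen only to make the example self-contained.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

batch_size = 16          # largest per-step batch that fits on a single V100 in this tutorial
grad_acc_steps = 8       # accumulate gradients over 8 forward/backward passes
effective_batch = batch_size * grad_acc_steps  # 16 * 8 = 128, matching the original DPR experiment

# Toy data and model standing in for the DPR bi-encoder (placeholders, not Haystack code).
data = TensorDataset(torch.randn(512, 32), torch.randn(512, 32))
loader = DataLoader(data, batch_size=batch_size)
model = torch.nn.Linear(32, 32)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

optimizer.zero_grad()
for step, (inputs, targets) in enumerate(loader):
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    # Scale the loss so the gradients summed over 8 mini-batches
    # approximate the gradient of a single 128-example batch.
    (loss / grad_acc_steps).backward()
    if (step + 1) % grad_acc_steps == 0:
        optimizer.step()       # one optimizer update per 8 accumulated mini-batches
        optimizer.zero_grad()
```

The key detail is dividing the loss by `grad_acc_steps` before `backward()` and stepping the optimizer only every eighth mini-batch, which is why `batch_size=16` with `grad_acc_steps=8` reproduces the effective batch size of 128.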