explain extra padding token

rasbt 2024-07-17 07:38:19 -05:00
parent a2bb045984
commit 3195594680


@@ -618,6 +618,8 @@
 "    device=\"cpu\"\n",
 "):\n",
 "    # Find the longest sequence in the batch\n",
+"    # and increase the max length by +1, which will add one extra\n",
+"    # padding token below\n",
 "    batch_max_length = max(len(item)+1 for item in batch)\n",
 "\n",
 "    # Pad and prepare inputs\n",
@@ -627,13 +629,14 @@
 "        new_item = item.copy()\n",
 "        # Add an <|endoftext|> token\n",
 "        new_item += [pad_token_id]\n",
-"        # Pad sequences to max_length\n",
-"        # this always adds at least 1 additional padding tokens\n",
+"        # Pad sequences to batch_max_length\n",
 "        padded = (\n",
 "            new_item + [pad_token_id] * \n",
 "            (batch_max_length - len(new_item))\n",
 "        )\n",
-"        # We remove this extra padded token again here\n",
+"        # Via padded[:-1], we remove the extra padded token \n",
+"        # that has been added via the +1 setting in batch_max_length\n",
+"        # (the extra padding token will be relevant in later codes)\n",
 "        inputs = torch.tensor(padded[:-1])\n",
 "        inputs_lst.append(inputs)\n",
 "\n",