explain extra padding token

rasbt 2024-07-17 07:38:19 -05:00
parent a2bb045984
commit 3195594680


@@ -618,6 +618,8 @@
 "    device=\"cpu\"\n",
 "):\n",
 "    # Find the longest sequence in the batch\n",
+"    # and increase the max length by +1, which will add one extra\n",
+"    # padding token below\n",
 "    batch_max_length = max(len(item)+1 for item in batch)\n",
 "\n",
 "    # Pad and prepare inputs\n",
@@ -627,13 +629,14 @@
 "        new_item = item.copy()\n",
 "        # Add an <|endoftext|> token\n",
 "        new_item += [pad_token_id]\n",
-"        # Pad sequences to max_length\n",
-"        # this always adds at least 1 additional padding tokens\n",
+"        # Pad sequences to batch_max_length\n",
 "        padded = (\n",
 "            new_item + [pad_token_id] * \n",
 "            (batch_max_length - len(new_item))\n",
 "        )\n",
-"        # We remove this extra padded token again here\n",
+"        # Via padded[:-1], we remove the extra padded token \n",
+"        # that has been added via the +1 setting in batch_max_length\n",
+"        # (the extra padding token will be relevant in later codes)\n",
 "        inputs = torch.tensor(padded[:-1])\n",
 "        inputs_lst.append(inputs)\n",
 "\n",