Remove redundant dropout in MLP module (#105)

This commit is contained in:
Sebastian Raschka 2024-04-03 20:19:08 -05:00 committed by GitHub
parent edcae09884
commit 5beff4e25a
11 changed files with 202 additions and 266 deletions

File diff suppressed because one or more lines are too long

View File

@ -152,7 +152,6 @@ class FeedForward(nn.Module):
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(), GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
nn.Dropout(cfg["drop_rate"])
) )
def forward(self, x): def forward(self, x):

File diff suppressed because one or more lines are too long

View File

@ -253,7 +253,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 1,
"id": "5fee2cf5-61c3-4167-81b5-44ea155bbaf2", "id": "5fee2cf5-61c3-4167-81b5-44ea155bbaf2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -265,7 +265,6 @@
" \"n_heads\": 12,\n", " \"n_heads\": 12,\n",
" \"n_layers\": 12,\n", " \"n_layers\": 12,\n",
" \"drop_rate_emb\": 0.1, # NEW: dropout for embedding layers\n", " \"drop_rate_emb\": 0.1, # NEW: dropout for embedding layers\n",
" \"drop_rate_ffn\": 0.1, # NEW: dropout for feed forward module\n",
" \"drop_rate_attn\": 0.1, # NEW: dropout for multi-head attention \n", " \"drop_rate_attn\": 0.1, # NEW: dropout for multi-head attention \n",
" \"drop_rate_resid\": 0.1, # NEW: dropout for residual connections \n", " \"drop_rate_resid\": 0.1, # NEW: dropout for residual connections \n",
" \"qkv_bias\": False\n", " \"qkv_bias\": False\n",
@ -274,26 +273,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 2,
"id": "5aa1b0c1-d78a-48fc-ad08-4802458b43f7", "id": "5aa1b0c1-d78a-48fc-ad08-4802458b43f7",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import torch.nn as nn\n", "import torch.nn as nn\n",
"from gpt import MultiHeadAttention, LayerNorm, GELU\n", "from gpt import MultiHeadAttention, LayerNorm, GELU, FeedForward\n",
"\n",
"class FeedForward(nn.Module):\n",
" def __init__(self, cfg):\n",
" super().__init__()\n",
" self.layers = nn.Sequential(\n",
" nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n",
" GELU(),\n",
" nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n",
" nn.Dropout(cfg[\"drop_rate_ffn\"]) # NEW: dropout for feed forward module\n",
" )\n",
"\n",
" def forward(self, x):\n",
" return self.layers(x)\n",
"\n", "\n",
"\n", "\n",
"class TransformerBlock(nn.Module):\n", "class TransformerBlock(nn.Module):\n",
@ -356,7 +342,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 3,
"id": "1d013d32-c275-4f42-be21-9010f1537227", "id": "1d013d32-c275-4f42-be21-9010f1537227",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -384,7 +370,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.11.4"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -144,7 +144,6 @@ class FeedForward(nn.Module):
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(), GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
nn.Dropout(cfg["drop_rate"])
) )
def forward(self, x): def forward(self, x):

File diff suppressed because one or more lines are too long

View File

@ -222,7 +222,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 4,
"id": "a61a4034-797a-4635-bf42-ddfff1b07125", "id": "a61a4034-797a-4635-bf42-ddfff1b07125",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -253,7 +253,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 5,
"id": "ee95a272-b852-43b4-9827-ea7e1dbd5724", "id": "ee95a272-b852-43b4-9827-ea7e1dbd5724",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -264,7 +264,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 6,
"id": "4ab43658-3240-484a-9072-a40a0ed85be6", "id": "4ab43658-3240-484a-9072-a40a0ed85be6",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -273,11 +273,7 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Output text:\n", "Output text:\n",
" Every effort moves you?\"\n", " Every effort moves you know,\" was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed lun\n"
"\n",
"\"Yes--quite insensible to the irony. She wanted him vindicated--and by me!\"\n",
"\n",
"\n"
] ]
} }
], ],
@ -298,7 +294,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 7,
"id": "ebb22d06-393a-42d3-ab64-66646d33b39b", "id": "ebb22d06-393a-42d3-ab64-66646d33b39b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -307,11 +303,7 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Output text:\n", "Output text:\n",
" Every effort moves you?\"\n", " Every effort moves you know,\" was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed lun\n"
"\n",
"\"Yes--quite insensible to the irony. She wanted him vindicated--and by me!\"\n",
"\n",
"\n"
] ]
} }
], ],
@ -332,7 +324,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 8,
"id": "75469f24-47cc-458d-a200-fe64c648131d", "id": "75469f24-47cc-458d-a200-fe64c648131d",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -341,11 +333,7 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Output text:\n", "Output text:\n",
" Every effort moves you?\"\n", " Every effort moves you know,\" was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed lun\n"
"\n",
"\"Yes--quite insensible to the irony. She wanted him vindicated--and by me!\"\n",
"\n",
"\n"
] ]
} }
], ],
@ -412,7 +400,7 @@
"model = GPTModel(GPT_CONFIG_124M)\n", "model = GPTModel(GPT_CONFIG_124M)\n",
"model.load_state_dict(checkpoint[\"model_state_dict\"])\n", "model.load_state_dict(checkpoint[\"model_state_dict\"])\n",
"\n", "\n",
"optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)\n", "optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)\n",
"optimizer.load_state_dict(checkpoint[\"optimizer_state_dict\"])\n", "optimizer.load_state_dict(checkpoint[\"optimizer_state_dict\"])\n",
"model.to(device)\n", "model.to(device)\n",
"model.train();" "model.train();"
@ -497,9 +485,9 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Ep 1 (Step 000000): Train loss 0.523, Val loss 6.445\n", "Ep 1 (Step 000000): Train loss 0.271, Val loss 6.545\n",
"Ep 1 (Step 000005): Train loss 0.422, Val loss 6.541\n", "Ep 1 (Step 000005): Train loss 0.244, Val loss 6.614\n",
"Every effort moves you?\" \"Yes--quite insensible to the irony. She wanted him vindicated--and by me!\" \"Oh, and I remember getting off a prodigious phrase about the sketch of the donkey. \"There were days when I\n" "Every effort moves you?\" \"Yes--quite insensible to the irony. She wanted him vindicated--and by me!\" He laughed again, and threw back his head to look up at the sketch of the donkey. \"There were days when I\n"
] ]
} }
], ],
@ -558,7 +546,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 12,
"id": "68d162d6-bbb9-4d6d-82ee-1c410694f872", "id": "68d162d6-bbb9-4d6d-82ee-1c410694f872",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -586,7 +574,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 13,
"id": "d8373461-7dad-47da-a489-3e23f0799b23", "id": "d8373461-7dad-47da-a489-3e23f0799b23",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -612,7 +600,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 14,
"id": "cdd44873-d6c2-4471-a20f-f639b09fdcd3", "id": "cdd44873-d6c2-4471-a20f-f639b09fdcd3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -637,7 +625,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 15,
"id": "c7d562e4-33f6-4611-9b75-6ad1cb441d3b", "id": "c7d562e4-33f6-4611-9b75-6ad1cb441d3b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -652,7 +640,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 16,
"id": "46eda9ea-ccb0-46ee-931b-3c07502b2544", "id": "46eda9ea-ccb0-46ee-931b-3c07502b2544",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -705,7 +693,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 17,
"id": "4e3574a2-687d-47a2-a2f6-457fe9d595f1", "id": "4e3574a2-687d-47a2-a2f6-457fe9d595f1",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -713,8 +701,8 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Training loss: 3.754748503367106\n", "Training loss: 3.7547483444213867\n",
"Validation loss: 3.559617757797241\n" "Validation loss: 3.5596189498901367\n"
] ]
} }
], ],
@ -739,7 +727,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 18,
"id": "1a79a4b6-fe8f-40c2-a018-e731dcf391b3", "id": "1a79a4b6-fe8f-40c2-a018-e731dcf391b3",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -754,8 +742,8 @@
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n", "File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n", "File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",
"File already exists and is up-to-date: gpt2/1558M/vocab.bpe\n", "File already exists and is up-to-date: gpt2/1558M/vocab.bpe\n",
"Training loss: 3.3046312597062855\n", "Training loss: 3.3046313656700983\n",
"Validation loss: 3.119514226913452\n" "Validation loss: 3.1195149421691895\n"
] ]
} }
], ],
@ -812,7 +800,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 19,
"id": "31e0972b-e85e-4904-a0f5-24c3eacd5fa2", "id": "31e0972b-e85e-4904-a0f5-24c3eacd5fa2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -838,7 +826,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 20,
"id": "b641ee88-f9d4-43ec-a787-e34199eed356", "id": "b641ee88-f9d4-43ec-a787-e34199eed356",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -882,7 +870,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 21,
"id": "c98f56f4-98fc-43b4-9ee5-726e9d17c73f", "id": "c98f56f4-98fc-43b4-9ee5-726e9d17c73f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -893,7 +881,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 22,
"id": "b1f7853c-6e81-4f1f-a1d0-61e2c7d33a20", "id": "b1f7853c-6e81-4f1f-a1d0-61e2c7d33a20",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [

View File

@ -144,7 +144,6 @@ class FeedForward(nn.Module):
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(), GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
nn.Dropout(cfg["drop_rate"])
) )
def forward(self, x): def forward(self, x):

View File

@ -144,7 +144,6 @@ class FeedForward(nn.Module):
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(), GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
nn.Dropout(cfg["drop_rate"])
) )
def forward(self, x): def forward(self, x):

View File

@ -146,7 +146,6 @@ class FeedForward(nn.Module):
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(), GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
nn.Dropout(cfg["drop_rate"])
) )
def forward(self, x): def forward(self, x):

View File

@ -149,7 +149,6 @@ class FeedForward(nn.Module):
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(), GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
nn.Dropout(cfg["drop_rate"])
) )
def forward(self, x): def forward(self, x):