Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-12-24 13:42:08 +00:00)
Rename drop_resid to drop_shortcut (#136)
This commit is contained in:
parent d1edfcb63f
commit a5b353667d
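This commit renames the dropout layer that is applied right before each shortcut (residual) addition from drop_resid to drop_shortcut; the behavior is unchanged, only the attribute name and the matching config keys move to the "shortcut" wording. The sketch below is a minimal, self-contained illustration of the renamed block, not the repository's code: it swaps in plain PyTorch stand-ins (a non-causal nn.MultiheadAttention, a small nn.Sequential feed-forward, nn.LayerNorm) for the book's MultiHeadAttention, FeedForward, and LayerNorm classes.

import torch
import torch.nn as nn

class TransformerBlockSketch(nn.Module):
    # Illustrative stand-in for the book's TransformerBlock after the rename
    def __init__(self, cfg):
        super().__init__()
        emb = cfg["emb_dim"]
        # Simplified (non-causal) attention in place of the book's MultiHeadAttention
        self.att = nn.MultiheadAttention(emb, cfg["n_heads"], batch_first=True)
        # Small GELU feed-forward in place of the book's FeedForward module
        self.ff = nn.Sequential(nn.Linear(emb, 4 * emb), nn.GELU(), nn.Linear(4 * emb, emb))
        self.norm1 = nn.LayerNorm(emb)
        self.norm2 = nn.LayerNorm(emb)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])  # formerly self.drop_resid

    def forward(self, x):
        # Shortcut connection for the attention block
        shortcut = x
        x = self.norm1(x)
        x, _ = self.att(x, x, x, need_weights=False)
        x = self.drop_shortcut(x)  # dropout applied just before the shortcut addition
        x = x + shortcut

        # Shortcut connection for the feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x

block = TransformerBlockSketch({"emb_dim": 768, "n_heads": 12, "drop_rate": 0.1})
print(block(torch.rand(2, 4, 768)).shape)  # torch.Size([2, 4, 768])

The diff below applies the same rename across the chapter scripts and notebooks.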
@@ -170,21 +170,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x

@@ -950,21 +950,21 @@
 "        self.ff = FeedForward(cfg)\n",
 "        self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
 "        self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
-"        self.drop_resid = nn.Dropout(cfg[\"drop_rate\"])\n",
+"        self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n",
 "\n",
 "    def forward(self, x):\n",
 "        # Shortcut connection for attention block\n",
 "        shortcut = x\n",
 "        x = self.norm1(x)\n",
 "        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]\n",
-"        x = self.drop_resid(x)\n",
+"        x = self.drop_shortcut(x)\n",
 "        x = x + shortcut  # Add the original input back\n",
 "\n",
 "        # Shortcut connection for feed forward block\n",
 "        shortcut = x\n",
 "        x = self.norm2(x)\n",
 "        x = self.ff(x)\n",
-"        x = self.drop_resid(x)\n",
+"        x = self.drop_shortcut(x)\n",
 "        x = x + shortcut  # Add the original input back\n",
 "\n",
 "        return x"

@@ -1489,7 +1489,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.6"
 }
 },
 "nbformat": 4,

@@ -34,7 +34,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from gpt import Transfocontext_lengthmerBlock\n",
+"from gpt import TransformerBlock\n",
 "\n",
 "GPT_CONFIG_124M = {\n",
 "    \"vocab_size\": 50257,\n",

@@ -264,9 +264,9 @@
 "    \"emb_dim\": 768,\n",
 "    \"n_heads\": 12,\n",
 "    \"n_layers\": 12,\n",
-"    \"drop_rate_emb\": 0.1,       # NEW: dropout for embedding layers\n",
-"    \"drop_rate_attn\": 0.1,      # NEW: dropout for multi-head attention \n",
-"    \"drop_rate_resid\": 0.1,     # NEW: dropout for residual connections \n",
+"    \"drop_rate_emb\": 0.1,       # NEW: dropout for embedding layers\n",
+"    \"drop_rate_attn\": 0.1,      # NEW: dropout for multi-head attention \n",
+"    \"drop_rate_shortcut\": 0.1,  # NEW: dropout for shortcut connections \n",
 "    \"qkv_bias\": False\n",
 "}"
 ]

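In the exercise notebook above, the config dictionary keeps a separate dropout rate per location, and only the shortcut-connection key is renamed (drop_rate_resid to drop_rate_shortcut). A small sketch of how the renamed key would be read, using just the values shown in the diff; everything else here is illustrative:

import torch.nn as nn

cfg = {
    "drop_rate_emb": 0.1,       # dropout for embedding layers
    "drop_rate_attn": 0.1,      # dropout for multi-head attention
    "drop_rate_shortcut": 0.1,  # dropout for shortcut connections (was "drop_rate_resid")
}

# The block's shortcut dropout now reads the renamed key:
drop_shortcut = nn.Dropout(cfg["drop_rate_shortcut"])
print(drop_shortcut)  # Dropout(p=0.1, inplace=False)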
@@ -295,21 +295,21 @@
 "        self.ff = FeedForward(cfg)\n",
 "        self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
 "        self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
-"        self.drop_resid = nn.Dropout(cfg[\"drop_rate_resid\"])\n",
+"        self.drop_shortcut = nn.Dropout(cfg[\"drop_rate_shortcut\"])\n",
 "\n",
 "    def forward(self, x):\n",
 "        # Shortcut connection for attention block\n",
 "        shortcut = x\n",
 "        x = self.norm1(x)\n",
 "        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]\n",
-"        x = self.drop_resid(x)\n",
+"        x = self.drop_shortcut(x)\n",
 "        x = x + shortcut  # Add the original input back\n",
 "\n",
 "        # Shortcut connection for feed-forward block\n",
 "        shortcut = x\n",
 "        x = self.norm2(x)\n",
 "        x = self.ff(x)\n",
-"        x = self.drop_resid(x)\n",
+"        x = self.drop_shortcut(x)\n",
 "        x = x + shortcut  # Add the original input back\n",
 "\n",
 "        return x\n",

@@ -370,7 +370,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.6"
+"version": "3.11.4"
 }
 },
 "nbformat": 4,

@@ -162,21 +162,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x

@@ -519,7 +519,7 @@
 "train_losses, val_losses, tokens_seen = train_model_simple(\n",
 "    model, train_loader, val_loader, optimizer, device,\n",
 "    num_epochs=num_epochs, eval_freq=5, eval_iter=5,\n",
-"    start_context=\"Every effort moves you\",\n",
+"    start_context=\"Every effort moves you\", tokenizer=tokenizer\n",
 ")"
 ]
 },

@@ -605,7 +605,7 @@
"text": [
"File already exists and is up-to-date: gpt2/124M/checkpoint\n",
"File already exists and is up-to-date: gpt2/124M/encoder.json\n",
"File already exists and is up-to-date: gpt2/124M/settings.json\n",
"File already exists and is up-to-date: gpt2/124M/hparams.json\n",
"File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001\n",
"File already exists and is up-to-date: gpt2/124M/model.ckpt.index\n",
"File already exists and is up-to-date: gpt2/124M/model.ckpt.meta\n",

@@ -760,7 +760,7 @@
"text": [
"File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
"File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
"File already exists and is up-to-date: gpt2/1558M/settings.json\n",
"File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",

@@ -859,7 +859,7 @@
"text": [
"File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
"File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
"File already exists and is up-to-date: gpt2/1558M/settings.json\n",
"File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",

@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x

@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x

@@ -164,21 +164,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x

@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x

@@ -169,21 +169,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x