Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-12-15 17:20:44 +00:00)

Commit 5d1d8ce511 (parent 3a5fc79b38): add shape information for clarity
@@ -29,7 +29,7 @@
  "id": "53fe99ab-0bcf-4778-a6b5-6db81fb826ef",
  "metadata": {},
  "source": [
- "## 4.1 Coding the LLM backbone"
+ "## 4.1 Coding an LLM architecture"
  ]
  },
  {
@@ -479,7 +479,7 @@
  "- Note that we also add a smaller value (`eps`) before computing the square root of the variance; this is to avoid division-by-zero errors if the variance is 0\n",
  "\n",
  "**Biased variance**\n",
- "- In the variance calculation above, setting `unbiased=False` means using the formula $\\frac{\\sum (x_i - \\bar{x})^2}{n}$ to compute the variance where n is the sample size (here, the number of features or columns); this formula does not include Bessel's correction (which uses `n-1` in the denominator), thus providing a biased estimate of the variance \n",
+ "- In the variance calculation above, setting `unbiased=False` means using the formula $\\frac{\\sum_i (x_i - \\bar{x})^2}{n}$ to compute the variance where n is the sample size (here, the number of features or columns); this formula does not include Bessel's correction (which uses `n-1` in the denominator), thus providing a biased estimate of the variance \n",
  "- For LLMs, where the embedding dimension `n` is very large, the difference between using n and `n-1`\n",
  " is negligible\n",
  "- However, GPT-2 was trained with a biased variance in the normalization layers, which is why we also adopted this setting for compatibility reasons with the pretrained weights that we will load in later chapters\n",
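To make the `n` vs. `n-1` point above concrete, here is a small sketch (not part of the commit; the tensor sizes are arbitrary) comparing the biased and unbiased variance in PyTorch:

```python
import torch

torch.manual_seed(123)
x = torch.randn(2, 5)   # toy example: 2 rows, n = 5 features per row

var_biased = x.var(dim=-1, unbiased=False)   # divides by n
var_unbiased = x.var(dim=-1, unbiased=True)  # divides by n - 1 (Bessel's correction)

print(var_biased)
print(var_unbiased)
print(var_unbiased / var_biased)   # ratio is n / (n - 1) = 1.25 here

# With an LLM-sized feature dimension the two estimates are nearly identical:
x_wide = torch.randn(2, 768)
print(x_wide.var(dim=-1, unbiased=True) / x_wide.var(dim=-1, unbiased=False))  # ratio 768/767
```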
@@ -558,7 +558,7 @@
  "id": "7d482ce7-e493-4bfc-a820-3ea99f564ebc",
  "metadata": {},
  "source": [
- "- GELU ([Hendrycks and Gimpel 2016](https://arxiv.org/abs/1606.08415)) can be implemented in several ways; the exact version is defined as GELU(x)=x⋅Φ(x), where Φ(x) is the cumulative distribution function of the standard Gaussian distribution\n",
+ "- GELU ([Hendrycks and Gimpel 2016](https://arxiv.org/abs/1606.08415)) can be implemented in several ways; the exact version is defined as GELU(x)=x⋅Φ(x), where Φ(x) is the cumulative distribution function of the standard Gaussian distribution.\n",
  "- In practice, it's common to implement a computationally cheaper approximation: $\\text{GELU}(x) \\approx 0.5 \\cdot x \\cdot \\left(1 + \\tanh\\left[\\sqrt{\\frac{2}{\\pi}} \\cdot \\left(x + 0.044715 \\cdot x^3\\right)\\right]\\right)\n",
  "$ (the original GPT-2 model was also trained with this approximation)"
  ]
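For reference, the two GELU variants mentioned above can be written directly in PyTorch; this is not the notebook's own `GELU` class (which is outside this diff), just the exact erf-based form next to the tanh approximation GPT-2 was trained with:

```python
import torch

def gelu_exact(x):
    # Exact GELU: x * Phi(x), with Phi the standard Gaussian CDF,
    # expressed via the error function: Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
    return 0.5 * x * (1.0 + torch.erf(x / 2.0 ** 0.5))

def gelu_tanh(x):
    # Computationally cheaper tanh-based approximation
    return 0.5 * x * (1.0 + torch.tanh(
        (2.0 / torch.pi) ** 0.5 * (x + 0.044715 * x ** 3)
    ))

x = torch.linspace(-3.0, 3.0, 7)
print(gelu_exact(x))
print(gelu_tanh(x))
print(torch.max(torch.abs(gelu_exact(x) - gelu_tanh(x))))  # the two agree closely
```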
@@ -680,7 +680,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 16,
+ "execution_count": 30,
  "id": "928e7f7c-d0b1-499f-8d07-4cadb428a6f9",
  "metadata": {},
  "outputs": [
@@ -688,14 +688,15 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "torch.Size([2, 768])\n"
+ "torch.Size([2, 3, 768])\n"
  ]
  }
  ],
  "source": [
  "ffn = FeedForward(GPT_CONFIG_124M)\n",
  "\n",
- "x = torch.rand(2, 768) # input with batch dimension 2\n",
+ "# input shape: [batch_size, num_token, emb_size]\n",
+ "x = torch.rand(2, 3, 768) \n",
  "out = ffn(x)\n",
  "print(out.shape)"
  ]
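The `FeedForward` definition itself is not part of this hunk. Purely for orientation, a minimal stand-in with the usual 4x expansion reproduces the shape behavior shown in the output above; the `cfg["emb_dim"]` key and the use of `nn.GELU(approximate="tanh")` are assumptions for this sketch, not taken from the commit:

```python
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),  # expand
            nn.GELU(approximate="tanh"),                    # tanh-approximated GELU
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),  # project back
        )

    def forward(self, x):
        # Only the last dimension (emb_dim) is transformed, so any leading
        # dimensions such as [batch_size, num_tokens] pass through unchanged
        return self.layers(x)

ffn = FeedForward({"emb_dim": 768})
x = torch.rand(2, 3, 768)   # [batch_size, num_tokens, emb_dim]
print(ffn(x).shape)         # torch.Size([2, 3, 768])
```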
@@ -832,7 +833,7 @@
  " # Shortcut connection for attention block\n",
  " shortcut = x\n",
  " x = self.norm1(x)\n",
- " x = self.att(x)\n",
+ " x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n",
  " x = self.drop_resid(x)\n",
  " x = x + shortcut # Add the original input back\n",
  "\n",
@@ -957,7 +958,7 @@
  " batch_size, seq_len = in_idx.shape\n",
  " tok_embeds = self.tok_emb(in_idx)\n",
  " pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
- " x = tok_embeds + pos_embeds\n",
+ " x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n",
  " x = self.trf_blocks(x)\n",
  " x = self.final_norm(x)\n",
  " logits = self.out_head(x)\n",
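One shape detail worth spelling out for the line annotated above: `pos_embeds` comes out of the positional embedding with shape `[num_tokens, emb_size]` and is broadcast over the batch dimension when added to `tok_embeds`. A tiny standalone check, with toy sizes assumed only for illustration:

```python
import torch
import torch.nn as nn

vocab_size, context_length, emb_dim = 100, 16, 8   # toy sizes, not GPT-2's

tok_emb = nn.Embedding(vocab_size, emb_dim)
pos_emb = nn.Embedding(context_length, emb_dim)

in_idx = torch.randint(0, vocab_size, (2, 4))            # [batch_size, num_tokens]
tok_embeds = tok_emb(in_idx)                             # [2, 4, 8]
pos_embeds = pos_emb(torch.arange(in_idx.shape[1]))      # [4, 8]

x = tok_embeds + pos_embeds   # [4, 8] broadcasts against [2, 4, 8]
print(tok_embeds.shape, pos_embeds.shape, x.shape)       # last is torch.Size([2, 4, 8])
```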
@@ -1002,7 +1003,7 @@
  ],
  "source": [
  "torch.manual_seed(123)\n",
- "model = GPTModel(GPT_CONFIG_124M)\n",
+ "model = GPT(GPT_CONFIG_124M)\n",
  "\n",
  "out = model(batch)\n",
  "print(\"Output shape:\", out.shape)\n",
Binary image file not shown (size before: 18 KiB, after: 24 KiB).
@@ -168,7 +168,7 @@ class TransformerBlock(nn.Module):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
-        x = self.att(x)
+        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
         x = self.drop_resid(x)
         x = x + shortcut  # Add the original input back

@@ -200,7 +200,7 @@ class GPTModel(nn.Module):
         batch_size, seq_len = in_idx.shape
         tok_embeds = self.tok_emb(in_idx)
         pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
-        x = tok_embeds + pos_embeds
+        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
         x = self.trf_blocks(x)
         x = self.final_norm(x)
         logits = self.out_head(x)