diff --git a/appendix-D/01_main-chapter-code/previous_chapters.py b/appendix-D/01_main-chapter-code/previous_chapters.py
index 01f4f7e..c7966ce 100644
--- a/appendix-D/01_main-chapter-code/previous_chapters.py
+++ b/appendix-D/01_main-chapter-code/previous_chapters.py
@@ -164,7 +164,7 @@ class TransformerBlock(nn.Module):
         self.att = MultiHeadAttention(
             d_in=cfg["emb_dim"],
             d_out=cfg["emb_dim"],
-            context_length=cfg["ctx_len"],
+            context_length=cfg["context_length"],
             num_heads=cfg["n_heads"],
             dropout=cfg["drop_rate"],
             qkv_bias=cfg["qkv_bias"])
@@ -195,7 +195,7 @@ class GPTModel(nn.Module):
     def __init__(self, cfg):
         super().__init__()
         self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
-        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
+        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
         self.drop_emb = nn.Dropout(cfg["drop_rate"])
 
         self.trf_blocks = nn.Sequential(
diff --git a/ch03/02_bonus_efficient-multihead-attention/ch03.py b/ch03/02_bonus_efficient-multihead-attention/ch03.py
index 1797fe3..0ee3ca9 100644
--- a/ch03/02_bonus_efficient-multihead-attention/ch03.py
+++ b/ch03/02_bonus_efficient-multihead-attention/ch03.py
@@ -4,14 +4,14 @@ import torch.nn as nn
 
 class CausalAttention(nn.Module):
 
-    def __init__(self, d_in, d_out, block_size, dropout, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
         super().__init__()
         self.d_out = d_out
         self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.dropout = nn.Dropout(dropout)  # New
-        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))  # New
+        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))  # New
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape  # New batch dimension b
@@ -31,10 +31,10 @@ class CausalAttention(nn.Module):
 
 class MultiHeadAttentionWrapper(nn.Module):
 
-    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
         self.heads = nn.ModuleList(
-            [CausalAttention(d_in, d_out, block_size, dropout, qkv_bias)
+            [CausalAttention(d_in, d_out, context_length, dropout, qkv_bias)
              for _ in range(num_heads)]
         )
         self.out_proj = nn.Linear(d_out*num_heads, d_out*num_heads)
@@ -45,7 +45,7 @@
 
 class MultiHeadAttention(nn.Module):
 
-    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
         assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
 
@@ -58,7 +58,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))
+        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb
index edd592d..1790bd6 100644
--- a/ch04/01_main-chapter-code/ch04.ipynb
+++ b/ch04/01_main-chapter-code/ch04.ipynb
@@ -165,7 +165,7 @@
     "    def __init__(self, cfg):\n",
     "        super().__init__()\n",
     "        self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
-    "        self.pos_emb = nn.Embedding(cfg[\"ctx_len\"], cfg[\"emb_dim\"])\n",
+    "        self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
     "        self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
     "        \n",
     "        # Use a placeholder for TransformerBlock\n",
@@ -943,7 +943,7 @@
     "        self.att = MultiHeadAttention(\n",
     "            d_in=cfg[\"emb_dim\"],\n",
     "            d_out=cfg[\"emb_dim\"],\n",
-    "            context_length=cfg[\"ctx_len\"],\n",
+    "            context_length=cfg[\"context_length\"],\n",
     "            num_heads=cfg[\"n_heads\"], \n",
     "            dropout=cfg[\"drop_rate\"],\n",
     "            qkv_bias=cfg[\"qkv_bias\"])\n",
@@ -1065,7 +1065,7 @@
     "    def __init__(self, cfg):\n",
     "        super().__init__()\n",
     "        self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
-    "        self.pos_emb = nn.Embedding(cfg[\"ctx_len\"], cfg[\"emb_dim\"])\n",
+    "        self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
     "        self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
     "        \n",
     "        self.trf_blocks = nn.Sequential(\n",
@@ -1429,7 +1429,7 @@
     "    model=model,\n",
     "    idx=encoded_tensor, \n",
     "    max_new_tokens=6, \n",
-    "    context_size=GPT_CONFIG_124M[\"ctx_len\"]\n",
+    "    context_size=GPT_CONFIG_124M[\"context_length\"]\n",
     ")\n",
     "\n",
     "print(\"Output:\", out)\n",