Mirror of https://github.com/rasbt/LLMs-from-scratch.git, synced 2025-08-30 03:20:51 +00:00

Commit c8cffefb6f (parent ccd7cebbb3): cleanup
@@ -164,7 +164,7 @@ class TransformerBlock(nn.Module):
         self.att = MultiHeadAttention(
             d_in=cfg["emb_dim"],
             d_out=cfg["emb_dim"],
-            context_length=cfg["ctx_len"],
+            context_length=cfg["context_length"],
             num_heads=cfg["n_heads"],
             dropout=cfg["drop_rate"],
             qkv_bias=cfg["qkv_bias"])
@@ -195,7 +195,7 @@ class GPTModel(nn.Module):
     def __init__(self, cfg):
         super().__init__()
         self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
-        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
+        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
         self.drop_emb = nn.Dropout(cfg["drop_rate"])
 
         self.trf_blocks = nn.Sequential(
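For reference, a hedged sketch of a configuration dict using the renamed key. The GPT_CONFIG_124M name appears later in this diff, but the values and the "n_layers" entry below are illustrative assumptions, not part of the commit:

# Illustrative sketch only: a config dict using the renamed "context_length" key.
# Values follow the common GPT-2 124M sizes; "n_layers" is an assumed entry.
GPT_CONFIG_124M = {
    "vocab_size": 50257,      # vocabulary size
    "context_length": 1024,   # maximum sequence length (formerly "ctx_len")
    "emb_dim": 768,           # embedding dimension
    "n_heads": 12,            # number of attention heads
    "n_layers": 12,           # number of transformer blocks (assumed key)
    "drop_rate": 0.1,         # dropout rate
    "qkv_bias": False         # query/key/value bias
}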
@@ -4,14 +4,14 @@ import torch.nn as nn
 
 class CausalAttention(nn.Module):
 
-    def __init__(self, d_in, d_out, block_size, dropout, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
         super().__init__()
         self.d_out = d_out
         self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.dropout = nn.Dropout(dropout)  # New
-        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))  # New
+        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))  # New
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape  # New batch dimension b
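A minimal usage sketch of the renamed constructor argument (hypothetical example; the tensor sizes are arbitrary and not taken from the commit):

import torch

# Hypothetical usage: CausalAttention now takes context_length instead of block_size.
torch.manual_seed(123)
batch = torch.rand(2, 6, 3)  # (batch_size, num_tokens, d_in)
ca = CausalAttention(d_in=3, d_out=2, context_length=6, dropout=0.0)
print(ca(batch).shape)  # expected: torch.Size([2, 6, 2])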
@@ -31,10 +31,10 @@ class CausalAttention(nn.Module):
 
 class MultiHeadAttentionWrapper(nn.Module):
 
-    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
         self.heads = nn.ModuleList(
-            [CausalAttention(d_in, d_out, block_size, dropout, qkv_bias)
+            [CausalAttention(d_in, d_out, context_length, dropout, qkv_bias)
              for _ in range(num_heads)]
         )
         self.out_proj = nn.Linear(d_out*num_heads, d_out*num_heads)
@@ -45,7 +45,7 @@ class MultiHeadAttentionWrapper(nn.Module):
 
 
 class MultiHeadAttention(nn.Module):
-    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
         assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
 
@@ -58,7 +58,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))
+        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
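Likewise, a hedged sketch of constructing MultiHeadAttention with the renamed argument (illustrative sizes; this assumes the forward pass slices the buffered mask down to the actual number of tokens, as in the surrounding code):

import torch

# Illustrative only: d_out must be divisible by num_heads (see the assert above).
torch.manual_seed(123)
batch = torch.rand(2, 6, 768)  # (batch_size, num_tokens, d_in)
mha = MultiHeadAttention(d_in=768, d_out=768, context_length=1024,
                         dropout=0.0, num_heads=12)
print(mha(batch).shape)  # expected: torch.Size([2, 6, 768])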
@@ -165,7 +165,7 @@
     "    def __init__(self, cfg):\n",
     "        super().__init__()\n",
     "        self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
-    "        self.pos_emb = nn.Embedding(cfg[\"ctx_len\"], cfg[\"emb_dim\"])\n",
+    "        self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
     "        self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
     "        \n",
     "        # Use a placeholder for TransformerBlock\n",
@@ -943,7 +943,7 @@
     "        self.att = MultiHeadAttention(\n",
     "            d_in=cfg[\"emb_dim\"],\n",
     "            d_out=cfg[\"emb_dim\"],\n",
-    "            context_length=cfg[\"ctx_len\"],\n",
+    "            context_length=cfg[\"context_length\"],\n",
     "            num_heads=cfg[\"n_heads\"], \n",
     "            dropout=cfg[\"drop_rate\"],\n",
     "            qkv_bias=cfg[\"qkv_bias\"])\n",
@@ -1065,7 +1065,7 @@
     "    def __init__(self, cfg):\n",
     "        super().__init__()\n",
     "        self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
-    "        self.pos_emb = nn.Embedding(cfg[\"ctx_len\"], cfg[\"emb_dim\"])\n",
+    "        self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
     "        self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
     "        \n",
     "        self.trf_blocks = nn.Sequential(\n",
@@ -1429,7 +1429,7 @@
     "    model=model,\n",
     "    idx=encoded_tensor, \n",
     "    max_new_tokens=6, \n",
-    "    context_size=GPT_CONFIG_124M[\"ctx_len\"]\n",
+    "    context_size=GPT_CONFIG_124M[\"context_length\"]\n",
     ")\n",
     "\n",
     "print(\"Output:\", out)\n",
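For context, the notebook cell above reads roughly as follows after the rename (a sketch; the generate_text_simple name, model, and encoded_tensor are assumed from earlier cells and are not shown in this diff):

# Sketch of the updated notebook cell; generate_text_simple, model, and
# encoded_tensor are assumed to be defined in earlier cells.
out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output:", out)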