import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    return dataloader


class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec
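

# --- Usage sketch (not part of the original module) ---
# A minimal demo, assuming you just want to see the two components wired together:
# it builds a dataloader from a short placeholder string, embeds one batch of token
# IDs, and runs the embeddings through MultiHeadAttention. The sample text, the
# embedding layer, and all hyperparameter values below are illustrative assumptions,
# not values prescribed by this module.
if __name__ == "__main__":
    torch.manual_seed(123)

    # Placeholder text (assumption): any text long enough to yield a few windows works
    sample_text = "Every effort moves you forward. " * 8

    max_length = 4
    dataloader = create_dataloader_v1(
        sample_text, batch_size=2, max_length=max_length,
        stride=max_length, shuffle=False)

    vocab_size = 50257  # vocabulary size of tiktoken's "gpt2" encoding
    embed_dim = 32      # small embedding dimension chosen only for this demo (assumption)
    token_embedding = nn.Embedding(vocab_size, embed_dim)

    mha = MultiHeadAttention(
        d_in=embed_dim, d_out=embed_dim, block_size=max_length,
        dropout=0.0, num_heads=4)

    inputs, targets = next(iter(dataloader))  # inputs, targets: (batch_size, max_length)
    x = token_embedding(inputs)               # (batch_size, max_length, embed_dim)
    context = mha(x)                          # (batch_size, max_length, d_out)
    print("inputs:", inputs.shape, "context:", context.shape)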