From 089dfb756ae3defc70f785b65f4dda4e79427e18 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Mon, 3 Jun 2024 07:17:56 -0500
Subject: [PATCH] restore file

---
 .../01_main-chapter-code/previous_chapters.py | 113 ++++++++++++------
 1 file changed, 78 insertions(+), 35 deletions(-)

diff --git a/appendix-D/01_main-chapter-code/previous_chapters.py b/appendix-D/01_main-chapter-code/previous_chapters.py
index e5f6822..46030ef 100644
--- a/appendix-D/01_main-chapter-code/previous_chapters.py
+++ b/appendix-D/01_main-chapter-code/previous_chapters.py
@@ -11,12 +11,13 @@ import tiktoken
 import torch
 import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader
+import matplotlib.pyplot as plt
+
 
 
 #####################################
 # Chapter 2
 #####################################
-
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
         self.input_ids = []
@@ -57,10 +58,11 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,
 #####################################
 # Chapter 3
 #####################################
+
 class MultiHeadAttention(nn.Module):
     def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
-        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
+        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"
 
         self.d_out = d_out
         self.num_heads = num_heads
@@ -107,7 +109,7 @@ class MultiHeadAttention(nn.Module):
         context_vec = (attn_weights @ values).transpose(1, 2)
 
         # Combine heads, where self.d_out = self.num_heads * self.head_dim
-        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
+        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
         context_vec = self.out_proj(context_vec)  # optional projection
 
         return context_vec
@@ -116,6 +118,7 @@ class MultiHeadAttention(nn.Module):
 #####################################
 # Chapter 4
 #####################################
+
 class LayerNorm(nn.Module):
     def __init__(self, emb_dim):
         super().__init__()
@@ -238,42 +241,82 @@ def generate_text_simple(model, idx, max_new_tokens, context_size):
     return idx
 
 
-if __name__ == "__main__":
+#####################################
+# Chapter 5
+####################################
 
-    GPT_CONFIG_124M = {
-        "vocab_size": 50257,     # Vocabulary size
-        "context_length": 1024,  # Context length
-        "emb_dim": 768,          # Embedding dimension
-        "n_heads": 12,           # Number of attention heads
-        "n_layers": 12,          # Number of layers
-        "drop_rate": 0.1,        # Dropout rate
-        "qkv_bias": False        # Query-Key-Value bias
-    }
-    torch.manual_seed(123)
-    model = GPTModel(GPT_CONFIG_124M)
-    model.eval()  # disable dropout
+def calc_loss_batch(input_batch, target_batch, model, device):
+    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
+    logits = model(input_batch)
+    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
+    return loss
 
-    start_context = "Hello, I am"
-    tokenizer = tiktoken.get_encoding("gpt2")
-    encoded = tokenizer.encode(start_context)
-    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
+def calc_loss_loader(data_loader, model, device, num_batches=None):
+    total_loss = 0.
+    if len(data_loader) == 0:
+        return float("nan")
+    elif num_batches is None:
+        num_batches = len(data_loader)
+    else:
+        num_batches = min(num_batches, len(data_loader))
+    for i, (input_batch, target_batch) in enumerate(data_loader):
+        if i < num_batches:
+            loss = calc_loss_batch(input_batch, target_batch, model, device)
+            total_loss += loss.item()
+        else:
+            break
+    return total_loss / num_batches
 
-    print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
-    print("\nInput text:", start_context)
-    print("Encoded input text:", encoded)
-    print("encoded_tensor.shape:", encoded_tensor.shape)
-    out = generate_text_simple(
-        model=model,
-        idx=encoded_tensor,
-        max_new_tokens=10,
-        context_size=GPT_CONFIG_124M["context_length"]
-    )
-    decoded_text = tokenizer.decode(out.squeeze(0).tolist())
+def evaluate_model(model, train_loader, val_loader, device, eval_iter):
+    model.eval()
+    with torch.no_grad():
+        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
+        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
+    model.train()
+    return train_loss, val_loss
 
-    print(f"\n\n{50*'='}\n{22*' '}OUT\n{50*'='}")
-    print("\nOutput:", out)
-    print("Output length:", len(out[0]))
-    print("Output text:", decoded_text)
+
+def generate_and_print_sample(model, tokenizer, device, start_context):
+    model.eval()
+    context_size = model.pos_emb.weight.shape[0]
+    encoded = text_to_token_ids(start_context, tokenizer).to(device)
+    with torch.no_grad():
+        token_ids = generate_text_simple(
+            model=model, idx=encoded,
+            max_new_tokens=50, context_size=context_size)
+        decoded_text = token_ids_to_text(token_ids, tokenizer)
+        print(decoded_text.replace("\n", " "))  # Compact print format
+    model.train()
+
+
+def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
+    fig, ax1 = plt.subplots(figsize=(5, 3))
+
+    # Plot training and validation loss against epochs
+    ax1.plot(epochs_seen, train_losses, label="Training loss")
+    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
+    ax1.set_xlabel("Epochs")
+    ax1.set_ylabel("Loss")
+    ax1.legend(loc="upper right")
+
+    # Create a second x-axis for tokens seen
+    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
+    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
+    ax2.set_xlabel("Tokens seen")
+
+    fig.tight_layout()  # Adjust layout to make room
+    # plt.show()
+
+
+def text_to_token_ids(text, tokenizer):
+    encoded = tokenizer.encode(text)
+    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
+    return encoded_tensor
+
+
+def token_ids_to_text(token_ids, tokenizer):
+    flat = token_ids.squeeze(0)  # remove batch dimension
+    return tokenizer.decode(flat.tolist())
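
Usage sketch (editor's note, not part of the patch): a minimal, self-contained example of how the
restored Chapter 5 helpers in previous_chapters.py can be exercised. The random token batch and the
32-token sequence length are illustrative placeholders chosen here; the GPT_CONFIG_124M dictionary
is the one removed from the old __main__ block, and "Hello, I am" is the start context it used.

import tiktoken
import torch

from previous_chapters import GPTModel, calc_loss_batch, generate_and_print_sample

GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

torch.manual_seed(123)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPTModel(GPT_CONFIG_124M).to(device)

# Cross-entropy loss on a random batch of token IDs (untrained model, so the value is near ln(50257))
inputs = torch.randint(0, GPT_CONFIG_124M["vocab_size"], (2, 32))
targets = torch.randint(0, GPT_CONFIG_124M["vocab_size"], (2, 32))
print("Loss on a random batch:", calc_loss_batch(inputs, targets, model, device).item())

# Greedy sampling to sanity-check the generation utilities
tokenizer = tiktoken.get_encoding("gpt2")
generate_and_print_sample(model, tokenizer, device, start_context="Hello, I am")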