From d51099a9e7bafc2ccf6af54cfe443f0619a7ea80 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Mon, 3 Jun 2024 07:12:47 -0500
Subject: [PATCH] add number of workers to data loader

---
 .../01_main-chapter-code/previous_chapters.py | 113 ++++++------------
 1 file changed, 35 insertions(+), 78 deletions(-)

diff --git a/appendix-D/01_main-chapter-code/previous_chapters.py b/appendix-D/01_main-chapter-code/previous_chapters.py
index 46030ef..e5f6822 100644
--- a/appendix-D/01_main-chapter-code/previous_chapters.py
+++ b/appendix-D/01_main-chapter-code/previous_chapters.py
@@ -11,13 +11,12 @@ import tiktoken
 import torch
 import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader
-import matplotlib.pyplot as plt
-
 
 #####################################
 # Chapter 2
 #####################################
 
+
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
         self.input_ids = []
@@ -58,11 +57,10 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,
 #####################################
 # Chapter 3
 #####################################
-
 class MultiHeadAttention(nn.Module):
     def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
-        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"
+        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
 
         self.d_out = d_out
         self.num_heads = num_heads
@@ -109,7 +107,7 @@ class MultiHeadAttention(nn.Module):
         context_vec = (attn_weights @ values).transpose(1, 2)
 
         # Combine heads, where self.d_out = self.num_heads * self.head_dim
-        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
+        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
         context_vec = self.out_proj(context_vec)  # optional projection
 
         return context_vec
@@ -118,7 +116,6 @@ class MultiHeadAttention(nn.Module):
 #####################################
 # Chapter 4
 #####################################
-
 class LayerNorm(nn.Module):
     def __init__(self, emb_dim):
         super().__init__()
@@ -241,82 +238,42 @@ def generate_text_simple(model, idx, max_new_tokens, context_size):
     return idx
 
 
-#####################################
-# Chapter 5
-####################################
+if __name__ == "__main__":
+    GPT_CONFIG_124M = {
+        "vocab_size": 50257,     # Vocabulary size
+        "context_length": 1024,  # Context length
+        "emb_dim": 768,          # Embedding dimension
+        "n_heads": 12,           # Number of attention heads
+        "n_layers": 12,          # Number of layers
+        "drop_rate": 0.1,        # Dropout rate
+        "qkv_bias": False        # Query-Key-Value bias
+    }
 
 
-def calc_loss_batch(input_batch, target_batch, model, device):
-    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
-    logits = model(input_batch)
-    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
-    return loss
+    torch.manual_seed(123)
+    model = GPTModel(GPT_CONFIG_124M)
+    model.eval()  # disable dropout
 
+    start_context = "Hello, I am"
 
-def calc_loss_loader(data_loader, model, device, num_batches=None):
-    total_loss = 0.
-    if len(data_loader) == 0:
-        return float("nan")
-    elif num_batches is None:
-        num_batches = len(data_loader)
-    else:
-        num_batches = min(num_batches, len(data_loader))
-    for i, (input_batch, target_batch) in enumerate(data_loader):
-        if i < num_batches:
-            loss = calc_loss_batch(input_batch, target_batch, model, device)
-            total_loss += loss.item()
-        else:
-            break
-    return total_loss / num_batches
+    tokenizer = tiktoken.get_encoding("gpt2")
+    encoded = tokenizer.encode(start_context)
+    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
 
+    print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
+    print("\nInput text:", start_context)
+    print("Encoded input text:", encoded)
+    print("encoded_tensor.shape:", encoded_tensor.shape)
 
-def evaluate_model(model, train_loader, val_loader, device, eval_iter):
-    model.eval()
-    with torch.no_grad():
-        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
-        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
-    model.train()
-    return train_loss, val_loss
+    out = generate_text_simple(
+        model=model,
+        idx=encoded_tensor,
+        max_new_tokens=10,
+        context_size=GPT_CONFIG_124M["context_length"]
+    )
+    decoded_text = tokenizer.decode(out.squeeze(0).tolist())
 
-
-def generate_and_print_sample(model, tokenizer, device, start_context):
-    model.eval()
-    context_size = model.pos_emb.weight.shape[0]
-    encoded = text_to_token_ids(start_context, tokenizer).to(device)
-    with torch.no_grad():
-        token_ids = generate_text_simple(
-            model=model, idx=encoded,
-            max_new_tokens=50, context_size=context_size)
-    decoded_text = token_ids_to_text(token_ids, tokenizer)
-    print(decoded_text.replace("\n", " "))  # Compact print format
-    model.train()
-
-
-def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
-    fig, ax1 = plt.subplots(figsize=(5, 3))
-
-    # Plot training and validation loss against epochs
-    ax1.plot(epochs_seen, train_losses, label="Training loss")
-    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
-    ax1.set_xlabel("Epochs")
-    ax1.set_ylabel("Loss")
-    ax1.legend(loc="upper right")
-
-    # Create a second x-axis for tokens seen
-    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
-    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
-    ax2.set_xlabel("Tokens seen")
-
-    fig.tight_layout()  # Adjust layout to make room
-    # plt.show()
-
-
-def text_to_token_ids(text, tokenizer):
-    encoded = tokenizer.encode(text)
-    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
-    return encoded_tensor
-
-
-def token_ids_to_text(token_ids, tokenizer):
-    flat = token_ids.squeeze(0)  # remove batch dimension
-    return tokenizer.decode(flat.tolist())
+    print(f"\n\n{50*'='}\n{22*' '}OUT\n{50*'='}")
+    print("\nOutput:", out)
+    print("Output length:", len(out[0]))
+    print("Output text:", decoded_text)
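
Note: the hunks above do not themselves show the data-loader change named in the subject line. As a rough sketch of what "add number of workers to data loader" typically looks like in this codebase, a num_workers argument is threaded through create_dataloader_v1 into torch.utils.data.DataLoader along the lines below. The parameters beyond those visible in the hunk context (stride, shuffle, drop_last, the num_workers=0 default) and the import path for GPTDatasetV1 are illustrative assumptions, not taken from this patch.

    import tiktoken
    from torch.utils.data import DataLoader

    from previous_chapters import GPTDatasetV1  # assumed import; the class is defined in the patched file


    def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128,
                             shuffle=True, drop_last=True, num_workers=0):
        tokenizer = tiktoken.get_encoding("gpt2")  # GPT-2 BPE tokenizer
        dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)  # sliding-window dataset
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,      # drop the last incomplete batch
            num_workers=num_workers,  # number of background worker processes for loading batches
        )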