# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch02 import create_dataloader_v1
from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import train_model_simple

import os
import urllib.request

import pytest
import tiktoken
import torch
from torch.utils.data import Subset, DataLoader


@pytest.mark.parametrize("file_name", ["the-verdict.txt"])
def test_train_simple(tmp_path, file_name):

    GPT_CONFIG_124M = {
        "vocab_size": 50257,    # Vocabulary size
        "context_length": 256,  # Shortened context length (orig: 1024)
        "emb_dim": 768,         # Embedding dimension
        "n_heads": 12,          # Number of attention heads
        "n_layers": 12,         # Number of layers
        "drop_rate": 0.1,       # Dropout rate
        "qkv_bias": False       # Query-key-value bias
    }

    OTHER_SETTINGS = {
        "learning_rate": 5e-4,
        "num_epochs": 2,
        "batch_size": 1,
        "weight_decay": 0.1
    }

    torch.manual_seed(123)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ##############################
    # Download data if necessary
    ##############################

    file_path = tmp_path / file_name
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    ##############################
    # Initialize model
    ##############################

    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)  # no assignment model = model.to(device) necessary for nn.Module classes
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=OTHER_SETTINGS["learning_rate"],
        weight_decay=OTHER_SETTINGS["weight_decay"]
    )

    ##############################
    # Set up dataloaders
    ##############################

    # Train/validation ratio
    train_ratio = 0.90
    split_idx = int(train_ratio * len(text_data))

    train_loader = create_dataloader_v1(
        text_data[:split_idx],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=True,
        shuffle=True,
        num_workers=0
    )

    val_loader = create_dataloader_v1(
        text_data[split_idx:],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=False,
        shuffle=False,
        num_workers=0
    )

    ##############################
    # Train model
    ##############################

    tokenizer = tiktoken.get_encoding("gpt2")

    # Restrict train and validation loaders to a single batch each to keep the test fast
    train_subset = Subset(train_loader.dataset, range(1))
    one_batch_train_loader = DataLoader(train_subset, batch_size=1)
    val_subset = Subset(val_loader.dataset, range(1))
    one_batch_val_loader = DataLoader(val_subset, batch_size=1)

    train_losses, val_losses, tokens_seen = train_model_simple(
        model, one_batch_train_loader, one_batch_val_loader, optimizer, device,
        num_epochs=OTHER_SETTINGS["num_epochs"], eval_freq=1, eval_iter=1,
        start_context="Every effort moves you", tokenizer=tokenizer
    )

    # Initial losses should match the expected values, and training should reduce the loss
    assert round(train_losses[0], 1) == 7.6
    assert round(val_losses[0], 1) == 10.1
    assert train_losses[-1] < train_losses[0]
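
# Usage note (not part of the original test file): assuming the llms_from_scratch
# package and pytest are installed, this test can typically be run on its own via
# pytest's keyword filter, e.g.:
#
#     pytest -k test_train_simple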