# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch02 import create_dataloader_v1
from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.appendix_d import train_model

import os
import urllib.request

import tiktoken
import torch
from torch.utils.data import Subset, DataLoader

def test_train(tmp_path):

    GPT_CONFIG_124M = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 256,   # Shortened context length (orig: 1024)
        "emb_dim": 768,          # Embedding dimension
        "n_heads": 12,           # Number of attention heads
        "n_layers": 12,          # Number of layers
        "drop_rate": 0.1,        # Dropout rate
        "qkv_bias": False        # Query-key-value bias
    }

    OTHER_SETTINGS = {
        "learning_rate": 5e-4,
        "num_epochs": 2,
        "batch_size": 1,
        "weight_decay": 0.1
    }
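
    # Note: this config matches the GPT-2 "small" (124M-parameter) architecture;
    # shrinking context_length from 1024 to 256 only trims the positional-embedding
    # table (roughly (1024 - 256) * 768 ≈ 0.6M fewer parameters), so the model stays
    # close to the 124M size while keeping the test cheaper to run.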

    torch.manual_seed(123)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ##############################
    # Download data if necessary
    ##############################

    file_path = tmp_path / "the-verdict.txt"
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    ##############################
    # Initialize model
    ##############################

    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)  # no assignment model = model.to(device) necessary for nn.Module classes

    ##############################
    # Set up dataloaders
    ##############################

    # Train/validation ratio
    train_ratio = 0.90
    split_idx = int(train_ratio * len(text_data))

    train_loader = create_dataloader_v1(
        text_data[:split_idx],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=True,
        shuffle=True,
        num_workers=0
    )

    val_loader = create_dataloader_v1(
        text_data[split_idx:],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=False,
        shuffle=False,
        num_workers=0
    )
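
    # Because stride == max_length, the loaders produce non-overlapping 256-token
    # input/target windows; with this short text and a 90/10 split, both loaders
    # contain only a handful of batches.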

    ##############################
    # Train model
    ##############################

    tokenizer = tiktoken.get_encoding("gpt2")

    # Restrict training and validation to a single example each so the test runs quickly
    train_subset = Subset(train_loader.dataset, range(1))
    one_batch_train_loader = DataLoader(train_subset, batch_size=1)
    val_subset = Subset(val_loader.dataset, range(1))
    one_batch_val_loader = DataLoader(val_subset, batch_size=1)

    peak_lr = 0.001  # this was originally set to 5e-4 in the book by mistake
    optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.1)  # the book accidentally omitted the lr assignment

    n_epochs = 6
    warmup_steps = 1
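
    # train_model is the appendix D training loop; as far as that appendix describes,
    # it adds a linear learning-rate warmup from initial_lr toward the optimizer's
    # peak lr over warmup_steps, cosine decay down to min_lr afterwards, and
    # gradient clipping on top of the basic chapter 5 loop.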

    train_losses, val_losses, tokens_seen, lrs = train_model(
        model, one_batch_train_loader, one_batch_val_loader, optimizer, device, n_epochs=n_epochs,
        eval_freq=5, eval_iter=1, start_context="Every effort moves you",
        tokenizer=tokenizer, warmup_steps=warmup_steps,
        initial_lr=1e-5, min_lr=1e-5
    )
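
    # With random weights, the expected cross-entropy is roughly ln(vocab_size) =
    # ln(50257) ≈ 10.82, so an initial loss around 10.9-11.0 is what an untrained
    # model should produce; the final check confirms that training reduced the loss.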

    assert round(train_losses[0], 1) == 10.9
    assert round(val_losses[0], 1) == 11.0
    assert train_losses[-1] < train_losses[0]
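
# To run only this test locally (assuming pytest is installed), something like
# `pytest -k test_train` from the package's test directory should work.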