# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import train_model_simple
from llms_from_scratch.ch07 import (
    download_and_load_file, InstructionDataset, format_input, custom_collate_fn
)

from functools import partial

import torch
from torch.utils.data import DataLoader
import tiktoken


def test_instruction_finetune(tmp_path):

    #######################################
    # Download and prepare dataset
    #######################################
    file_path = tmp_path / "instruction-data.json"
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
    data = download_and_load_file(file_path, url)
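    # Each entry returned by download_and_load_file is a dict with "instruction",
    # "input", and "output" fields (the chapter 7 instruction dataset format).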

    train_portion = int(len(data) * 0.85)  # 85% for training
    test_portion = int(len(data) * 0.1)    # 10% for testing

    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]  # remaining ~5% for validation

    # Use very small subset for testing purposes
    train_data = train_data[:15]
    val_data = val_data[:15]
    test_data = test_data[:15]

    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=100)
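    # custom_collate_fn (from ch07) pads each batch to a common length, builds the
    # target token IDs shifted by one position, and moves both tensors to `device`;
    # allowed_max_length=100 truncates longer sequences so the test stays fast.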

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

    train_dataset = InstructionDataset(train_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers
    )

    val_dataset = InstructionDataset(val_data, tokenizer)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )
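    # Unlike the training loader, the validation loader keeps the original order
    # (shuffle=False) and does not drop the final partial batch (drop_last=False).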

    #######################################
    # Load pretrained model
    #######################################

    # Small GPT model for testing purposes
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }
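    # For comparison, GPT-2 small (124M parameters) uses emb_dim=768, n_layers=12,
    # and n_heads=12; the tiny values above keep training fast enough for a
    # CPU-only unit test.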

    model = GPTModel(BASE_CONFIG)
    model.eval()
    device = "cpu"  # override: finetune on CPU regardless of CUDA availability
    CHOOSE_MODEL = "Small test model"

    print("Loaded model:", CHOOSE_MODEL)
    print(50*"-")

    #######################################
    # Finetuning the model
    #######################################

    num_epochs = 10
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

    torch.manual_seed(123)
    train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context=format_input(val_data[0]), tokenizer=tokenizer
    )
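    # train_model_simple (ch05) returns the training and validation losses recorded
    # every eval_freq steps plus the running count of tokens seen; format_input
    # renders the first validation entry as the prompt used for sample generation.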

    # The initial losses should land near the cross-entropy of an (almost) untrained
    # model over a 50257-token vocabulary (ln(50257) ≈ 10.8), and training on the
    # small subset must reduce the training loss.
    assert round(train_losses[0], 1) == 10.9
    assert round(val_losses[0], 1) == 10.9
    assert train_losses[-1] < train_losses[0]
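
# A note on running this test in isolation: assuming pytest and the
# llms_from_scratch package are installed, something along the lines of
#
#   pytest -k test_instruction_finetune <path-to-this-file>
#
# should work; the exact path depends on where the file lives in the repository.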