# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import train_model_simple
from llms_from_scratch.ch07 import (
    download_and_load_file,
    InstructionDataset,
    format_input,
    custom_collate_fn
)

from functools import partial
import torch
from torch.utils.data import DataLoader
import tiktoken


def test_instruction_finetune(tmp_path):

    #######################################
    # Download and prepare dataset
    #######################################
    file_path = tmp_path / "instruction-data.json"
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
    data = download_and_load_file(file_path, url)

    train_portion = int(len(data) * 0.85)  # 85% for training
    test_portion = int(len(data) * 0.1)    # 10% for testing

    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]

    # Use a very small subset for testing purposes
    train_data = train_data[:15]
    val_data = val_data[:15]
    test_data = test_data[:15]

    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Pad/truncate each batch and move it to the target device inside the DataLoader
    customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=100)

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

    train_dataset = InstructionDataset(train_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers
    )

    val_dataset = InstructionDataset(val_data, tokenizer)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )

    #######################################
    # Load pretrained model
    #######################################

    # Small GPT model for testing purposes
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }

    model = GPTModel(BASE_CONFIG)
    model.eval()
    model.to(device)  # keep the model on the same device as the collated batches
    CHOOSE_MODEL = "Small test model"

    print("Loaded model:", CHOOSE_MODEL)
    print(50*"-")

    #######################################
    # Finetuning the model
    #######################################
    num_epochs = 10
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

    torch.manual_seed(123)

    train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context=format_input(val_data[0]), tokenizer=tokenizer
    )

    # The initial loss should match the expected value for the randomly initialized
    # test model, and finetuning should reduce the training loss
    assert round(train_losses[0], 1) == 10.9
    assert round(val_losses[0], 1) == 10.9
    assert train_losses[-1] < train_losses[0]
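
# A minimal usage sketch: the function above is a pytest-style test, where `tmp_path`
# is normally injected as a fixture. Assuming an ordinary temporary directory is an
# acceptable substitute, the test can also be run as a plain script like this.
if __name__ == "__main__":
    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmp_dir:
        test_instruction_finetune(Path(tmp_dir))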