# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import train_model_simple
from llms_from_scratch.ch07 import (
    download_and_load_file, InstructionDataset, format_input, custom_collate_fn
)

from functools import partial

import torch
from torch.utils.data import DataLoader
import tiktoken


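# End-to-end smoke test for the chapter 7 instruction-finetuning pipeline:
# download the instruction dataset, build the data loaders, finetune a tiny
# GPT model, and check that the loss starts near the expected value and drops.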
def test_instruction_finetune(tmp_path):

    #######################################
    # Download and prepare dataset
    #######################################
    file_path = tmp_path / "instruction-data.json"
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
    data = download_and_load_file(file_path, url)

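    # `data` is a list of dicts; each entry contains "instruction", "input", and "output" fields.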
    train_portion = int(len(data) * 0.85)  # 85% for training
    test_portion = int(len(data) * 0.1)    # 10% for testing

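    # Whatever remains after the train and test slices (~5%) becomes the validation set.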
    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]

    # Use very small subset for testing purposes
    train_data = train_data[:15]
    val_data = val_data[:15]
    test_data = test_data[:15]

    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

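    # custom_collate_fn pads each batch to a common length, masks padding tokens in the
    # targets with an ignore index, truncates sequences to allowed_max_length, and moves
    # the resulting tensors to the given device.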
    customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=100)

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

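    # The training loader shuffles and drops the last incomplete batch (with 15 examples
    # and batch_size=8 that leaves one batch per epoch); the validation loader below
    # keeps the original order and every batch.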
    train_dataset = InstructionDataset(train_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers
    )

    val_dataset = InstructionDataset(val_data, tokenizer)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )

    #######################################
    # Load pretrained model
    #######################################

    # Small GPT model for testing purposes
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }
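    # Only vocab_size matches the real GPT-2; context length, embedding width, depth,
    # and head count are shrunk so the test finishes in seconds.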
    model = GPTModel(BASE_CONFIG)
    model.eval()
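    # The finetuning step below runs on CPU regardless of the hardware detected above.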
    device = "cpu"
    CHOOSE_MODEL = "Small test model"

    print("Loaded model:", CHOOSE_MODEL)
    print(50*"-")

    #######################################
    # Finetuning the model
    #######################################

    num_epochs = 10
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

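    # train_model_simple returns the training losses, validation losses, and the running
    # count of tokens seen, recorded at each evaluation step.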
    torch.manual_seed(123)
    train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context=format_input(val_data[0]), tokenizer=tokenizer
    )

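    # A randomly initialized model predicts roughly uniformly over the 50,257-token
    # vocabulary, so the initial loss should sit near ln(50257) ≈ 10.8, and finetuning
    # should reduce the training loss.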
    assert round(train_losses[0], 1) == 10.9
    assert round(val_losses[0], 1) == 10.9
    assert train_losses[-1] < train_losses[0]