# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch06 import (
    download_and_unzip_spam_data, create_balanced_dataset,
    random_split, SpamDataset, train_classifier_simple
)
from llms_from_scratch.appendix_e import replace_linear_with_lora

from pathlib import Path
import urllib.error

import pandas as pd
import tiktoken
import torch
from torch.utils.data import DataLoader, Subset


def test_train_classifier_lora(tmp_path):

    ########################################
    # Download and prepare dataset
    ########################################

    url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
    zip_path = tmp_path / "sms_spam_collection.zip"
    extracted_path = tmp_path / "sms_spam_collection"
    data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

    try:
        download_and_unzip_spam_data(
            url, zip_path, extracted_path, data_file_path
        )
    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
        print(f"Primary URL failed: {e}. Trying backup URL...")
        backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
        download_and_unzip_spam_data(
            backup_url, zip_path, extracted_path, data_file_path
        )

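    # Load the raw TSV, balance the two classes (create_balanced_dataset from
    # ch06 subsamples the majority "ham" class), and map the string labels to
    # integer class IDs.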
    df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
    balanced_df = create_balanced_dataset(df)
    balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
    train_df.to_csv(tmp_path / "train.csv", index=None)
    validation_df.to_csv(tmp_path / "validation.csv", index=None)
    test_df.to_csv(tmp_path / "test.csv", index=None)

    ########################################
    # Create data loaders
    ########################################
    tokenizer = tiktoken.get_encoding("gpt2")

    train_dataset = SpamDataset(
        csv_file=tmp_path / "train.csv",
        max_length=None,
        tokenizer=tokenizer
    )

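    # Reuse the training set's max_length so validation sequences are padded
    # to the same length as the training sequences.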
    val_dataset = SpamDataset(
        csv_file=tmp_path / "validation.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    ########################################
    # Load pretrained model
    ########################################

    # Small GPT model for testing purposes
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }
    model = GPTModel(BASE_CONFIG)
    model.eval()
    device = "cpu"

    ########################################
    # Modify pretrained model
    ########################################

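    # Freeze all pretrained weights; only the new classification head, the
    # LoRA adapters, and the layers explicitly unfrozen below receive gradient
    # updates during finetuning.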
    for param in model.parameters():
        param.requires_grad = False

    torch.manual_seed(123)

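    # Replace the language-modeling output head with a 2-class classification
    # head, then swap the model's nn.Linear layers for LoRA-augmented layers.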
    num_classes = 2
    model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)
    replace_linear_with_lora(model, rank=16, alpha=16)
    model.to(device)

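    # Also unfreeze the last transformer block and the final layer norm so
    # they are finetuned together with the LoRA adapters.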
    for param in model.trf_blocks[-1].parameters():
        param.requires_grad = True

    for param in model.final_norm.parameters():
        param.requires_grad = True

    ########################################
    # Finetune modified model
    ########################################

    torch.manual_seed(123)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

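    # Train on tiny 5-example subsets of the train/validation data so the test
    # runs quickly.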
    train_subset = Subset(train_loader.dataset, range(5))
    batch_train_loader = DataLoader(train_subset, batch_size=5)
    val_subset = Subset(val_loader.dataset, range(5))
    batch_val_loader = DataLoader(val_subset, batch_size=5)

    num_epochs = 6
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, batch_train_loader, batch_val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=1, eval_iter=1,
    )

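    # Sanity checks: the initial losses match the values expected from the
    # seeded initialization, and the training loss decreases over the run.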
    assert round(train_losses[0], 1) == 0.8
    assert round(val_losses[0], 1) == 0.8
    assert train_losses[-1] < train_losses[0]