From da61d5b76a61b667e071c47a690c40ea751d8a91 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Fri, 3 May 2024 08:37:58 -0500 Subject: [PATCH] Ch06 draft (#138) * Ch06 first draft * add utility files --- ch06/01_main-chapter-code/ch06.ipynb | 2077 +++++++++++++++++ ch06/01_main-chapter-code/gpt_download.py | 99 + .../01_main-chapter-code/previous_chapters.py | 345 +++ 3 files changed, 2521 insertions(+) create mode 100644 ch06/01_main-chapter-code/ch06.ipynb create mode 100644 ch06/01_main-chapter-code/gpt_download.py create mode 100644 ch06/01_main-chapter-code/previous_chapters.py diff --git a/ch06/01_main-chapter-code/ch06.ipynb b/ch06/01_main-chapter-code/ch06.ipynb new file mode 100644 index 0000000..3875ceb --- /dev/null +++ b/ch06/01_main-chapter-code/ch06.ipynb @@ -0,0 +1,2077 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c024bfa4-1a7a-4751-b5a1-827225a3478b", + "metadata": { + "id": "c024bfa4-1a7a-4751-b5a1-827225a3478b" + }, + "source": [ + "\n", + "Supplementary code for \"Build a Large Language Model From Scratch\": https://www.manning.com/books/build-a-large-language-model-from-scratch by Sebastian Raschka
\n", + "Code repository: https://github.com/rasbt/LLMs-from-scratch\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "bfabadb8-5935-45ff-b39c-db7a29012129", + "metadata": { + "id": "bfabadb8-5935-45ff-b39c-db7a29012129" + }, + "source": [ + "# Chapter 6: Finetuning for Text Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5b7e01c2-1c84-4f2a-bb51-2e0b74abda90", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5b7e01c2-1c84-4f2a-bb51-2e0b74abda90", + "outputId": "9495f150-9d79-4910-d6e7-6c0d9aae4a41" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "matplotlib version: 3.7.2\n", + "numpy version: 1.25.2\n", + "tiktoken version: 0.5.1\n", + "torch version: 2.2.2\n", + "tensorflow version: 2.15.0\n", + "pandas version: 2.0.3\n" + ] + } + ], + "source": [ + "from importlib.metadata import version\n", + "\n", + "pkgs = [\"matplotlib\",\n", + " \"numpy\",\n", + " \"tiktoken\",\n", + " \"torch\",\n", + " \"tensorflow\", # For OpenAI's pretrained weights\n", + " \"pandas\" # Dataset loading\n", + " ]\n", + "for p in pkgs:\n", + " print(f\"{p} version: {version(p)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3a84cf35-b37f-4c15-8972-dfafc9fadc1c", + "metadata": { + "id": "3a84cf35-b37f-4c15-8972-dfafc9fadc1c" + }, + "source": [ + "## 6.1 Different categories of finetuning" + ] + }, + { + "cell_type": "markdown", + "id": "ede3d731-5123-4f02-accd-c670ce50a5a3", + "metadata": { + "id": "ede3d731-5123-4f02-accd-c670ce50a5a3" + }, + "source": [ + "- No code in this section" + ] + }, + { + "cell_type": "markdown", + "id": "8c7017a2-32aa-4002-a2f3-12aac293ccdf", + "metadata": { + "id": "8c7017a2-32aa-4002-a2f3-12aac293ccdf" + }, + "source": [ + "## 6.2 Preparing the dataset" + ] + }, + { + "cell_type": "markdown", + "id": "9fbd459f-63fa-4d8c-8499-e23103156c7d", + "metadata": { + "id": "9fbd459f-63fa-4d8c-8499-e23103156c7d" + }, + "source": [ + "- This section prepares the dataset we use for classification finetuning\n", + "- We use a 
dataset consisting of SPAM and non-SPAM text messages to finetune the LLM to classify them\n", + "- First, we download and unzip the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "def7c09b-af9c-4216-90ce-5e67aed1065c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "def7c09b-af9c-4216-90ce-5e67aed1065c", + "outputId": "424e4423-f623-443c-ab9e-656f9e867559" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n" + ] + } + ], + "source": [ + "import urllib.request\n", + "import zipfile\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n", + "zip_path = \"sms_spam_collection.zip\"\n", + "extracted_path = \"sms_spam_collection\"\n", + "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n", + "\n", + "def download_and_unzip(url, zip_path, extracted_path, data_file_path):\n", + " if data_file_path.exists():\n", + " print(f\"{data_file_path} already exists. 
Skipping download and extraction.\")\n", + " return\n", + "\n", + " # Downloading the file\n", + " with urllib.request.urlopen(url) as response:\n", + " with open(zip_path, \"wb\") as out_file:\n", + " out_file.write(response.read())\n", + "\n", + " # Unzipping the file\n", + " with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n", + " zip_ref.extractall(extracted_path)\n", + "\n", + " # Add .tsv file extension\n", + " original_file_path = Path(extracted_path) / \"SMSSpamCollection\"\n", + " os.rename(original_file_path, data_file_path)\n", + " print(f\"File downloaded and saved as {data_file_path}\")\n", + "\n", + "download_and_unzip(url, zip_path, extracted_path, data_file_path)" + ] + }, + { + "cell_type": "markdown", + "id": "6aac2d19-06d0-4005-916b-0bd4b1ee50d1", + "metadata": { + "id": "6aac2d19-06d0-4005-916b-0bd4b1ee50d1" + }, + "source": [ + "- The dataset is saved as a tab-separated text file, which we can load into a pandas DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "da0ed4da-ac31-4e4d-8bdd-2153be4656a4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 423 + }, + "id": "da0ed4da-ac31-4e4d-8bdd-2153be4656a4", + "outputId": "a16c5cde-d341-4887-a93f-baa9bec542ab" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LabelText
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
.........
5567spamThis is the 2nd time we have tried 2 contact u...
5568hamWill ü b going to esplanade fr home?
5569hamPity, * was in mood for that. So...any other s...
5570hamThe guy did some bitching but I acted like i'd...
5571hamRofl. Its true to its name
\n", + "

5572 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Label Text\n", + "0 ham Go until jurong point, crazy.. Available only ...\n", + "1 ham Ok lar... Joking wif u oni...\n", + "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", + "3 ham U dun say so early hor... U c already then say...\n", + "4 ham Nah I don't think he goes to usf, he lives aro...\n", + "... ... ...\n", + "5567 spam This is the 2nd time we have tried 2 contact u...\n", + "5568 ham Will ü b going to esplanade fr home?\n", + "5569 ham Pity, * was in mood for that. So...any other s...\n", + "5570 ham The guy did some bitching but I acted like i'd...\n", + "5571 ham Rofl. Its true to its name\n", + "\n", + "[5572 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "e7b6e631-4f0b-4aab-82b9-8898e6663109", + "metadata": { + "id": "e7b6e631-4f0b-4aab-82b9-8898e6663109" + }, + "source": [ + "- When we check the class distribution, we see that the data contains \"ham\" (i.e., not-SPAM) much more frequently than \"spam\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "495a5280-9d7c-41d4-9719-64ab99056d4c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "495a5280-9d7c-41d4-9719-64ab99056d4c", + "outputId": "761e0482-43ba-4f46-f4b7-6774dae51b38" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Label\n", + "ham 4825\n", + "spam 747\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "print(df[\"Label\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "id": "f773f054-0bdc-4aad-bbf6-397621bf63db", + "metadata": { + "id": "f773f054-0bdc-4aad-bbf6-397621bf63db" + }, + "source": [ + "- For simplicity, and because we prefer a small dataset for educational 
purposes anyway (it will make it possible to finetune the LLM faster), we subsample (undersample) the dataset so that it contains 747 instances from each class\n", + "- (Next to undersampling, there are several other ways to deal with class imbalances, but they are out of the scope of a book on LLMs; you can find examples and more information in the [`imbalanced-learn` user guide](https://imbalanced-learn.org/stable/user_guide.html))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7be4a0a2-9704-4a96-b38f-240339818688", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7be4a0a2-9704-4a96-b38f-240339818688", + "outputId": "396dc415-cb71-4a88-e85d-d88201c6d73f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Label\n", + "ham 747\n", + "spam 747\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "def create_balanced_dataset(df):\n", + " \n", + " # Count the instances of \"spam\"\n", + " num_spam = df[df[\"Label\"] == \"spam\"].shape[0]\n", + " \n", + " # Randomly sample \"ham' instances to match the number of 'spam' instances\n", + " ham_subset = df[df[\"Label\"] == \"ham\"].sample(num_spam, random_state=123)\n", + " \n", + " # Combine ham \"subset\" with \"spam\"\n", + " balanced_df = pd.concat([ham_subset, df[df[\"Label\"] == \"spam\"]])\n", + "\n", + " return balanced_df\n", + "\n", + "balanced_df = create_balanced_dataset(df)\n", + "print(balanced_df[\"Label\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "id": "d3fd2f5a-06d8-4d30-a2e3-230b86c559d6", + "metadata": { + "id": "d3fd2f5a-06d8-4d30-a2e3-230b86c559d6" + }, + "source": [ + "- Next, we change the \"string\" class labels \"ham\" and \"spam\" into integer class labels 0 and 1:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c1b10c3d-5d57-42d0-8de8-cf80a06f5ffd", + "metadata": { + "id": "c1b10c3d-5d57-42d0-8de8-cf80a06f5ffd" + }, + "outputs": [], + "source": [ + 
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})" + ] + }, + { + "cell_type": "markdown", + "id": "5715e685-35b4-4b45-a86c-8a8694de9d6f", + "metadata": { + "id": "5715e685-35b4-4b45-a86c-8a8694de9d6f" + }, + "source": [ + "- Let's now define a function that randomly divides the dataset into a training, validation, and test subset" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "uQl0Psdmx15D", + "metadata": { + "id": "uQl0Psdmx15D" + }, + "outputs": [], + "source": [ + "def random_split(df, train_frac, validation_frac):\n", + " # Shuffle the entire DataFrame\n", + " df = df.sample(frac=1, random_state=123).reset_index(drop=True)\n", + "\n", + " # Calculate split indices\n", + " train_end = int(len(df) * train_frac)\n", + " validation_end = train_end + int(len(df) * validation_frac)\n", + "\n", + " # Split the DataFrame\n", + " train_df = df[:train_end]\n", + " validation_df = df[train_end:validation_end]\n", + " test_df = df[validation_end:]\n", + "\n", + " return train_df, validation_df, test_df\n", + "\n", + "train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)\n", + "# Test size is implied to be 0.2 as the remainder\n", + "\n", + "train_df.to_csv(\"train.csv\", index=None)\n", + "validation_df.to_csv(\"validation.csv\", index=None)\n", + "test_df.to_csv(\"test.csv\", index=None)" + ] + }, + { + "cell_type": "markdown", + "id": "7126108a-75e7-4862-b0fb-cbf59a18bb6c", + "metadata": { + "id": "7126108a-75e7-4862-b0fb-cbf59a18bb6c" + }, + "source": [ + "- Note that the text messages have different lengths; if we want to combine multiple training examples in a batch, we have to either\n", + " - 1. truncate all messages to the length of the shortest message in the dataset or batch\n", + " - 2. 
pad all messages to the length of the longest message in the dataset or batch\n", + "\n", + "- We choose option 2 and pad all messages to the longest message in the dataset\n", + "- For that, we use `<|endoftext|>` as a padding token, as discussed in chapter 2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "74c3c463-8763-4cc0-9320-41c7eaad8ab7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "74c3c463-8763-4cc0-9320-41c7eaad8ab7", + "outputId": "b5b48439-32c8-4b37-cca2-c9dc8fa86563" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[50256]\n" + ] + } + ], + "source": [ + "import tiktoken\n", + "\n", + "tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "print(tokenizer.encode(\"<|endoftext|>\", allowed_special={\"<|endoftext|>\"}))" + ] + }, + { + "cell_type": "markdown", + "id": "04f582ff-68bf-450e-bd87-5fb61afe431c", + "metadata": { + "id": "04f582ff-68bf-450e-bd87-5fb61afe431c" + }, + "source": [ + "- The `SpamDataset` class below identifies the longest sequence in the training dataset and adds the padding token to the others to match that sequence length" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d7791b52-af18-4ac4-afa9-b921068e383e", + "metadata": { + "id": "d7791b52-af18-4ac4-afa9-b921068e383e" + }, + "outputs": [], + "source": [ + "import torch\n", + "from torch.utils.data import Dataset\n", + "\n", + "\n", + "class SpamDataset(Dataset):\n", + " def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):\n", + " self.data = pd.read_csv(csv_file)\n", + "\n", + " # Pre-tokenize texts\n", + " self.encoded_texts = [\n", + " tokenizer.encode(text) for text in self.data[\"Text\"]\n", + " ]\n", + "\n", + " if max_length is None:\n", + " self.max_length = self._longest_encoded_length()\n", + " else:\n", + " self.max_length = max_length\n", + " # Truncate sequences if they are longer than max_length\n", + " self.encoded_texts = [\n", 
+ " encoded_text[:self.max_length]\n", + " for encoded_text in self.encoded_texts\n", + " ]\n", + "\n", + " # Pad sequences to the longest sequence\n", + " self.encoded_texts = [\n", + " encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))\n", + " for encoded_text in self.encoded_texts\n", + " ]\n", + "\n", + " def __getitem__(self, index):\n", + " encoded = self.encoded_texts[index]\n", + " label = self.data.iloc[index][\"Label\"]\n", + " return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long)\n", + "\n", + " def __len__(self):\n", + " return len(self.data)\n", + "\n", + " def _longest_encoded_length(self):\n", + " max_length = 0\n", + " for encoded_text in self.encoded_texts:\n", + " encoded_length = len(encoded_text)\n", + " if encoded_length > max_length:\n", + " max_length = encoded_length\n", + " return max_length" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "uzj85f8ou82h", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uzj85f8ou82h", + "outputId": "d08f1cf0-c24d-445f-a3f8-793532c3716f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "120\n" + ] + } + ], + "source": [ + "train_dataset = SpamDataset(\"train.csv\", max_length=None, tokenizer=tokenizer)\n", + "print(train_dataset.max_length)" + ] + }, + { + "cell_type": "markdown", + "id": "15bdd932-97eb-4b88-9cf9-d766ea4c3a60", + "metadata": {}, + "source": [ + "- We also pad the validation and test set to the longest training sequence\n", + "- Note that validation and test set samples that are longer than the longest training example are being truncated via `encoded_text[:self.max_length]` in the `SpamDataset` code\n", + "- This behavior is entirely optional, and it would also work well if we set `max_length=None` in both the validation and test set cases" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bb0c502d-a75e-4248-8ea0-196e2b00c61e", + 
"metadata": { + "id": "bb0c502d-a75e-4248-8ea0-196e2b00c61e" + }, + "outputs": [], + "source": [ + "val_dataset = SpamDataset(\"validation.csv\", max_length=train_dataset.max_length, tokenizer=tokenizer)\n", + "test_dataset = SpamDataset(\"test.csv\", max_length=train_dataset.max_length, tokenizer=tokenizer)" + ] + }, + { + "cell_type": "markdown", + "id": "20170d89-85a0-4844-9887-832f5d23432a", + "metadata": {}, + "source": [ + "- Next, we use the dataset to instantiate the data loaders, which is similar to creating the data loaders in previous chapters:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8681adc0-6f02-4e75-b01a-a6ab75d05542", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8681adc0-6f02-4e75-b01a-a6ab75d05542", + "outputId": "3266c410-4fdb-4a8c-a142-7f707e2525ab" + }, + "outputs": [], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "num_workers = 0\n", + "batch_size = 8\n", + "\n", + "torch.manual_seed(123)\n", + "\n", + "train_loader = DataLoader(\n", + " dataset=train_dataset,\n", + " batch_size=batch_size,\n", + " shuffle=True,\n", + " num_workers=num_workers,\n", + " drop_last=True,\n", + ")\n", + "\n", + "val_loader = DataLoader(\n", + " dataset=val_dataset,\n", + " batch_size=batch_size,\n", + " num_workers=num_workers,\n", + " drop_last=False,\n", + ")\n", + "\n", + "test_loader = DataLoader(\n", + " dataset=test_dataset,\n", + " batch_size=batch_size,\n", + " num_workers=num_workers,\n", + " drop_last=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ab7335db-e0bb-4e27-80c5-eea11e593a57", + "metadata": {}, + "source": [ + "- As a sanity check, we iterate through the data loaders and check that the batches contain 8 training examples each, where each training example consists of 120 tokens:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4dee6882-4c3a-4964-af15-fa31f86ad047", + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Train loader:\n", + "Input batch dimensions: torch.Size([8, 120])\n", + "Label batch dimensions torch.Size([8])\n" + ] + } + ], + "source": [ + "print(\"Train loader:\")\n", + "for input_batch, target_batch in train_loader:\n", + " pass\n", + "\n", + "print(\"Input batch dimensions:\", input_batch.shape)\n", + "print(\"Label batch dimensions\", target_batch.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "5cdd7947-7039-49bf-8a5e-c0a2f4281ca1", + "metadata": {}, + "source": [ + "- Lastly, let's print the total number of batches in each dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "IZfw-TYD2zTj", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IZfw-TYD2zTj", + "outputId": "6934bbf2-9797-4fbe-d26b-1a246e18c2fb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "130 training batches\n", + "19 validation batches\n", + "38 test batches\n" + ] + } + ], + "source": [ + "print(f\"{len(train_loader)} training batches\")\n", + "print(f\"{len(val_loader)} validation batches\")\n", + "print(f\"{len(test_loader)} test batches\")" + ] + }, + { + "cell_type": "markdown", + "id": "d1c4f61a-5f5d-4b3b-97cf-151b617d1d6c", + "metadata": { + "id": "d1c4f61a-5f5d-4b3b-97cf-151b617d1d6c" + }, + "source": [ + "## 6.3 Initializing a model with pretrained weights" + ] + }, + { + "cell_type": "markdown", + "id": "97e1af8b-8bd1-4b44-8b8b-dc031496e208", + "metadata": {}, + "source": [ + "- In this section, we configure the GPT model and load the pretrained GPT-2 weights into it, which serves as the starting point for classification finetuning" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2992d779-f9fb-4812-a117-553eb790a5a9", + "metadata": { + "id": "2992d779-f9fb-4812-a117-553eb790a5a9" + }, + "outputs": [], + "source": [ + "CHOOSE_MODEL = \"gpt2-small (124M)\"\n", + "INPUT_PROMPT = \"Every 
effort moves\"\n", + "\n", + "BASE_CONFIG = {\n", + " \"vocab_size\": 50257, # Vocabulary size\n", + " \"context_length\": 1024, # Context length\n", + " \"drop_rate\": 0.0, # Dropout rate\n", + " \"qkv_bias\": True # Query-key-value bias\n", + "}\n", + "\n", + "model_configs = {\n", + " \"gpt2-small (124M)\": {\"emb_dim\": 768, \"n_layers\": 12, \"n_heads\": 12},\n", + " \"gpt2-medium (355M)\": {\"emb_dim\": 1024, \"n_layers\": 24, \"n_heads\": 16},\n", + " \"gpt2-large (774M)\": {\"emb_dim\": 1280, \"n_layers\": 36, \"n_heads\": 20},\n", + " \"gpt2-xl (1558M)\": {\"emb_dim\": 1600, \"n_layers\": 48, \"n_heads\": 25},\n", + "}\n", + "\n", + "BASE_CONFIG.update(model_configs[CHOOSE_MODEL])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "022a649a-44f5-466c-8a8e-326c063384f5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "022a649a-44f5-466c-8a8e-326c063384f5", + "outputId": "7091e401-8442-4f47-a1d9-ecb42a1ef930" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File already exists and is up-to-date: gpt2/124M/checkpoint\n", + "File already exists and is up-to-date: gpt2/124M/encoder.json\n", + "File already exists and is up-to-date: gpt2/124M/hparams.json\n", + "File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001\n", + "File already exists and is up-to-date: gpt2/124M/model.ckpt.index\n", + "File already exists and is up-to-date: gpt2/124M/model.ckpt.meta\n", + "File already exists and is up-to-date: gpt2/124M/vocab.bpe\n" + ] + } + ], + "source": [ + "from gpt_download import download_and_load_gpt2\n", + "from previous_chapters import GPTModel, load_weights_into_gpt\n", + "\n", + "model_size = CHOOSE_MODEL.split(\" \")[-1].lstrip(\"(\").rstrip(\")\")\n", + "settings, params = download_and_load_gpt2(model_size=model_size, models_dir=\"gpt2\")\n", + "\n", + "model = GPTModel(BASE_CONFIG)\n", + "load_weights_into_gpt(model, params)\n", + 
"model.eval();" + ] + }, + { + "cell_type": "markdown", + "id": "ab8e056c-abe0-415f-b34d-df686204259e", + "metadata": {}, + "source": [ + "- To ensure that the model was loaded correctly, let's double-check that it generates coherent text" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "fe4af171-5dce-4f6e-9b63-1e4e16e8b94c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fe4af171-5dce-4f6e-9b63-1e4e16e8b94c", + "outputId": "8ff3ec54-1dc3-4930-9be6-8eeaf560f8d4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Output text: Every effort moves you forward.\n", + "\n", + "The first step is to understand the importance of your work\n" + ] + } + ], + "source": [ + "from previous_chapters import generate_text_simple\n", + "\n", + "start_context = \"Every effort moves you\"\n", + "\n", + "tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "encoded = tokenizer.encode(start_context)\n", + "encoded_tensor = torch.tensor(encoded).unsqueeze(0)\n", + "\n", + "out = generate_text_simple(\n", + " model=model,\n", + " idx=encoded_tensor,\n", + " max_new_tokens=15,\n", + " context_size=BASE_CONFIG[\"context_length\"]\n", + ")\n", + "decoded_text = tokenizer.decode(out.squeeze(0).tolist())\n", + "\n", + "print(\"Output text:\", decoded_text)" + ] + }, + { + "cell_type": "markdown", + "id": "4c9ae440-32f9-412f-96cf-fd52cc3e2522", + "metadata": { + "id": "4c9ae440-32f9-412f-96cf-fd52cc3e2522" + }, + "source": [ + "## 6.4 Adding a classification head" + ] + }, + { + "cell_type": "markdown", + "id": "217bac05-78df-4412-bd80-612f8061c01d", + "metadata": {}, + "source": [ + "- In this section, we are modifying the pretrained LLM to make it ready for classification finetuning\n", + "- Let's take a look at the model architecture first" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b23aff91-6bd0-48da-88f6-353657e6c981", + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "1d8f7a01-b7c0-48d4-b1e7-8c12cc7ad932", + "outputId": "b6a5b9b5-a92f-498f-d7cb-b58dd99e4497" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPTModel(\n", + " (tok_emb): Embedding(50257, 768)\n", + " (pos_emb): Embedding(1024, 768)\n", + " (drop_emb): Dropout(p=0.0, inplace=False)\n", + " (trf_blocks): Sequential(\n", + " (0): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (1): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (2): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " 
(W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (3): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (4): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " 
(1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (5): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (6): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (7): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, 
out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (8): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (9): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): 
Dropout(p=0.0, inplace=False)\n", + " )\n", + " (10): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (11): TransformerBlock(\n", + " (att): MultiHeadAttention(\n", + " (W_query): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_key): Linear(in_features=768, out_features=768, bias=True)\n", + " (W_value): Linear(in_features=768, out_features=768, bias=True)\n", + " (out_proj): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.0, inplace=False)\n", + " )\n", + " (ff): FeedForward(\n", + " (layers): Sequential(\n", + " (0): Linear(in_features=768, out_features=3072, bias=True)\n", + " (1): GELU()\n", + " (2): Linear(in_features=3072, out_features=768, bias=True)\n", + " )\n", + " )\n", + " (norm1): LayerNorm()\n", + " (norm2): LayerNorm()\n", + " (drop_resid): Dropout(p=0.0, inplace=False)\n", + " )\n", + " )\n", + " (final_norm): LayerNorm()\n", + " (out_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], + "source": [ + "print(model)" + ] + }, + { + "cell_type": "markdown", + "id": "3f640a76-dd00-4769-9bc8-1aed0cec330d", + "metadata": {}, + "source": [ + "- Above, we can see the architecture we implemented in chapter 4 neatly laid out\n", + "- The goal is to replace and finetune the 
output layer\n", + "- To achieve this, we first freeze the model, meaning that we make all layers non-trainable" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "fkMWFl-0etea", + "metadata": { + "id": "fkMWFl-0etea" + }, + "outputs": [], + "source": [ + "for param in model.parameters():\n", + " param.requires_grad = False" + ] + }, + { + "cell_type": "markdown", + "id": "72155f83-87d9-476a-a978-a15aa2d44147", + "metadata": {}, + "source": [ + "- Then, we replace the output layer (`model.out_head`), which originally maps the layer inputs to 50,257 dimensions (the size of the vocabulary)\n", + "- Since we finetune the model for binary classification (predicting 2 classes, \"spam\" and \"ham\"), we can replace the output layer as shown below, which will be trainable by default" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7e759fa0-0f69-41be-b576-17e5f20e04cb", + "metadata": {}, + "outputs": [], + "source": [ + "torch.manual_seed(123)\n", + "\n", + "num_classes = 2\n", + "model.out_head = torch.nn.Linear(in_features=768, out_features=num_classes)" + ] + }, + { + "cell_type": "markdown", + "id": "30be5475-ae77-4f97-8f3e-dec462b1339f", + "metadata": {}, + "source": [ + "- Technically, it's sufficient to only train the output layer\n", + "- However, as I found in [my experiments](https://magazine.sebastianraschka.com/p/finetuning-large-language-models), finetuning additional layers can noticeably improve the performance\n", + "- So, we are also making the last transformer block and the final `LayerNorm` module connecting the last transformer block to the output layer trainable" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "2aedc120-5ee3-48f6-92f2-ad9304ebcdc7", + "metadata": { + "id": "2aedc120-5ee3-48f6-92f2-ad9304ebcdc7" + }, + "outputs": [], + "source": [ + "for param in model.trf_blocks[-1].parameters():\n", + " param.requires_grad = True\n", + "\n", + "for param in model.final_norm.parameters():\n", +
" param.requires_grad = True" + ] + }, + { + "cell_type": "markdown", + "id": "f012b899-8284-4d3a-97c0-8a48eb33ba2e", + "metadata": {}, + "source": [ + "- We can still use this model similar to before in previous chapters\n", + "- For example, let's feed it some text input" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "f645c06a-7df6-451c-ad3f-eafb18224ebc", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f645c06a-7df6-451c-ad3f-eafb18224ebc", + "outputId": "27e041b1-d731-48a1-cf60-f22d4565304e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inputs: tensor([[ 40, 1107, 8288, 428, 3807, 13]])\n", + "Inputs dimensions: torch.Size([1, 6])\n" + ] + } + ], + "source": [ + "inputs = tokenizer.encode(\"I really liked this movie.\")\n", + "inputs = torch.tensor(inputs).unsqueeze(0)\n", + "print(\"Inputs:\", inputs)\n", + "print(\"Inputs dimensions:\", inputs.shape) # shape: (batch_size, num_tokens)" + ] + }, + { + "cell_type": "markdown", + "id": "fbbf8481-772d-467b-851c-a62b86d0cb1b", + "metadata": {}, + "source": [ + "- What's different compared to previous chapters is that it now has two output dimensions instead of 50,257" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "48dc84f1-85cc-4609-9cee-94ff539f00f4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "48dc84f1-85cc-4609-9cee-94ff539f00f4", + "outputId": "9cae7448-253d-4776-973e-0af190b06354" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outputs:\n", + " tensor([[[-1.9044, 1.5321],\n", + " [-4.9851, 8.5136],\n", + " [-1.6985, 4.6314],\n", + " [-2.3820, 5.7547],\n", + " [-3.8736, 4.4867],\n", + " [-5.7543, 5.3615]]])\n", + "Outputs dimensions: torch.Size([1, 6, 2])\n" + ] + } + ], + "source": [ + "with torch.no_grad():\n", + " outputs = model(inputs)\n", + "\n", + "print(\"Outputs:\\n\", outputs)\n", + "print(\"Outputs 
dimensions:\", outputs.shape) # shape: (batch_size, num_tokens, num_classes)" + ] + }, + { + "cell_type": "markdown", + "id": "e3bb8616-c791-4f5c-bac0-5302f663e46a", + "metadata": {}, + "source": [ + "- As discussed in previous chapters, for each input token, there's one output vector\n", + "- Since we fed the model a text sample with 6 input tokens, the output consists of 6 2-dimensional output vectors above\n", + "- In chapter 3, we discussed the attention mechanism, which connects each input token to each other input token\n", + "- In chapter 3, we then also introduced the causal attention mask that is used in GPT-like models; this causal mask lets a current token only attend to the current and previous token positions\n", + "- Based on this causal attention mechanism, the 6th (last) token above contains the most information among all tokens because it's the only token that includes information about all other tokens\n", + "- Hence, we are particularly interested in this last token, which we will finetune for the spam classification task" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "49383a8c-41d5-4dab-98f1-238bca0c2ed7", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "49383a8c-41d5-4dab-98f1-238bca0c2ed7", + "outputId": "e79eb155-fa1f-46ed-ff8c-d828c3a3fabd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last output token: tensor([[-5.7543, 5.3615]])\n" + ] + } + ], + "source": [ + "print(\"Last output token:\", outputs[:, -1, :])" + ] + }, + { + "cell_type": "markdown", + "id": "32aa4aef-e1e9-491b-9adf-5aa973e59b8c", + "metadata": {}, + "source": [ + "## 6.5 Calculating the classification loss and accuracy" + ] + }, + { + "cell_type": "markdown", + "id": "4f4a9d15-8fc7-48a2-8734-d92a2f265328", + "metadata": {}, + "source": [ + "- Before we can start finetuning (/training), we first have to define the loss function we want to optimize during training\n", + "- The 
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy classification loss for a single batch.

    Both tensors are moved to `device`, the model is run on the inputs, and
    only the output at the last token position (index -1 along dim 1) is
    scored against the class labels in `target_batch`.

    Returns:
        The scalar loss tensor produced by `torch.nn.functional.cross_entropy`.
    """
    features = input_batch.to(device)
    labels = target_batch.to(device)

    # Only the final position's output is used for classification
    last_token_logits = model(features)[:, -1, :]
    return torch.nn.functional.cross_entropy(last_token_logits, labels)
# Same as in chapter 5
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average `calc_loss_batch` over the first `num_batches` batches of a loader.

    Args:
        data_loader: Sized iterable yielding `(input_batch, target_batch)` pairs.
        model: Model forwarded to `calc_loss_batch`.
        device: Device forwarded to `calc_loss_batch`.
        num_batches: Maximum number of batches to evaluate. `None` means all
            batches; larger values are capped at `len(data_loader)`.

    Returns:
        The mean per-batch loss as a float, or NaN when the loader is empty.
    """
    available_batches = len(data_loader)
    if available_batches == 0:
        return float("nan")

    # Never ask for more batches than the loader can provide
    if num_batches is None:
        num_batches = available_batches
    else:
        num_batches = min(num_batches, available_batches)

    running_loss = 0.
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        running_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return running_loss / num_batches
@torch.no_grad()  # Disable gradient tracking for efficiency
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Compute the classification accuracy over (part of) a data loader.

    The model is switched to evaluation mode and is left in that mode when the
    function returns. For each batch, the class with the largest logit at the
    last token position is compared with the target label.

    Args:
        data_loader: Sized iterable yielding `(input_batch, target_batch)` pairs.
        model: Model whose output is indexed as `[:, -1, :]` to get the
            last-token logits.
        device: Device the batches are moved to before the forward pass.
        num_batches: Maximum number of batches to evaluate. `None` means all
            batches; larger values are capped at `len(data_loader)`.

    Returns:
        The fraction of correctly predicted examples as a float, or NaN when
        no examples were evaluated (empty loader or `num_batches=0`), which
        mirrors the empty-loader behavior of `calc_loss_loader`.
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)
        logits = model(input_batch)[:, -1, :]  # Logits of last output token
        predicted_labels = torch.argmax(logits, dim=-1)

        num_examples += predicted_labels.shape[0]
        correct_predictions += (predicted_labels == target_batch).sum().item()

    # Avoid a ZeroDivisionError when nothing was evaluated
    if num_examples == 0:
        return float("nan")
    return correct_predictions / num_examples
# Overall the same as `train_model_simple` in chapter 5
def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter, tokenizer):
    """Finetune `model` for classification and record losses and accuracies.

    Every `eval_freq` optimizer steps (starting with step 0) the training and
    validation losses are estimated on `eval_iter` batches and printed. After
    each epoch the training and validation accuracies are estimated on
    `eval_iter` batches and printed.

    `tokenizer` is accepted as a parameter but is not used in this function.

    Returns:
        A tuple `(train_losses, val_losses, train_accs, val_accs, examples_seen)`
        where the first four are lists of floats and `examples_seen` is the
        total number of training examples processed.
    """
    # Histories of the periodic loss estimates and per-epoch accuracies
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []
    examples_seen = 0
    global_step = -1  # Incremented before use, so the first step is step 0

    for epoch in range(num_epochs):
        model.train()  # Training mode for the weight updates below

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Clear gradients left over from the previous batch
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            examples_seen += input_batch.shape[0]  # Count examples rather than tokens
            global_step += 1

            # Periodic loss evaluation
            if global_step % eval_freq != 0:
                continue
            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # End-of-epoch accuracy estimate
        epoch_train_acc = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        epoch_val_acc = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {epoch_train_acc*100:.2f}% | ", end="")
        print(f"Validation accuracy: {epoch_val_acc*100:.2f}%")
        train_accs.append(epoch_train_acc)
        val_accs.append(epoch_val_acc)

    return train_losses, val_losses, train_accs, val_accs, examples_seen


# Same as chapter 5
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate the training and validation loss on `eval_iter` batches each.

    The model is put into evaluation mode for the measurement, gradients are
    disabled, and the model is switched back to training mode before returning.

    Returns:
        A tuple `(train_loss, val_loss)`.
    """
    model.eval()
    with torch.no_grad():
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses
or A100 GPU" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "X7kU3aAj7vTJ", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "X7kU3aAj7vTJ", + "outputId": "504a033e-2bf8-41b5-a037-468309845513" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ep 1 (Step 000000): Train loss 2.153, Val loss 2.392\n", + "Ep 1 (Step 000050): Train loss 0.617, Val loss 0.637\n", + "Ep 1 (Step 000100): Train loss 0.523, Val loss 0.557\n", + "Training accuracy: 70.00% | Validation accuracy: 72.50%\n", + "Ep 2 (Step 000150): Train loss 0.561, Val loss 0.489\n", + "Ep 2 (Step 000200): Train loss 0.419, Val loss 0.397\n", + "Ep 2 (Step 000250): Train loss 0.409, Val loss 0.353\n", + "Training accuracy: 82.50% | Validation accuracy: 85.00%\n", + "Ep 3 (Step 000300): Train loss 0.333, Val loss 0.320\n", + "Ep 3 (Step 000350): Train loss 0.340, Val loss 0.306\n", + "Training accuracy: 90.00% | Validation accuracy: 90.00%\n", + "Ep 4 (Step 000400): Train loss 0.136, Val loss 0.200\n", + "Ep 4 (Step 000450): Train loss 0.153, Val loss 0.132\n", + "Ep 4 (Step 000500): Train loss 0.222, Val loss 0.137\n", + "Training accuracy: 100.00% | Validation accuracy: 97.50%\n", + "Ep 5 (Step 000550): Train loss 0.207, Val loss 0.143\n", + "Ep 5 (Step 000600): Train loss 0.083, Val loss 0.074\n", + "Training accuracy: 100.00% | Validation accuracy: 97.50%\n", + "Training completed in 5.65 minutes.\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "start_time = time.time()\n", + "\n", + "torch.manual_seed(123)\n", + "\n", + "optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)\n", + "\n", + "num_epochs = 5\n", + "train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(\n", + " model, train_loader, val_loader, optimizer, device,\n", + " num_epochs=num_epochs, eval_freq=50, eval_iter=5,\n", + " tokenizer=tokenizer\n", + ")\n", + "\n", + "end_time = 
def plot_values(epochs_seen, examples_seen, train_values, val_values, label="loss"):
    """Plot training and validation curves against epochs and examples seen.

    The figure is saved to `"{label}-plot.pdf"` in the current working
    directory and then shown.

    Args:
        epochs_seen: x-values for the bottom axis (epochs).
        examples_seen: x-values for the top axis (number of examples).
        train_values: Training-set values to plot.
        val_values: Validation-set values to plot.
        label: Name of the plotted quantity; used in the legend, the y-axis
            label, and the output file name.
    """
    figure, epoch_axis = plt.subplots(figsize=(5, 3))

    # Training and validation curves share the epoch axis
    curves = (
        (train_values, {"label": f"Training {label}"}),
        (val_values, {"linestyle": "-.", "label": f"Validation {label}"}),
    )
    for values, plot_kwargs in curves:
        epoch_axis.plot(epochs_seen, values, **plot_kwargs)
    epoch_axis.set_xlabel("Epochs")
    epoch_axis.set_ylabel(label.capitalize())
    epoch_axis.legend()

    # Second x-axis (sharing the y-axis) for the number of examples seen
    examples_axis = epoch_axis.twiny()
    examples_axis.plot(examples_seen, train_values, alpha=0)  # Invisible; only aligns the ticks
    examples_axis.set_xlabel("Examples seen")

    figure.tight_layout()  # Adjust layout to make room
    plt.savefig(f"{label}-plot.pdf")
    plt.show()
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))\n", + "examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses))\n", + "\n", + "plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)" + ] + }, + { + "cell_type": "markdown", + "id": "dbd28174-1836-44ba-b6c0-7e0be774fadc", + "metadata": {}, + "source": [ + "- Above, based on the downward slope, we see that the model learns well\n", + "- Furthermore, the fact that the training and validation loss are very close indicates that the model does not tend to overfit the training data\n", + "- Similarly, we can plot the accuracy below" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "yz8BIsaF0TUo", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 307 + }, + "id": "yz8BIsaF0TUo", + "outputId": "3a7ed967-1f2a-4c6d-f4a3-0cc8cc9d6c5f" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "epochs_tensor = torch.linspace(0, num_epochs, len(train_accs))\n", + "examples_seen_tensor = torch.linspace(0, examples_seen, len(train_accs))\n", + "\n", + "plot_values(epochs_tensor, examples_seen_tensor, train_accs, val_accs, label=\"accuracy\")" + ] + }, + { + "cell_type": "markdown", + "id": "90aba699-21bc-42de-a69c-99f370bb0363", + "metadata": {}, + "source": [ + "- Based on the accuracy plot above, we can see that the model achieves a relatively high training and validation accuracy after epochs 4 and 5\n", + "- However, we have to keep in mind that we specified `eval_iter=5` in the training function earlier, which means that we only estimated the training and validation set performances\n", + "- We can compute the training, validation, and test set performances over the complete dataset as follows below" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "UHWaJFrjY0zW", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UHWaJFrjY0zW", + "outputId": "e111e6e6-b147-4159-eb9d-19d4e809ed34" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training accuracy: 97.21%\n", + "Validation accuracy: 97.32%\n", + "Test accuracy: 95.67%\n" + ] + } + ], + "source": [ + "train_accuracy = calc_accuracy_loader(train_loader, model, device)\n", + "val_accuracy = calc_accuracy_loader(val_loader, model, device)\n", + "test_accuracy = calc_accuracy_loader(test_loader, model, device)\n", + "\n", + "print(f\"Training accuracy: {train_accuracy*100:.2f}%\")\n", + "print(f\"Validation accuracy: {val_accuracy*100:.2f}%\")\n", + "print(f\"Test accuracy: {test_accuracy*100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "6882649f-dc7b-401f-84d2-024ff79c74a1", + "metadata": {}, + "source": [ + "- We can see that the training and validation set performances are practically identical\n", + "- However, based on the
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify a single text with the finetuned model.

    The text is tokenized, truncated and padded to a fixed length, and passed
    through the model; the class with the largest logit at the last token
    position is the prediction.

    Args:
        text: The input string to classify.
        model: Finetuned model exposing `pos_emb.weight`; its output is
            indexed as `[:, -1, :]` to get the last-token logits.
        tokenizer: Object with an `encode(text)` method returning a list of
            token IDs.
        device: Device on which the input tensor is created.
        max_length: Length the token sequence is truncated/padded to. `None`
            (the default) uses the model's supported context length; larger
            values are clamped to that context length.
        pad_token_id: Token ID used for right-padding.

    Returns:
        `"Positive"` if the predicted label is 1 (the "spam" label in this
        notebook), otherwise `"Negative"`.
    """
    model.eval()

    # Prepare inputs to the model
    input_ids = tokenizer.encode(text)

    # The context length is the number of positions in the positional
    # embedding table, i.e. the first weight dimension. The second dimension
    # (previously used here by mistake) is the embedding size.
    # NOTE(review): assumes `pos_emb` is a `torch.nn.Embedding` indexed by position
    supported_context_length = model.pos_emb.weight.shape[0]

    # Resolve the target length: default to the full context and never exceed it
    if max_length is None:
        max_length = supported_context_length
    else:
        max_length = min(max_length, supported_context_length)

    # Truncate sequences if they are too long
    input_ids = input_ids[:max_length]

    # Right-pad shorter sequences up to the target length
    input_ids += [pad_token_id] * (max_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0)  # add batch dimension

    # Model inference
    with torch.no_grad():
        logits = model(input_tensor)[:, -1, :]  # Logits of the last output token
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classified result
    return "Positive" if predicted_label == 1 else "Negative"
Let me know!\"\n", + ")\n", + "\n", + "print(classify_review(text_2, model, tokenizer, device, max_length=train_dataset.max_length))" + ] + }, + { + "cell_type": "markdown", + "id": "bf736e39-0d47-40c1-8d18-1f716cf7a81e", + "metadata": {}, + "source": [ + "- Finally, let's save the model in case we want to reuse the model later without having to train it again" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "mYnX-gI1CfQY", + "metadata": { + "id": "mYnX-gI1CfQY" + }, + "outputs": [], + "source": [ + "torch.save(model.state_dict(), \"review_classifier.pth\")" + ] + }, + { + "cell_type": "markdown", + "id": "ba78cf7c-6b80-4f71-a50e-3ccc73839af6", + "metadata": {}, + "source": [ + "- Then, in a new session, we could load the model as follows" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "cc4e68a5-d492-493b-87ef-45c475f353f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_state_dict = torch.load(\"review_classifier.pth\")\n", + "model.load_state_dict(model_state_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "5b70ac71-234f-4eeb-b33d-c62726d50cd4", + "metadata": { + "id": "5b70ac71-234f-4eeb-b33d-c62726d50cd4" + }, + "source": [ + "## Summary and takeaways" + ] + }, + { + "cell_type": "markdown", + "id": "dafdc910-d616-47ab-aa85-f90c6e7ed80e", + "metadata": {}, + "source": [ + "- Interested readers can find an introduction to parameter-efficient training with low-rank adaptation (LoRA) in appendix E\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "V100", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
def download_and_load_gpt2(model_size, models_dir):
    """Download the OpenAI GPT-2 checkpoint files and load them into Python objects.

    Args:
        model_size: One of "124M", "355M", "774M", "1558M".
        models_dir: Directory under which a per-size subdirectory is created
            to hold the downloaded files.

    Returns:
        A tuple ``(settings, params)`` where ``settings`` is the parsed
        ``hparams.json`` and ``params`` is the nested dictionary produced by
        ``load_gpt2_params_from_tf_ckpt``.

    Raises:
        ValueError: If ``model_size`` is not one of the allowed sizes.
    """
    # Validate model size before touching the file system or the network
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # URLs always use forward slashes; os.path.join would insert
        # backslashes on Windows and produce an invalid URL.
        file_url = "/".join((base_url, model_size, filename))
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    # Use a context manager so the file handle is closed deterministically
    with open(os.path.join(model_dir, "hparams.json")) as hparams_file:
        settings = json.load(hparams_file)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params


def download_file(url, destination):
    """Stream ``url`` to ``destination``, skipping the download if already present.

    The download is skipped when ``destination`` exists and its size equals
    the ``content-length`` reported by the server.

    Args:
        url: HTTP(S) URL of the file to fetch.
        destination: Local path the file is written to (binary mode).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is never written to disk as if it were the file.
    """
    # Stream the body; the context manager releases the connection on every
    # exit path, including the early return below.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        # Get the total file size from headers, defaulting to 0 if not present
        file_size = int(response.headers.get("content-length", 0))

        # Check if file exists and has the same size
        if os.path.exists(destination):
            file_size_local = os.path.getsize(destination)
            if file_size == file_size_local:
                print(f"File already exists and is up-to-date: {destination}")
                return

        # Define the block size for reading the file
        block_size = 1024  # 1 Kilobyte

        # Initialize the progress bar with total file size
        progress_bar_description = url.split("/")[-1]  # Extract filename from URL
        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
            # Open the destination file in binary write mode
            with open(destination, "wb") as file:
                # Iterate over the file data in chunks
                for chunk in response.iter_content(block_size):
                    progress_bar.update(len(chunk))  # Update progress bar
                    file.write(chunk)  # Write the chunk to the file


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable of a TensorFlow checkpoint into a nested dictionary.

    Variable names are split on "/"; the first component is dropped, a second
    component starting with "h" selects the per-layer dictionary
    ``params["blocks"][N]`` (N parsed from the text after the "h"), and the
    remaining components become nested dictionary keys.

    Args:
        ckpt_path: Checkpoint path as accepted by ``tf.train.list_variables``.
        settings: Mapping that provides ``"n_layer"``, the number of
            per-layer dictionaries to allocate.

    Returns:
        A dictionary with a ``"blocks"`` list (one dict per layer) plus
        top-level entries for variables that do not belong to a layer.
    """
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix

        # Identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        # Walk (and create on demand) the intermediate dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params
#####################################
# Chapter 2
#####################################


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once; each sample is a window of ``max_length``
    token IDs paired with the same window shifted one position to the right.
    Consecutive windows start ``stride`` tokens apart.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer

        # Encode the full text a single time; all windows are cut from this list
        ids = tokenizer.encode(txt)

        # Window start offsets; stopping at len - max_length leaves room for
        # the target window, which ends one token later than the input window
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [
            torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts
        ]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True):
    """Build a ``DataLoader`` over ``GPTDatasetV1`` using the "gpt2" tiktoken encoding."""
    dataset = GPTDatasetV1(
        txt, tiktoken.get_encoding("gpt2"), max_length, stride)

    return DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
#####################################
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a learned output projection."""

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the d_out-wide projection
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Mixes the concatenated head outputs
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions to hide
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', causal_mask)

    def forward(self, x):
        batch, seq_len, _ = x.shape

        # (batch, seq_len, d_out) -> (batch, seq_len, num_heads, head_dim)
        # -> (batch, num_heads, seq_len, head_dim)
        split_shape = (batch, seq_len, self.num_heads, self.head_dim)
        k = self.W_key(x).view(split_shape).transpose(1, 2)
        q = self.W_query(x).view(split_shape).transpose(1, 2)
        v = self.W_value(x).view(split_shape).transpose(1, 2)

        # Per-head dot products between every query and every key
        scores = q @ k.transpose(2, 3)

        # Crop the stored mask to the current sequence length and hide future tokens
        future = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(future, -torch.inf)

        weights = torch.softmax(scores / k.shape[-1]**0.5, dim=-1)
        weights = self.dropout(weights)

        # Back to (batch, seq_len, num_heads, head_dim), then merge the heads
        # into a single d_out-wide vector per token
        merged = (weights @ v).transpose(1, 2).reshape(batch, seq_len, self.d_out)

        return self.out_proj(merged)


#####################################
# Chapter 4
#####################################
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mu = x.mean(dim=-1, keepdim=True)
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        # eps keeps the division finite when the variance is zero
        normalized = (x - mu) / torch.sqrt(sigma2 + self.eps)
        return self.scale * normalized + self.shift


class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * cfg["emb_dim"]
        self.layers = nn.Sequential(
            nn.Linear(width, hidden),
            GELU(),
            nn.Linear(hidden, width),
        )

    def forward(self, x):
        return self.layers(x)


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual."""

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_resid = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Attention sub-layer: normalize, attend, drop, then add the input back
        attended = self.drop_resid(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer follows the same normalize/transform/drop/add pattern
        transformed = self.drop_resid(self.ff(self.norm2(x)))
        return transformed + x


class GPTModel(nn.Module):
    """GPT-style decoder: token + position embeddings, transformer stack, vocab head."""

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *(TransformerBlock(cfg) for _ in range(cfg["n_layers"])))

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        _, seq_len = in_idx.shape
        # Position indices live on the same device as the incoming token IDs
        positions = torch.arange(seq_len, device=in_idx.device)

        # (batch, seq_len, emb_dim); position embeddings broadcast over the batch
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        x = self.drop_emb(x)
        x = self.final_norm(self.trf_blocks(x))
        return self.out_head(x)


def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` (batch, n_tokens) by ``max_new_tokens`` tokens.

    Only the last ``context_size`` tokens are fed to ``model`` at every step;
    the highest-logit token is appended to each row.
    """
    for _ in range(max_new_tokens):
        # Keep the trailing window the model can actually attend over
        window = idx[:, -context_size:]

        with torch.no_grad():
            # Only the prediction at the final position matters:
            # (batch, n_tokens, vocab_size) -> (batch, vocab_size)
            last_logits = model(window)[:, -1, :]

        next_token = torch.argmax(last_logits, dim=-1, keepdim=True)  # (batch, 1)
        idx = torch.cat((idx, next_token), dim=1)  # (batch, n_tokens + 1)

    return idx
#####################################
# Chapter 5
#####################################
def assign(left, right):
    """Return ``right`` wrapped as a new ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Existing parameter; only its ``shape`` is inspected.
        right: Array-like with a ``shape`` attribute holding the new values.

    Raises:
        ValueError: If the two shapes differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))


def load_weights_into_gpt(gpt, params):
    """Copy the arrays in ``params`` into the matching parameters of ``gpt``.

    ``params`` is indexed with the keys ``wpe``, ``wte``, ``g``, ``b`` and a
    ``blocks`` list whose entries hold ``attn``, ``mlp``, ``ln_1`` and
    ``ln_2`` sub-dictionaries. Every assignment goes through ``assign``, so a
    shape mismatch raises ``ValueError``.

    NOTE(review): the query/key/value bias assignments presumably require the
    model to have been built with QKV biases enabled - confirm against caller.
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        block = gpt.trf_blocks[b]
        block_params = params["blocks"][b]
        attn_params = block_params["attn"]
        mlp_params = block_params["mlp"]

        # The checkpoint stores query, key and value fused along the last
        # axis; split into three and transpose the weights before assignment
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        block.att.W_query.weight = assign(block.att.W_query.weight, q_w.T)
        block.att.W_key.weight = assign(block.att.W_key.weight, k_w.T)
        block.att.W_value.weight = assign(block.att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        block.att.W_query.bias = assign(block.att.W_query.bias, q_b)
        block.att.W_key.bias = assign(block.att.W_key.bias, k_b)
        block.att.W_value.bias = assign(block.att.W_value.bias, v_b)

        block.att.out_proj.weight = assign(
            block.att.out_proj.weight, attn_params["c_proj"]["w"].T)
        block.att.out_proj.bias = assign(
            block.att.out_proj.bias, attn_params["c_proj"]["b"])

        # Feed-forward: layers[0] and layers[2] are the two linear layers
        block.ff.layers[0].weight = assign(
            block.ff.layers[0].weight, mlp_params["c_fc"]["w"].T)
        block.ff.layers[0].bias = assign(
            block.ff.layers[0].bias, mlp_params["c_fc"]["b"])
        block.ff.layers[2].weight = assign(
            block.ff.layers[2].weight, mlp_params["c_proj"]["w"].T)
        block.ff.layers[2].bias = assign(
            block.ff.layers[2].bias, mlp_params["c_proj"]["b"])

        block.norm1.scale = assign(block.norm1.scale, block_params["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, block_params["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, block_params["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, block_params["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head is loaded from the same array as the token embedding
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


def generate(model, idx, max_new_tokens, context_size, temperature, top_k=None):
    """Extend ``idx`` by ``max_new_tokens`` tokens with optional top-k / temperature sampling.

    Args:
        model: Callable mapping (batch, n_tokens) token IDs to
            (batch, n_tokens, vocab_size) logits.
        idx: (batch, n_tokens) tensor of token IDs used as the prompt.
        max_new_tokens: Number of tokens to append to every row.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: If greater than 0, logits are divided by it and the next
            token is sampled; otherwise the highest-logit token is taken.
        top_k: If given, logits below the k-th largest value of each row are
            set to -inf before sampling / argmax.

    Returns:
        (batch, n_tokens + max_new_tokens) tensor of token IDs.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # (batch_size, vocab_size)

        # Filter logits with top_k sampling
        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            # Keep the trailing dimension, shape (batch_size, 1), so each
            # row's threshold broadcasts across that row's vocabulary. A
            # (batch_size,) threshold only lines up when batch_size == 1.
            min_val = top_logits[:, -1:]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        if temperature > 0.0:
            # Temperature scaling
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, vocab_size)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
        else:
            # Greedy decoding: take the vocab entry with the highest logit
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        # Append the chosen index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx