mirror of
				https://github.com/rasbt/LLMs-from-scratch.git
				synced 2025-10-30 17:29:59 +00:00 
			
		
		
		
	 790d0808b2
			
		
	
	
		790d0808b2
		
	
	
	
	
		
			
			* Organized setup instructions * update tets * link checker action * raise error upon broken link * fix links * fix links * delete duplicated paragraph
		
			
				
	
	
		
			485 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			485 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | |
|  "cells": [
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "<font size=\"1\">\n",
 | |
|     "Supplementary code for \"Build a Large Language Model From Scratch\": <a href=\"https://www.manning.com/books/build-a-large-language-model-from-scratch\">https://www.manning.com/books/build-a-large-language-model-from-scratch</a> by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
 | |
|     "Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
 | |
|     "</font>"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {
 | |
|     "id": "O9i6kzBsZVaZ"
 | |
|    },
 | |
|    "source": [
 | |
|     "# Appendix A: Introduction to PyTorch (Part 2)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {
 | |
|     "id": "ppbG5d-NZezH"
 | |
|    },
 | |
|    "source": [
 | |
|     "## A.9 Optimizing training performance with GPUs"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {
 | |
|     "id": "6jH0J_DPZhbn"
 | |
|    },
 | |
|    "source": [
 | |
|     "### A.9.1 PyTorch computations on GPU devices"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 1,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/"
 | |
|     },
 | |
|     "id": "RM7kGhwMF_nO",
 | |
|     "outputId": "ac60b048-b81f-4bb0-90fa-1ca474f04e9a"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "2.0.1+cu118\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "import torch\n",
 | |
|     "\n",
 | |
|     "print(torch.__version__)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 2,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/"
 | |
|     },
 | |
|     "id": "OXLCKXhiUkZt",
 | |
|     "outputId": "39fe5366-287e-47eb-cc34-3508d616c4f9"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "True\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "print(torch.cuda.is_available())"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 3,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/"
 | |
|     },
 | |
|     "id": "MTTlfh53Va-T",
 | |
|     "outputId": "f31d8bbe-577f-4db4-9939-02e66b9f96d1"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "data": {
 | |
|       "text/plain": [
 | |
|        "tensor([5., 7., 9.])"
 | |
|       ]
 | |
|      },
 | |
|      "execution_count": 3,
 | |
|      "metadata": {},
 | |
|      "output_type": "execute_result"
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "tensor_1 = torch.tensor([1., 2., 3.])\n",
 | |
|     "tensor_2 = torch.tensor([4., 5., 6.])\n",
 | |
|     "\n",
 | |
|     "print(tensor_1 + tensor_2)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 5,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/"
 | |
|     },
 | |
|     "id": "Z4LwTNw7Vmmb",
 | |
|     "outputId": "1c025c6a-e3ed-4c7c-f5fd-86c14607036e"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "tensor([5., 7., 9.], device='cuda:0')\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "tensor_1 = tensor_1.to(\"cuda\")\n",
 | |
|     "tensor_2 = tensor_2.to(\"cuda\")\n",
 | |
|     "\n",
 | |
|     "print(tensor_1 + tensor_2)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 7,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/",
 | |
|      "height": 184
 | |
|     },
 | |
|     "id": "tKT6URN1Vuft",
 | |
|     "outputId": "e6f01e7f-d9cf-44cb-cc6d-46fc7907d5c0"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "ename": "RuntimeError",
 | |
|      "evalue": "ignored",
 | |
|      "output_type": "error",
 | |
|      "traceback": [
 | |
|       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 | |
|       "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
 | |
|       "\u001b[0;32m<ipython-input-7-4ff3c4d20fc3>\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mtensor_1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtensor_1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"cpu\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor_1\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtensor_2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
 | |
|       "\u001b[0;31mRuntimeError\u001b[0m: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "tensor_1 = tensor_1.to(\"cpu\")\n",
 | |
|     "print(tensor_1 + tensor_2)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {
 | |
|     "id": "c8j1cWDcWAMf"
 | |
|    },
 | |
|    "source": [
 | |
|     "### A.9.2 Single-GPU training"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 8,
 | |
|    "metadata": {
 | |
|     "id": "GyY59cjieitv"
 | |
|    },
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "X_train = torch.tensor([\n",
 | |
|     "    [-1.2, 3.1],\n",
 | |
|     "    [-0.9, 2.9],\n",
 | |
|     "    [-0.5, 2.6],\n",
 | |
|     "    [2.3, -1.1],\n",
 | |
|     "    [2.7, -1.5]\n",
 | |
|     "])\n",
 | |
|     "\n",
 | |
|     "y_train = torch.tensor([0, 0, 0, 1, 1])\n",
 | |
|     "\n",
 | |
|     "X_test = torch.tensor([\n",
 | |
|     "    [-0.8, 2.8],\n",
 | |
|     "    [2.6, -1.6],\n",
 | |
|     "])\n",
 | |
|     "\n",
 | |
|     "y_test = torch.tensor([0, 1])"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 9,
 | |
|    "metadata": {
 | |
|     "id": "v41gKqEJempa"
 | |
|    },
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "from torch.utils.data import Dataset\n",
 | |
|     "\n",
 | |
|     "\n",
 | |
|     "class ToyDataset(Dataset):\n",
 | |
|     "    \"\"\"Map-style dataset that pairs each entry of `X` with the entry of `y` at the same index.\"\"\"\n",
 | |
|     "\n",
 | |
|     "    def __init__(self, X, y):\n",
 | |
|     "        # Only references are stored; the inputs are not copied or converted\n",
 | |
|     "        self.features = X\n",
 | |
|     "        self.labels = y\n",
 | |
|     "\n",
 | |
|     "    def __getitem__(self, index):\n",
 | |
|     "        # One example: (features at `index`, label at `index`)\n",
 | |
|     "        return self.features[index], self.labels[index]\n",
 | |
|     "\n",
 | |
|     "    def __len__(self):\n",
 | |
|     "        # Dataset size is the length of the first dimension of the labels\n",
 | |
|     "        num_examples = self.labels.shape[0]\n",
 | |
|     "        return num_examples\n",
 | |
|     "\n",
 | |
|     "train_ds = ToyDataset(X_train, y_train)\n",
 | |
|     "test_ds = ToyDataset(X_test, y_test)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 23,
 | |
|    "metadata": {
 | |
|     "id": "UPGVRuylep8Y"
 | |
|    },
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "from torch.utils.data import DataLoader\n",
 | |
|     "\n",
 | |
|     "torch.manual_seed(123)\n",
 | |
|     "\n",
 | |
|     "train_loader = DataLoader(\n",
 | |
|     "    dataset=train_ds,\n",
 | |
|     "    batch_size=2,\n",
 | |
|     "    shuffle=True,\n",
 | |
|     "    num_workers=1,\n",
 | |
|     "    drop_last=True\n",
 | |
|     ")\n",
 | |
|     "\n",
 | |
|     "test_loader = DataLoader(\n",
 | |
|     "    dataset=test_ds,\n",
 | |
|     "    batch_size=2,\n",
 | |
|     "    shuffle=False,\n",
 | |
|     "    num_workers=1\n",
 | |
|     ")"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 24,
 | |
|    "metadata": {
 | |
|     "id": "drhg6IXofAXh"
 | |
|    },
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "class NeuralNetwork(torch.nn.Module):\n",
 | |
|     "    \"\"\"Fully connected network: two ReLU hidden layers (30 and 20 units) and a linear output layer.\"\"\"\n",
 | |
|     "\n",
 | |
|     "    def __init__(self, num_inputs, num_outputs):\n",
 | |
|     "        super().__init__()\n",
 | |
|     "\n",
 | |
|     "        # Modules are listed in forward order; unpacking them into Sequential\n",
 | |
|     "        # keeps the Linear layers at container positions 0, 2 and 4\n",
 | |
|     "        stack = [\n",
 | |
|     "            torch.nn.Linear(num_inputs, 30),   # 1st hidden layer\n",
 | |
|     "            torch.nn.ReLU(),\n",
 | |
|     "            torch.nn.Linear(30, 20),           # 2nd hidden layer\n",
 | |
|     "            torch.nn.ReLU(),\n",
 | |
|     "            torch.nn.Linear(20, num_outputs),  # output layer\n",
 | |
|     "        ]\n",
 | |
|     "        self.layers = torch.nn.Sequential(*stack)\n",
 | |
|     "\n",
 | |
|     "    def forward(self, x):\n",
 | |
|     "        # The output of the last Linear layer is returned as-is (no softmax applied here)\n",
 | |
|     "        return self.layers(x)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 25,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/"
 | |
|     },
 | |
|     "id": "7jaS5sqPWCY0",
 | |
|     "outputId": "84c74615-38f2-48b8-eeda-b5912fed1d3a"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "name": "stdout",
 | |
|      "output_type": "stream",
 | |
|      "text": [
 | |
|       "Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75\n",
 | |
|       "Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65\n",
 | |
|       "Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44\n",
 | |
|       "Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13\n",
 | |
|       "Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03\n",
 | |
|       "Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00\n"
 | |
|      ]
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "import torch.nn.functional as F\n",
 | |
|     "\n",
 | |
|     "\n",
 | |
|     "torch.manual_seed(123)\n",
 | |
|     "model = NeuralNetwork(num_inputs=2, num_outputs=2)\n",
 | |
|     "\n",
 | |
|     "# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU\n",
 | |
|     "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # NEW\n",
 | |
|     "model = model.to(device) # NEW\n",
 | |
|     "\n",
 | |
|     "optimizer = torch.optim.SGD(model.parameters(), lr=0.5)\n",
 | |
|     "\n",
 | |
|     "num_epochs = 3\n",
 | |
|     "\n",
 | |
|     "for epoch in range(num_epochs):\n",
 | |
|     "\n",
 | |
|     "    model.train()\n",
 | |
|     "    for batch_idx, (features, labels) in enumerate(train_loader):\n",
 | |
|     "\n",
 | |
|     "        # Each batch has to be on the same device as the model before the forward pass\n",
 | |
|     "        features = features.to(device) # NEW\n",
 | |
|     "        labels = labels.to(device) # NEW\n",
 | |
|     "\n",
 | |
|     "        # Forward pass and loss\n",
 | |
|     "        logits = model(features)\n",
 | |
|     "        loss = F.cross_entropy(logits, labels)\n",
 | |
|     "\n",
 | |
|     "        # Reset gradients from the previous step, backpropagate, update the parameters\n",
 | |
|     "        optimizer.zero_grad()\n",
 | |
|     "        loss.backward()\n",
 | |
|     "        optimizer.step()\n",
 | |
|     "\n",
 | |
|     "        ### LOGGING\n",
 | |
|     "        print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
 | |
|     "              f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
 | |
|     "              f\" | Train/Val Loss: {loss:.2f}\")\n",
 | |
|     "\n",
 | |
|     "    # Leave the model in evaluation mode at the end of every epoch\n",
 | |
|     "    model.eval()\n",
 | |
|     "    # Optional model evaluation"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 26,
 | |
|    "metadata": {
 | |
|     "id": "4qrlmnPPe7FO"
 | |
|    },
 | |
|    "outputs": [],
 | |
|    "source": [
 | |
|     "def compute_accuracy(model, dataloader, device):\n",
 | |
|     "    \"\"\"Return the fraction of examples in `dataloader` that `model` classifies correctly.\n",
 | |
|     "\n",
 | |
|     "    Every batch is moved to `device` before the forward pass, and the predicted\n",
 | |
|     "    class is the argmax over dimension 1 of the model output. The model is put\n",
 | |
|     "    into evaluation mode and is left in that mode when the function returns.\n",
 | |
|     "\n",
 | |
|     "    Args:\n",
 | |
|     "        model: Module called as `model(features)`.\n",
 | |
|     "        dataloader: Iterable yielding `(features, labels)` batches.\n",
 | |
|     "        device: Device the batches are moved to before the forward pass.\n",
 | |
|     "\n",
 | |
|     "    Returns:\n",
 | |
|     "        The accuracy as a Python float, or NaN if `dataloader` yields no examples.\n",
 | |
|     "    \"\"\"\n",
 | |
|     "\n",
 | |
|     "    model = model.eval()\n",
 | |
|     "    # Starts as a float so that adding the first batch count turns it into a\n",
 | |
|     "    # floating-point tensor, which the final division and .item() rely on\n",
 | |
|     "    correct = 0.0\n",
 | |
|     "    total_examples = 0\n",
 | |
|     "\n",
 | |
|     "    # No gradients are needed for evaluation\n",
 | |
|     "    with torch.no_grad():\n",
 | |
|     "        for features, labels in dataloader:\n",
 | |
|     "\n",
 | |
|     "            features, labels = features.to(device), labels.to(device) # New\n",
 | |
|     "\n",
 | |
|     "            logits = model(features)\n",
 | |
|     "            predictions = torch.argmax(logits, dim=1)\n",
 | |
|     "            compare = labels == predictions\n",
 | |
|     "            correct += torch.sum(compare)\n",
 | |
|     "            total_examples += len(compare)\n",
 | |
|     "\n",
 | |
|     "    # An empty dataloader would otherwise fail with a division by zero\n",
 | |
|     "    if total_examples == 0:\n",
 | |
|     "        return float(\"nan\")\n",
 | |
|     "\n",
 | |
|     "    return (correct / total_examples).item()"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 27,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/"
 | |
|     },
 | |
|     "id": "1_-BfkfEf4HX",
 | |
|     "outputId": "473bf21d-5880-4de3-fc8a-051d75315b94"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "data": {
 | |
|       "text/plain": [
 | |
|        "1.0"
 | |
|       ]
 | |
|      },
 | |
|      "execution_count": 27,
 | |
|      "metadata": {},
 | |
|      "output_type": "execute_result"
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "compute_accuracy(model, train_loader, device=device)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "code",
 | |
|    "execution_count": 21,
 | |
|    "metadata": {
 | |
|     "colab": {
 | |
|      "base_uri": "https://localhost:8080/"
 | |
|     },
 | |
|     "id": "iYtXKBGEgKss",
 | |
|     "outputId": "508edd84-3fb7-4d04-cb23-9df0c3d24170"
 | |
|    },
 | |
|    "outputs": [
 | |
|     {
 | |
|      "data": {
 | |
|       "text/plain": [
 | |
|        "1.0"
 | |
|       ]
 | |
|      },
 | |
|      "execution_count": 21,
 | |
|      "metadata": {},
 | |
|      "output_type": "execute_result"
 | |
|     }
 | |
|    ],
 | |
|    "source": [
 | |
|     "compute_accuracy(model, test_loader, device=device)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "### A.9.3 Training with multiple GPUs"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "See [DDP-script.py](DDP-script.py)"
 | |
|    ]
 | |
|   },
 | |
|   {
 | |
|    "cell_type": "markdown",
 | |
|    "metadata": {},
 | |
|    "source": [
 | |
|     "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/12.webp\" width=\"600px\">\n",
 | |
|     "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/13.webp\" width=\"600px\">"
 | |
|    ]
 | |
|   }
 | |
|  ],
 | |
|  "metadata": {
 | |
|   "accelerator": "GPU",
 | |
|   "colab": {
 | |
|    "gpuType": "T4",
 | |
|    "provenance": []
 | |
|   },
 | |
|   "kernelspec": {
 | |
|    "display_name": "Python 3 (ipykernel)",
 | |
|    "language": "python",
 | |
|    "name": "python3"
 | |
|   },
 | |
|   "language_info": {
 | |
|    "codemirror_mode": {
 | |
|     "name": "ipython",
 | |
|     "version": 3
 | |
|    },
 | |
|    "file_extension": ".py",
 | |
|    "mimetype": "text/x-python",
 | |
|    "name": "python",
 | |
|    "nbconvert_exporter": "python",
 | |
|    "pygments_lexer": "ipython3",
 | |
|    "version": "3.11.4"
 | |
|   }
 | |
|  },
 | |
|  "nbformat": 4,
 | |
|  "nbformat_minor": 4
 | |
| }
 |