{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "AAAnDw04iAm4" }, "source": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", "
\n", "
\n", "\n", "
\n" ] }, { "cell_type": "markdown", "metadata": { "id": "O9i6kzBsZVaZ" }, "source": [ "# Appendix A: Introduction to PyTorch (Part 2)" ] }, { "cell_type": "markdown", "metadata": { "id": "ppbG5d-NZezH" }, "source": [ "## A.9 Optimizing training performance with GPUs" ] }, { "cell_type": "markdown", "metadata": { "id": "6jH0J_DPZhbn" }, "source": [ "### A.9.1 PyTorch computations on GPU devices" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RM7kGhwMF_nO", "outputId": "b1872617-aacd-46fa-e5f3-f130fd81b246" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2.4.0+cu121\n" ] } ], "source": [ "import torch\n", "\n", "print(torch.__version__)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OXLCKXhiUkZt", "outputId": "e9ca3c58-d92c-4c8b-a9c9-cd7fcc1fedb4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "True\n" ] } ], "source": [ "print(torch.cuda.is_available())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MTTlfh53Va-T", "outputId": "bae76cb5-d1d3-441f-a7c5-93a161e2e86a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([5., 7., 9.])\n" ] } ], "source": [ "tensor_1 = torch.tensor([1., 2., 3.])\n", "tensor_2 = torch.tensor([4., 5., 6.])\n", "\n", "print(tensor_1 + tensor_2)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Z4LwTNw7Vmmb", "outputId": "9ad97923-bc8e-4c49-88bf-48dc1de56804" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([5., 7., 9.], device='cuda:0')\n" ] } ], "source": [ "tensor_1 = tensor_1.to(\"cuda\")\n", "tensor_2 = tensor_2.to(\"cuda\")\n", "\n", "print(tensor_1 + tensor_2)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 158 }, "id": "tKT6URN1Vuft", "outputId": "8396eb18-47c8-47a1-c1b6-8bcb9480fb52" }, "outputs": [ { "ename": "RuntimeError", "evalue": "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_2321/2079609735.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtensor_1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtensor_1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"cpu\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor_1\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtensor_2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!" ] } ], "source": [ "tensor_1 = tensor_1.to(\"cpu\")\n", "print(tensor_1 + tensor_2)" ] }, { "cell_type": "markdown", "metadata": { "id": "c8j1cWDcWAMf" }, "source": [ "### A.9.2 Single-GPU training" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "GyY59cjieitv" }, "outputs": [], "source": [ "X_train = torch.tensor([\n", " [-1.2, 3.1],\n", " [-0.9, 2.9],\n", " [-0.5, 2.6],\n", " [2.3, -1.1],\n", " [2.7, -1.5]\n", "])\n", "\n", "y_train = torch.tensor([0, 0, 0, 1, 1])\n", "\n", "X_test = torch.tensor([\n", " [-0.8, 2.8],\n", " [2.6, -1.6],\n", "])\n", "\n", "y_test = torch.tensor([0, 1])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "v41gKqEJempa" }, "outputs": [], "source": [ "from torch.utils.data import Dataset\n", "\n", "\n", "class ToyDataset(Dataset):\n", " def __init__(self, X, y):\n", " self.features = X\n", " self.labels = y\n", "\n", " def __getitem__(self, index):\n", " one_x = self.features[index]\n", " one_y = self.labels[index]\n", " return one_x, one_y\n", "\n", " def __len__(self):\n", " return self.labels.shape[0]\n", "\n", "train_ds = ToyDataset(X_train, y_train)\n", "test_ds = ToyDataset(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "UPGVRuylep8Y" }, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "\n", "torch.manual_seed(123)\n", "\n", "train_loader = DataLoader(\n", " dataset=train_ds,\n", " batch_size=2,\n", " shuffle=True,\n", " num_workers=1,\n", " drop_last=True\n", ")\n", "\n", "test_loader = DataLoader(\n", " dataset=test_ds,\n", " batch_size=2,\n", " shuffle=False,\n", " num_workers=1\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "drhg6IXofAXh" }, "outputs": [], "source": [ "class NeuralNetwork(torch.nn.Module):\n", " def __init__(self, num_inputs, num_outputs):\n", " super().__init__()\n", "\n", " self.layers = torch.nn.Sequential(\n", "\n", " # 1st hidden layer\n", " torch.nn.Linear(num_inputs, 30),\n", " torch.nn.ReLU(),\n", "\n", " # 2nd hidden layer\n", " torch.nn.Linear(30, 20),\n", " torch.nn.ReLU(),\n", "\n", " # output layer\n", " torch.nn.Linear(20, num_outputs),\n", " )\n", "\n", " def forward(self, x):\n", " logits = self.layers(x)\n", " return logits" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7jaS5sqPWCY0", "outputId": "8a5cd93d-671c-4abf-d5cd-97845f300ffd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75\n", "Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65\n", "Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44\n", "Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13\n", "Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03\n", "Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00\n" ] } ], "source": [ "import torch.nn.functional as F\n", "\n", "\n", "torch.manual_seed(123)\n", "model = NeuralNetwork(num_inputs=2, num_outputs=2)\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # NEW\n", "model.to(device) # NEW\n", "\n", "# Note that the book originally used the following line, but the \"model =\" is redundant\n", "# model = model.to(device) # NEW\n", "\n", "optimizer = torch.optim.SGD(model.parameters(), lr=0.5)\n", "\n", "num_epochs = 3\n", "\n", "for epoch in range(num_epochs):\n", "\n", " model.train()\n", " for batch_idx, (features, labels) in enumerate(train_loader):\n", "\n", " features, labels = features.to(device), labels.to(device) # NEW\n", " logits = model(features)\n", " loss = F.cross_entropy(logits, labels) # Loss function\n", "\n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", "\n", " ### LOGGING\n", " print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n", " f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n", " f\" | Train/Val Loss: {loss:.2f}\")\n", "\n", " model.eval()\n", " # Optional model evaluation" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "4qrlmnPPe7FO" }, "outputs": [], "source": [ "def compute_accuracy(model, dataloader, device):\n", "\n", " model = model.eval()\n", " correct = 0.0\n", " total_examples = 0\n", "\n", " for idx, (features, labels) in enumerate(dataloader):\n", "\n", " features, labels = features.to(device), labels.to(device) # New\n", "\n", " with torch.no_grad():\n", " logits = model(features)\n", "\n", " predictions = torch.argmax(logits, dim=1)\n", " compare = labels == predictions\n", " correct += torch.sum(compare)\n", " total_examples += len(compare)\n", "\n", " return (correct / total_examples).item()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1_-BfkfEf4HX", "outputId": "9453154f-0a5b-4a44-a3c9-f010e08d5a2c" }, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compute_accuracy(model, train_loader, device=device)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iYtXKBGEgKss", "outputId": "d6cc870a-34de-490e-e5d3-23e6956744bd" }, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compute_accuracy(model, test_loader, device=device)" ] }, { "cell_type": "markdown", "metadata": { "id": "nc2LGFVbiAnB" }, "source": [ "### A.9.3 Training with multiple GPUs" ] }, { "cell_type": "markdown", "metadata": { "id": "cOUza9iQiAnC" }, "source": [ "See [DDP-script.py](DDP-script.py)" ] }, { "cell_type": "markdown", "metadata": { "id": "YOYk5Fh7iAnC" }, "source": [ "\n", "" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 4 }