mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-06-26 23:50:03 +00:00
502 lines
12 KiB
Plaintext
502 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "AAAnDw04iAm4"
|
|
},
|
|
"source": [
|
|
"<table style=\"width:100%\">\n",
|
|
"<tr>\n",
|
|
"<td style=\"vertical-align:middle; text-align:left;\">\n",
|
|
"<font size=\"2\">\n",
|
|
"Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
|
|
"<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
|
|
"</font>\n",
|
|
"</td>\n",
|
|
"<td style=\"vertical-align:middle; text-align:left;\">\n",
|
|
"<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
|
|
"</td>\n",
|
|
"</tr>\n",
|
|
"</table>\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "O9i6kzBsZVaZ"
|
|
},
|
|
"source": [
|
|
"# Appendix A: Introduction to PyTorch (Part 2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "ppbG5d-NZezH"
|
|
},
|
|
"source": [
|
|
"## A.9 Optimizing training performance with GPUs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "6jH0J_DPZhbn"
|
|
},
|
|
"source": [
|
|
"### A.9.1 PyTorch computations on GPU devices"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "RM7kGhwMF_nO",
|
|
"outputId": "b1872617-aacd-46fa-e5f3-f130fd81b246"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2.4.0+cu121\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import torch\n",
|
|
"\n",
|
|
"print(torch.__version__)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "OXLCKXhiUkZt",
|
|
"outputId": "e9ca3c58-d92c-4c8b-a9c9-cd7fcc1fedb4"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"True\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(torch.cuda.is_available())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "MTTlfh53Va-T",
|
|
"outputId": "bae76cb5-d1d3-441f-a7c5-93a161e2e86a"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tensor([5., 7., 9.])\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"tensor_1 = torch.tensor([1., 2., 3.])\n",
|
|
"tensor_2 = torch.tensor([4., 5., 6.])\n",
|
|
"\n",
|
|
"print(tensor_1 + tensor_2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "Z4LwTNw7Vmmb",
|
|
"outputId": "9ad97923-bc8e-4c49-88bf-48dc1de56804"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tensor([5., 7., 9.], device='cuda:0')\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"tensor_1 = tensor_1.to(\"cuda\")\n",
|
|
"tensor_2 = tensor_2.to(\"cuda\")\n",
|
|
"\n",
|
|
"print(tensor_1 + tensor_2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 158
|
|
},
|
|
"id": "tKT6URN1Vuft",
|
|
"outputId": "8396eb18-47c8-47a1-c1b6-8bcb9480fb52"
|
|
},
|
|
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!\n"
]
}
],
"source": [
"# Move only one operand back to the CPU so the two operands live on different devices\n",
"tensor_1 = tensor_1.to(\"cpu\")\n",
"\n",
"# Adding tensors from different devices raises a RuntimeError. The error is the\n",
"# point of this cell, so catch and display it; that way the cells below still\n",
"# execute when the notebook is run top to bottom.\n",
"try:\n",
"    print(tensor_1 + tensor_2)\n",
"except RuntimeError as err:\n",
"    print(f\"RuntimeError: {err}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "c8j1cWDcWAMf"
|
|
},
|
|
"source": [
|
|
"### A.9.2 Single-GPU training"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"id": "GyY59cjieitv"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train = torch.tensor([\n",
|
|
" [-1.2, 3.1],\n",
|
|
" [-0.9, 2.9],\n",
|
|
" [-0.5, 2.6],\n",
|
|
" [2.3, -1.1],\n",
|
|
" [2.7, -1.5]\n",
|
|
"])\n",
|
|
"\n",
|
|
"y_train = torch.tensor([0, 0, 0, 1, 1])\n",
|
|
"\n",
|
|
"X_test = torch.tensor([\n",
|
|
" [-0.8, 2.8],\n",
|
|
" [2.6, -1.6],\n",
|
|
"])\n",
|
|
"\n",
|
|
"y_test = torch.tensor([0, 1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"id": "v41gKqEJempa"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from torch.utils.data import Dataset\n",
|
|
"\n",
|
|
"\n",
|
|
"class ToyDataset(Dataset):\n",
"    \"\"\"Map-style dataset that pairs every feature row with its class label.\n",
"\n",
"    X and y are stored as given. y has to expose `.shape`, because the\n",
"    dataset length is read from its first dimension.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, X, y):\n",
"        self.features, self.labels = X, y\n",
"\n",
"    def __len__(self):\n",
"        # One example per entry along the first dimension of the labels\n",
"        return self.labels.shape[0]\n",
"\n",
"    def __getitem__(self, index):\n",
"        # Hand back the (features, label) pair stored at position `index`\n",
"        return self.features[index], self.labels[index]\n",
|
|
"\n",
|
|
"train_ds = ToyDataset(X_train, y_train)\n",
|
|
"test_ds = ToyDataset(X_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"id": "UPGVRuylep8Y"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from torch.utils.data import DataLoader\n",
|
|
"\n",
|
|
"torch.manual_seed(123)\n",
"\n",
"# Training batches: shuffled, and an incomplete final batch is dropped\n",
"train_loader = DataLoader(\n",
"    train_ds,\n",
"    batch_size=2,\n",
"    shuffle=True,\n",
"    drop_last=True,\n",
"    num_workers=1,\n",
")\n",
"\n",
"# Test batches: served in dataset order, nothing is dropped\n",
"test_loader = DataLoader(test_ds, batch_size=2, shuffle=False, num_workers=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {
|
|
"id": "drhg6IXofAXh"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"class NeuralNetwork(torch.nn.Module):\n",
"    \"\"\"Fully connected classifier: two ReLU hidden layers (30 and 20 units) and a linear output layer.\"\"\"\n",
"\n",
"    def __init__(self, num_inputs, num_outputs):\n",
"        super().__init__()\n",
"        nn = torch.nn  # local shorthand\n",
"\n",
"        self.layers = nn.Sequential(\n",
"            nn.Linear(num_inputs, 30),  # 1st hidden layer\n",
"            nn.ReLU(),\n",
"            nn.Linear(30, 20),  # 2nd hidden layer\n",
"            nn.ReLU(),\n",
"            nn.Linear(20, num_outputs),  # output layer\n",
"        )\n",
"\n",
"    def forward(self, x):\n",
"        # The raw scores (logits) are returned; no softmax is applied here\n",
"        return self.layers(x)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "7jaS5sqPWCY0",
|
|
"outputId": "8a5cd93d-671c-4abf-d5cd-97845f300ffd"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75\n",
|
|
"Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65\n",
|
|
"Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44\n",
|
|
"Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13\n",
|
|
"Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03\n",
|
|
"Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import torch.nn.functional as F\n",
|
|
"\n",
|
|
"\n",
|
|
"torch.manual_seed(123)  # seed first so the initial weights are reproducible\n",
"model = NeuralNetwork(num_inputs=2, num_outputs=2)\n",
"\n",
"# Run on the GPU when one is visible, otherwise stay on the CPU\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")  # NEW\n",
"\n",
"# Module.to() moves the parameters in place, so the assignment printed in the\n",
"# book (model = model.to(device)) is not needed\n",
"model.to(device)  # NEW\n",
"\n",
"num_epochs = 3\n",
"optimizer = torch.optim.SGD(model.parameters(), lr=0.5)\n",
"\n",
"for epoch in range(num_epochs):\n",
"\n",
"    model.train()\n",
"    for batch_idx, (features, labels) in enumerate(train_loader):\n",
"\n",
"        # Inputs and targets have to sit on the same device as the model\n",
"        features = features.to(device)  # NEW\n",
"        labels = labels.to(device)  # NEW\n",
"\n",
"        # Clear stale gradients, then forward pass, backward pass and update\n",
"        optimizer.zero_grad()\n",
"        logits = model(features)\n",
"        loss = F.cross_entropy(logits, labels)  # Loss function\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"\n",
"        ### LOGGING\n",
"        print(\n",
"            f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
"            f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
"            f\" | Train/Val Loss: {loss:.2f}\"\n",
"        )\n",
"\n",
"    # Switch to evaluation mode at the end of every epoch\n",
"    model.eval()\n",
"    # Optional model evaluation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {
|
|
"id": "4qrlmnPPe7FO"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def compute_accuracy(model, dataloader, device):\n",
"    \"\"\"Return the fraction of examples in `dataloader` that `model` classifies correctly.\n",
"\n",
"    Args:\n",
"        model: module mapping a batch of features to per-class logits.\n",
"        dataloader: iterable yielding (features, labels) batches.\n",
"        device: device each batch is moved to before the forward pass.\n",
"            The model itself is not moved by this function.\n",
"\n",
"    Returns:\n",
"        The accuracy as a Python float, or NaN when no examples were seen.\n",
"\n",
"    Note:\n",
"        The model is switched to evaluation mode and left in that mode.\n",
"    \"\"\"\n",
"    model = model.eval()\n",
"    correct = 0.0\n",
"    total_examples = 0\n",
"\n",
"    # No gradients are needed for evaluation, so disable tracking for the whole pass\n",
"    with torch.no_grad():\n",
"        for features, labels in dataloader:\n",
"\n",
"            features, labels = features.to(device), labels.to(device)  # NEW\n",
"\n",
"            logits = model(features)\n",
"            predictions = torch.argmax(logits, dim=1)  # predicted class = largest logit\n",
"\n",
"            compare = labels == predictions\n",
"            correct += torch.sum(compare)\n",
"            total_examples += len(compare)\n",
"\n",
"    # Accuracy is undefined without examples; return NaN instead of dividing by zero\n",
"    if total_examples == 0:\n",
"        return float(\"nan\")\n",
"\n",
"    return (correct / total_examples).item()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "1_-BfkfEf4HX",
|
|
"outputId": "9453154f-0a5b-4a44-a3c9-f010e08d5a2c"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"1.0"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"compute_accuracy(model, train_loader, device=device)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "iYtXKBGEgKss",
|
|
"outputId": "d6cc870a-34de-490e-e5d3-23e6956744bd"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"1.0"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"compute_accuracy(model, test_loader, device=device)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "nc2LGFVbiAnB"
|
|
},
|
|
"source": [
|
|
"### A.9.3 Training with multiple GPUs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "cOUza9iQiAnC"
|
|
},
|
|
"source": [
|
|
"See [DDP-script.py](DDP-script.py)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "YOYk5Fh7iAnC"
|
|
},
|
|
"source": [
|
|
"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/12.webp\" width=\"600px\">\n",
|
|
"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/13.webp\" width=\"600px\">"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"accelerator": "GPU",
|
|
"colab": {
|
|
"gpuType": "T4",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.16"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|