{ "cells": [ { "cell_type": "markdown", "id": "99311e42-8467-458d-b918-632c8840b96f", "metadata": {}, "source": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", "
\n", "
\n", "\n", "
\n" ] }, { "cell_type": "markdown", "id": "ab88d307-61ba-45ef-89bc-e3569443dfca", "metadata": {}, "source": [ "# Chapter 2 Exercise solutions" ] }, { "cell_type": "markdown", "id": "2ed9978c-6d8e-401b-9731-bec3802cbb96", "metadata": {}, "source": [ "Packages that are being used in this notebook:" ] }, { "cell_type": "code", "execution_count": 1, "id": "78b55ed6-3312-4e30-89b8-51dc8a4a908f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch version: 2.6.0\n", "tiktoken version: 0.9.0\n" ] } ], "source": [ "from importlib.metadata import version\n", "\n", "print(\"torch version:\", version(\"torch\"))\n", "print(\"tiktoken version:\", version(\"tiktoken\"))" ] }, { "cell_type": "markdown", "id": "6f678e62-7bcb-4405-86ae-dce94f494303", "metadata": {}, "source": [ "# Exercise 2.1" ] }, { "cell_type": "code", "execution_count": 2, "id": "7614337f-f639-42c9-a99b-d33f74fa8a03", "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "\n", "tokenizer = tiktoken.get_encoding(\"gpt2\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "4f235d87-be85-4ddf-95a6-af59fca13d82", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[33901, 86, 343, 86, 220, 959]\n" ] } ], "source": [ "integers = tokenizer.encode(\"Akwirw ier\")\n", "print(integers)" ] }, { "cell_type": "code", "execution_count": 4, "id": "45e4e8f0-3272-48bb-96f6-cced5584ceea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "33901 -> Ak\n", "86 -> w\n", "343 -> ir\n", "86 -> w\n", "220 -> \n", "959 -> ier\n" ] } ], "source": [ "for i in integers:\n", " print(f\"{i} -> {tokenizer.decode([i])}\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[33901]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"Ak\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "d3664332-e6bb-447e-8b96-203aafde8b24", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[86]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"w\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "2773c09d-c136-4372-a2be-04b58d292842", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[343]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"ir\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "8a6abd32-1e0a-4038-9dd2-673f47bcdeb5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[86]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"w\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "26ae940a-9841-4e27-a1df-b83fc8a488b3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[220]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\" \")" ] }, { "cell_type": "code", "execution_count": 10, "id": "a606c39a-6747-4cd8-bb38-e3183f80908d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[959]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.encode(\"ier\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "47c7268d-8fdc-4957-bc68-5be6113f45a7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Akwirw ier'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode([33901, 86, 343, 86, 220, 959])" ] }, { "cell_type": "markdown", "id": "29e5034a-95ed-46d8-9972-589354dc9fd4", "metadata": {}, "source": [ "# Exercise 2.2" ] }, { "cell_type": "code", "execution_count": 12, "id": "4d50af16-937b-49e0-8ffd-42d30cbb41c9", "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "import torch\n", "from torch.utils.data import Dataset, DataLoader\n", "\n", "\n", "class GPTDatasetV1(Dataset):\n", " def __init__(self, txt, tokenizer, max_length, stride):\n", " self.input_ids = []\n", " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", " token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n", " input_chunk = token_ids[i:i + max_length]\n", " target_chunk = token_ids[i + 1: i + max_length + 1]\n", " self.input_ids.append(torch.tensor(input_chunk))\n", " self.target_ids.append(torch.tensor(target_chunk))\n", "\n", " def __len__(self):\n", " return len(self.input_ids)\n", "\n", " def __getitem__(self, idx):\n", " return self.input_ids[idx], self.target_ids[idx]\n", "\n", "\n", "def create_dataloader(txt, batch_size=4, max_length=256, stride=128):\n", " # Initialize the tokenizer\n", " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", "\n", " # Create dataset\n", " dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n", "\n", " # Create dataloader\n", " dataloader = DataLoader(dataset, batch_size=batch_size)\n", "\n", " return dataloader\n", "\n", "\n", "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n", " raw_text = f.read()\n", "\n", "tokenizer = tiktoken.get_encoding(\"gpt2\")\n", "encoded_text = tokenizer.encode(raw_text)" ] }, { "cell_type": "code", "execution_count": 13, "id": "15c184fe-5553-4df2-a77f-7504901b6709", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 40, 367],\n", " [2885, 1464],\n", " [1807, 3619],\n", " [ 402, 271]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataloader = create_dataloader(raw_text, batch_size=4, max_length=2, stride=2)\n", "\n", "for batch in dataloader:\n", " x, y = batch\n", " break\n", "\n", "x" ] }, { "cell_type": "code", "execution_count": 14, "id": "739990b2-ce4c-4d17-88e3-547c8c312019", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 40, 367, 2885, 1464, 1807, 3619, 402, 271],\n", " [ 2885, 1464, 1807, 3619, 402, 271, 10899, 2138],\n", " [ 1807, 3619, 402, 271, 10899, 2138, 257, 7026],\n", " [ 402, 271, 10899, 2138, 257, 7026, 15632, 438]])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataloader = create_dataloader(raw_text, batch_size=4, max_length=8, stride=2)\n", "\n", "for batch in dataloader:\n", " x, y = batch\n", " break\n", "\n", "x" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }