{ "cells": [ { "cell_type": "markdown", "id": "c503e5ef-6bb4-45c3-ac49-0e016cedd8d0", "metadata": {}, "source": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", "
\n", "
\n", "\n", "
\n" ] }, { "cell_type": "markdown", "id": "8a9e554f-58e3-4787-832d-d149add1b857", "metadata": {}, "source": [ "- Install the additional package requirements for this bonus notebook by uncommenting and running the following cell:" ] }, { "cell_type": "code", "execution_count": 1, "id": "d70bae22-b540-4a13-ab01-e748cb9d55c9", "metadata": {}, "outputs": [], "source": [ "# pip install -r requirements-extra.txt" ] }, { "cell_type": "markdown", "id": "737c59bb-5922-46fc-a787-1369d70925b4", "metadata": {}, "source": [ "# Comparing Various Byte Pair Encoding (BPE) Implementations" ] }, { "cell_type": "markdown", "id": "a9adc3bf-353c-411e-a471-0e92786e7103", "metadata": {}, "source": [ "
\n", " \n", "\n", "## Using BPE from `tiktoken`" ] }, { "cell_type": "code", "execution_count": 2, "id": "1c490fca-a48a-47fa-a299-322d1a08ad17", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tiktoken version: 0.9.0\n" ] } ], "source": [ "from importlib.metadata import version\n", "\n", "print(\"tiktoken version:\", version(\"tiktoken\"))" ] }, { "cell_type": "code", "execution_count": 3, "id": "0952667c-ce84-4f21-87db-59f52b44cec4", "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "\n", "tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n", "\n", "text = \"Hello, world. Is this-- a test?\"" ] }, { "cell_type": "code", "execution_count": 4, "id": "b039c350-18ad-48fb-8e6a-085702dfc330", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n" ] } ], "source": [ "integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n", "\n", "print(integers)" ] }, { "cell_type": "code", "execution_count": 5, "id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello, world. Is this-- a test?\n" ] } ], "source": [ "strings = tik_tokenizer.decode(integers)\n", "\n", "print(strings)" ] }, { "cell_type": "code", "execution_count": 6, "id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "50257\n" ] } ], "source": [ "print(tik_tokenizer.n_vocab)" ] }, { "cell_type": "markdown", "id": "6a0b5d4f-2af9-40de-828c-063c4243e771", "metadata": {}, "source": [ "
\n", " \n", "\n", "## Using the original BPE implementation used in GPT-2" ] }, { "cell_type": "code", "execution_count": 7, "id": "0903108c-65cb-4ae1-967a-2155e25349c2", "metadata": {}, "outputs": [], "source": [ "from bpe_openai_gpt2 import get_encoder, download_vocab" ] }, { "cell_type": "code", "execution_count": 8, "id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s] \n", "Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s] \n" ] } ], "source": [ "download_vocab()" ] }, { "cell_type": "code", "execution_count": 9, "id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0", "metadata": {}, "outputs": [], "source": [ "orig_tokenizer = get_encoder(model_name=\"gpt2_model\", models_dir=\".\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "2740510c-a78a-4fba-ae18-2b156ba2dfef", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n" ] } ], "source": [ "integers = orig_tokenizer.encode(text)\n", "\n", "print(integers)" ] }, { "cell_type": "code", "execution_count": 11, "id": "434d115e-990d-42ad-88dd-31323a96e10f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Hello, world. Is this-- a test?\n" ] } ], "source": [ "strings = orig_tokenizer.decode(integers)\n", "\n", "print(strings)" ] }, { "cell_type": "markdown", "id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86", "metadata": {}, "source": [ "
\n", " \n", "\n", "## Using the BPE via Hugging Face transformers" ] }, { "cell_type": "code", "execution_count": 12, "id": "e9077bf4-f91f-42ad-ab76-f3d89128510e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/sebastian/Developer/LLMs-from-scratch/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "data": { "text/plain": [ "'4.49.0'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import transformers\n", "\n", "transformers.__version__" ] }, { "cell_type": "code", "execution_count": 13, "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8", "metadata": {}, "outputs": [], "source": [ "from transformers import GPT2Tokenizer\n", "\n", "hf_tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "222cbd69-6a3d-4868-9c1f-421ffc9d5fe1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hf_tokenizer(strings)[\"input_ids\"]" ] }, { "cell_type": "code", "execution_count": 15, "id": "a6233552", "metadata": {}, "outputs": [], "source": [ "from transformers import GPT2TokenizerFast\n", "\n", "hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained(\"gpt2\")" ] }, { "cell_type": "code", "execution_count": 16, "id": "fa5ca643", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hf_tokenizer_fast(strings)[\"input_ids\"]" ] }, { "cell_type": "markdown", "id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9", "metadata": {}, "source": [ "
\n", " \n", "\n", "## Using my own from-scratch BPE tokenizer" ] }, { "cell_type": "code", "execution_count": 17, "id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c", "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "import io\n", "import nbformat\n", "import types\n", "\n", "def import_from_notebook():\n", " def import_definitions_from_notebook(fullname, names):\n", " current_dir = os.getcwd()\n", " path = os.path.join(current_dir, \"..\", \"05_bpe-from-scratch\", fullname + \".ipynb\")\n", " path = os.path.normpath(path)\n", "\n", " # Load the notebook\n", " if not os.path.exists(path):\n", " raise FileNotFoundError(f\"Notebook file not found at: {path}\")\n", "\n", " with io.open(path, \"r\", encoding=\"utf-8\") as f:\n", " nb = nbformat.read(f, as_version=4)\n", "\n", " # Create a module to store the imported functions and classes\n", " mod = types.ModuleType(fullname)\n", " sys.modules[fullname] = mod\n", "\n", " # Go through the notebook cells and only execute function or class definitions\n", " for cell in nb.cells:\n", " if cell.cell_type == \"code\":\n", " cell_code = cell.source\n", " for name in names:\n", " # Check for function or class definitions\n", " if f\"def {name}\" in cell_code or f\"class {name}\" in cell_code:\n", " exec(cell_code, mod.__dict__)\n", " return mod\n", "\n", " fullname = \"bpe-from-scratch\"\n", " names = [\"BPETokenizerSimple\"]\n", "\n", " return import_definitions_from_notebook(fullname, names)" ] }, { "cell_type": "code", "execution_count": 18, "id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e", "metadata": {}, "outputs": [], "source": [ "imported_module = import_from_notebook()\n", "BPETokenizerSimple = getattr(imported_module, \"BPETokenizerSimple\", None)\n", "\n", "tokenizer_gpt2 = BPETokenizerSimple()\n", "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n", " vocab_path=os.path.join(\"gpt2_model\", \"encoder.json\"),\n", " bpe_merges_path=os.path.join(\"gpt2_model\", \"vocab.bpe\")\n", ")" ] }, { "cell_type": "code", "execution_count": 19, "id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n" ] } ], "source": [ "integers = tokenizer_gpt2.encode(text)\n", "\n", "print(integers)" ] }, { "cell_type": "markdown", "id": "907a1ade-3401-4f2e-9017-7f58a60cbd98", "metadata": {}, "source": [ "
\n", " \n", "\n", "## A quick performance benchmark" ] }, { "cell_type": "code", "execution_count": 20, "id": "a61bb445-b151-4a2f-8180-d4004c503754", "metadata": {}, "outputs": [], "source": [ "with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n", " raw_text = f.read()" ] }, { "cell_type": "markdown", "id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e", "metadata": {}, "source": [ "### Original OpenAI GPT-2 tokenizer" ] }, { "cell_type": "code", "execution_count": 21, "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit orig_tokenizer.encode(raw_text)" ] }, { "cell_type": "markdown", "id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90", "metadata": {}, "source": [ "### Tiktoken OpenAI GPT-2 tokenizer" ] }, { "cell_type": "code", "execution_count": 22, "id": "036dd628-3591-46c9-a5ce-b20b105a8062", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%timeit tik_tokenizer.encode(raw_text)" ] }, { "cell_type": "markdown", "id": "0c748de8-273e-42df-b078-3a510106da60", "metadata": {}, "source": [ "### Hugging Face OpenAI GPT-2 tokenizer" ] }, { "cell_type": "code", "execution_count": 23, "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit hf_tokenizer(raw_text)[\"input_ids\"]" ] }, { "cell_type": "code", "execution_count": 24, "id": "7117107f-22a6-46b4-a442-712d50b3ac7a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]" ] }, { "cell_type": "code", "execution_count": 25, "id": "d6bfc7f0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit hf_tokenizer_fast(raw_text)[\"input_ids\"]" ] }, { "cell_type": "code", "execution_count": 26, "id": "da57c95a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.77 ms ± 49.3 μs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)[\"input_ids\"]" ] }, { "cell_type": "markdown", "id": "91ac2876-f36e-498c-bd75-8597a39f2d4b", "metadata": {}, "source": [ "### My own GPT-2 tokenizer (for educational purposes)" ] }, { "cell_type": "code", "execution_count": 27, "id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit tokenizer_gpt2.encode(raw_text)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }