diff --git a/README.md b/README.md index 5c9d6e5..d09e760 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,7 @@ Several folders contain optional materials as a bonus for interested readers: - [Installing Python Packages and Libraries Used In This Book](setup/02_installing-python-libraries) - [Docker Environment Setup Guide](setup/03_optional-docker-environment) - **Chapter 2: Working with text data** + - [Byte Pair Encoding (BPE) Tokenizer From Scratch](ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb) - [Comparing Various Byte Pair Encoding (BPE) Implementations](ch02/02_bonus_bytepair-encoder) - [Understanding the Difference Between Embedding Layers and Linear Layers](ch02/03_bonus_embedding-vs-matmul) - [Dataloader Intuition with Simple Numbers](ch02/04_bonus_dataloader-intuition) diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index 3b2e7db..3d0ad06 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -1900,7 +1900,9 @@ "source": [ "See the [./dataloader.ipynb](./dataloader.ipynb) code notebook, which is a concise version of the data loader that we implemented in this chapter and will need for training the GPT model in upcoming chapters.\n", "\n", - "See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions." + "See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions.\n", + "\n", + "See the [Byte Pair Encoding (BPE) Tokenizer From Scratch](../05_bpe-from-scratch/bpe-from-scratch.ipynb) notebook if you are interested in learning how the GPT-2 tokenizer can be implemented and trained from scratch."
] } ], diff --git a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb index b5e154c..a141079 100644 --- a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb +++ b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb @@ -67,7 +67,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "tiktoken version: 0.5.1\n" + "tiktoken version: 0.7.0\n" ] } ], @@ -180,8 +180,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching encoder.json: 1.04Mit [00:00, 3.14Mit/s] \n", - "Fetching vocab.bpe: 457kit [00:00, 1.67Mit/s] \n" + "Fetching encoder.json: 1.04Mit [00:00, 3.47Mit/s] \n", + "Fetching vocab.bpe: 457kit [00:00, 2.07Mit/s] \n" ] } ], @@ -259,7 +259,7 @@ { "data": { "text/plain": [ - "'4.34.0'" + "'4.48.0'" ] }, "execution_count": 12, @@ -278,78 +278,7 @@ "execution_count": 13, "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e4df871bb797435787143a3abe6b0231", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading tokenizer_config.json: 0%| | 0.00/26.0 [00:00\n", + " \n", + "\n", + "## Using my own from-scratch BPE tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import io\n", + "import nbformat\n", + "import types\n", + "\n", + "def import_from_notebook():\n", + " def import_definitions_from_notebook(fullname, names):\n", + " current_dir = os.getcwd()\n", + " path = os.path.join(current_dir, \"..\", \"05_bpe-from-scratch\", fullname + \".ipynb\")\n", + " path = os.path.normpath(path)\n", + "\n", + " # Load the notebook\n", + " if not os.path.exists(path):\n", + " raise FileNotFoundError(f\"Notebook file not found at: {path}\")\n", + "\n", + " with io.open(path, \"r\", encoding=\"utf-8\") 
as f:\n", + " nb = nbformat.read(f, as_version=4)\n", + "\n", + " # Create a module to store the imported functions and classes\n", + " mod = types.ModuleType(fullname)\n", + " sys.modules[fullname] = mod\n", + "\n", + " # Go through the notebook cells and only execute function or class definitions\n", + " for cell in nb.cells:\n", + " if cell.cell_type == \"code\":\n", + " cell_code = cell.source\n", + " for name in names:\n", + " # Check for function or class definitions\n", + " if f\"def {name}\" in cell_code or f\"class {name}\" in cell_code:\n", + " exec(cell_code, mod.__dict__)\n", + " return mod\n", + "\n", + " fullname = \"bpe-from-scratch\"\n", + " names = [\"BPETokenizerSimple\"]\n", + "\n", + " return import_definitions_from_notebook(fullname, names)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e", + "metadata": {}, + "outputs": [], + "source": [ + "imported_module = import_from_notebook()\n", + "BPETokenizerSimple = getattr(imported_module, \"BPETokenizerSimple\", None)\n", + "\n", + "tokenizer_gpt2 = BPETokenizerSimple()\n", + "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n", + " vocab_path=os.path.join(\"gpt2_model\", \"encoder.json\"),\n", + " bpe_merges_path=os.path.join(\"gpt2_model\", \"vocab.bpe\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n" + ] + } + ], + "source": [ + "integers = tokenizer_gpt2.encode(text)\n", + "\n", + "print(integers)" + ] + }, { "cell_type": "markdown", "id": "907a1ade-3401-4f2e-9017-7f58a60cbd98", @@ -390,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "a61bb445-b151-4a2f-8180-d4004c503754", "metadata": {}, "outputs": [], @@ -399,9 +422,17 @@ " raw_text = f.read()" ] }, + 
{ + "cell_type": "markdown", + "id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e", + "metadata": {}, + "source": [ + "### Original OpenAI GPT-2 tokenizer" + ] + }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653", "metadata": {}, "outputs": [ @@ -409,7 +440,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "4.29 ms ± 46.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "3.44 ms ± 54 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -417,9 +448,17 @@ "%timeit orig_tokenizer.encode(raw_text)" ] }, + { + "cell_type": "markdown", + "id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90", + "metadata": {}, + "source": [ + "### Tiktoken OpenAI GPT-2 tokenizer" + ] + }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "036dd628-3591-46c9-a5ce-b20b105a8062", "metadata": {}, "outputs": [ @@ -427,7 +466,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.4 ms ± 9.71 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + "1.08 ms ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -435,9 +474,17 @@ "%timeit tik_tokenizer.encode(raw_text)" ] }, + { + "cell_type": "markdown", + "id": "0c748de8-273e-42df-b078-3a510106da60", + "metadata": {}, + "source": [ + "### Hugging Face OpenAI GPT-2 tokenizer" + ] + }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90", "metadata": {}, "outputs": [ @@ -452,7 +499,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "8.46 ms ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "10.3 ms ± 180 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" ] } ], @@ -462,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "7117107f-22a6-46b4-a442-712d50b3ac7a", "metadata": {}, "outputs": [ @@ -470,13 +517,39 @@ "name": "stdout", "output_type": "stream", "text": [ - "8.36 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "10.2 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]" ] + }, + { + "cell_type": "markdown", + "id": "91ac2876-f36e-498c-bd75-8597a39f2d4b", + "metadata": {}, + "source": [ + "### My own GPT-2 tokenizer (for educational purposes)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.74 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%timeit tokenizer_gpt2.encode(raw_text)" + ] } ], "metadata": { diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb new file mode 100644 index 0000000..2d24547 --- /dev/null +++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb @@ -0,0 +1,1301 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9dec0dfb-3d60-41d0-a63a-b010dce67e32", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", + "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", + "
\n", + "
\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "5e475425-8300-43f2-a5e8-6b5d2de59925", + "metadata": {}, + "source": [ + "# Byte Pair Encoding (BPE) Tokenizer From Scratch" + ] + }, + { + "cell_type": "markdown", + "id": "a1bfc3f3-8ec1-4fd3-b378-d9a3d7807a54", + "metadata": {}, + "source": [ + "- This is a standalone notebook implementing the popular byte pair encoding (BPE) tokenization algorithm, which is used in models like GPT-2 to GPT-4, Llama 3, etc., from scratch for educational purposes\n", + "- For more details about the purpose of tokenization, please refer to [Chapter 2](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/01_main-chapter-code/ch02.ipynb); this code here is bonus material explaining the BPE algorithm\n", + "- The original BPE tokenizer that OpenAI implemented for training the original GPT models can be found [here](https://github.com/openai/gpt-2/blob/master/src/encoder.py)\n", + "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n", + "- Most projects, including Llama 3, nowadays use OpenAI's open-source [tiktoken library](https://github.com/openai/tiktoken) due to its computational performance; it allows loading pretrained GPT-2 and GPT-4 tokenizers, for example (the Llama 3 models were trained using the GPT-4 tokenizer as well)\n", + "- The difference between the implementations above and my implementation in this notebook is that mine also includes a function for training the tokenizer (for educational purposes)\n", + "- There's also an implementation called [minBPE](https://github.com/karpathy/minbpe) with training support, which is maybe more performant (my implementation here is focused on educational purposes); in contrast to `minbpe` my implementation additionally allows loading the original OpenAI tokenizer vocabulary and merges" + ] + }, + { + "cell_type": "markdown", + "id":
"f62336db-f45c-4894-9167-7583095dbdf1", + "metadata": {}, + "source": [ + " \n", + "# 1. The main idea behind byte pair encoding (BPE)" + ] + }, + { + "cell_type": "markdown", + "id": "cd3f1231-bd42-41b5-a017-974b8c660a44", + "metadata": {}, + "source": [ + "- The main idea in BPE is to convert text into an integer representation (token IDs) for LLM training (see [Chapter 2](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/01_main-chapter-code/ch02.ipynb))\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "760c625d-26a1-4896-98a2-0fdcd1591256", + "metadata": {}, + "source": [ + " \n", + "## 1.1 Bits and bytes" + ] + }, + { + "cell_type": "markdown", + "id": "d4ddaa35-0ed7-4012-827e-911de11c266c", + "metadata": {}, + "source": [ + "- Before getting to the BPE algorithm, let's introduce the notion of bytes\n", + "- Consider converting text into a byte array (BPE stands for \"byte\" pair encoding after all):" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8c9bc9e4-120f-4bac-8fa6-6523c568d12e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bytearray(b'This is some text')\n" + ] + } + ], + "source": [ + "text = \"This is some text\"\n", + "byte_ary = bytearray(text, \"utf-8\")\n", + "print(byte_ary)" + ] + }, + { + "cell_type": "markdown", + "id": "dbd92a2a-9d74-4dc7-bb53-ac33d6cf2fab", + "metadata": {}, + "source": [ + "- When we call `list()` on a `bytearray` object, each byte is treated as an individual element, and the result is a list of integers corresponding to the byte values:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6c586945-d459-4f9a-855d-bf73438ef0e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[84, 104, 105, 115, 32, 105, 115, 32, 115, 111, 109, 101, 32, 116, 101, 120, 116]\n" + ] + } + ], + "source": [ + "ids = list(byte_ary)\n", + "print(ids)" + ] + }, + { + "cell_type": 
"markdown", + "id": "71efea37-f4c3-4cb8-bfa5-9299175faf9a", + "metadata": {}, + "source": [ + "- This would be a valid way to convert text into a token ID representation that we need for the embedding layer of an LLM\n", + "- However, the downside of this approach is that it is creating one ID for each character (that's a lot of IDs for a short text!)\n", + "- I.e., this means for a 17-character input text, we have to use 17 token IDs as input to the LLM:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0d5b61d9-79a0-48b4-9b3e-64ab595c5b01", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of characters: 17\n", + "Number of token IDs: 17\n" + ] + } + ], + "source": [ + "print(\"Number of characters:\", len(text))\n", + "print(\"Number of token IDs:\", len(ids))" + ] + }, + { + "cell_type": "markdown", + "id": "68cc833a-c0d4-4d46-9180-c0042fd6addc", + "metadata": {}, + "source": [ + "- If you have worked with LLMs before, you may know that the BPE tokenizers have a vocabulary where we have a token ID for whole words or subwords instead of each character\n", + "- For example, the GPT-2 tokenizer tokenizes the same text (\"This is some text\") into only 4 instead of 17 tokens: `1212, 318, 617, 2420`\n", + "- You can double-check this using the interactive [tiktoken app](https://tiktokenizer.vercel.app/?model=gpt2) or the [tiktoken library](https://github.com/openai/tiktoken):\n", + "\n", + "\n", + "\n", + "```python\n", + "import tiktoken\n", + "\n", + "gpt2_tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "gpt2_tokenizer.encode(\"This is some text\")\n", + "# prints [1212, 318, 617, 2420]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "425b99de-cbfc-441c-8b3e-296a5dd7bb27", + "metadata": {}, + "source": [ + "- Since a byte consists of 8 bits, there are 28 = 256 possible values that a single byte can represent, ranging from 0 to 255\n", + "- You can confirm this by executing 
the code `bytearray(range(0, 257))`, which will warn you that `ValueError: byte must be in range(0, 256)`)\n", + "- A BPE tokenizer usually uses these 256 values as its first 256 single-character tokens; one could visually check this by running the following code:\n", + "\n", + "```python\n", + "import tiktoken\n", + "gpt2_tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "\n", + "for i in range(300):\n", + " decoded = gpt2_tokenizer.decode([i])\n", + " print(f\"{i}: {decoded}\")\n", + "\"\"\"\n", + "prints:\n", + "0: !\n", + "1: \"\n", + "2: #\n", + "...\n", + "255: � # <---- single character tokens up to here\n", + "256: t\n", + "257: a\n", + "...\n", + "298: ent\n", + "299: n\n", + "\"\"\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "97ff0207-7f8e-44fa-9381-2a4bd83daab3", + "metadata": {}, + "source": [ + "- Above, note that entries 256 and 257 are not single-character values but double-character values (a whitespace + a letter), which is a little shortcoming of the original GPT-2 BPE Tokenizer (this has been improved in the GPT-4 tokenizer)" + ] + }, + { + "cell_type": "markdown", + "id": "8241c23a-d487-488d-bded-cdf054e24920", + "metadata": {}, + "source": [ + " \n", + "## 1.2 Building the vocabulary" + ] + }, + { + "cell_type": "markdown", + "id": "d7c2ceb7-0b3f-4a62-8dcc-07810cd8886e", + "metadata": {}, + "source": [ + "- The goal of the BPE tokenization algorithm is to build a vocabulary of commonly occurring subwords like `298: ent` (which can be found in *entangle, entertain, enter, entrance, entity, ...*, for example), or even complete words like \n", + "\n", + "```\n", + "318: is\n", + "617: some\n", + "1212: This\n", + "2420: text\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "8c0d4420-a4c7-4813-916a-06f4f46bc3f0", + "metadata": {}, + "source": [ + "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip 
Gage\n", + "- Before we get to the actual code implementation, the form that is used for LLM tokenizers today can be summarized as follows:" + ] + }, + { + "cell_type": "markdown", + "id": "ebc71db9-b070-48c4-8412-81f45b308ab3", + "metadata": {}, + "source": [ + " \n", + "## 1.3 BPE algorithm outline\n", + "\n", + "**1. Identify frequent pairs**\n", + "- In each iteration, scan the text to find the most commonly occurring pair of bytes (or characters)\n", + "\n", + "**2. Replace and record**\n", + "\n", + "- Replace that pair with a new placeholder ID (one not already in use, e.g., if we start with 0...255, the first placeholder would be 256)\n", + "- Record this mapping in a lookup table\n", + "- The size of the lookup table is a hyperparameter, also called \"vocabulary size\" (for GPT-2, that's\n", + "50,257)\n", + "\n", + "**3. Repeat until no gains**\n", + "\n", + "- Keep repeating steps 1 and 2, continually merging the most frequent pairs\n", + "- Stop when no further compression is possible (e.g., no pair occurs more than once)\n", + "\n", + "**Decompression (decoding)**\n", + "\n", + "- To restore the original text, reverse the process by substituting each ID with its corresponding pair, using the lookup table\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "e9f5ac9a-3528-4186-9468-8420c7b2ac00", + "metadata": {}, + "source": [ + " \n", + "## 1.4 BPE algorithm example\n", + "\n", + "### 1.4.1 Concrete example of the encoding part (steps 1 & 2)\n", + "\n", + "- Suppose we have the text (training dataset) `the cat in the hat` from which we want to build the vocabulary for a BPE tokenizer\n", + "\n", + "**Iteration 1**\n", + "\n", + "1. Identify frequent pairs\n", + " - In this text, \"th\" appears twice (at the beginning and before the second \"e\")\n", + "\n", + "2. 
Replace and record\n", + " - replace \"th\" with a new token ID that is not already in use, e.g., 256\n", + " - the new text is: `<256>e cat in <256>e hat`\n", + " - the new vocabulary is\n", + "\n", + "```\n", + " 0: ...\n", + " ...\n", + " 256: \"th\"\n", + "```\n", + "\n", + "**Iteration 2**\n", + "\n", + "1. **Identify frequent pairs** \n", + " - In the text `<256>e cat in <256>e hat`, the pair `<256>e` appears twice\n", + "\n", + "2. **Replace and record** \n", + " - replace `<256>e` with a new token ID that is not already in use, for example, `257`. \n", + " - The new text is:\n", + " ```\n", + " <257> cat in <257> hat\n", + " ```\n", + " - The updated vocabulary is:\n", + " ```\n", + " 0: ...\n", + " ...\n", + " 256: \"th\"\n", + " 257: \"<256>e\"\n", + " ```\n", + "\n", + "**Iteration 3**\n", + "\n", + "1. **Identify frequent pairs** \n", + " - In the text `<257> cat in <257> hat`, the pair `<257> ` appears twice (once at the beginning and once before “hat”).\n", + "\n", + "2. **Replace and record** \n", + " - replace `<257> ` with a new token ID that is not already in use, for example, `258`. 
\n", + " - the new text is:\n", + " ```\n", + " <258>cat in <258>hat\n", + " ```\n", + " - The updated vocabulary is:\n", + " ```\n", + " 0: ...\n", + " ...\n", + " 256: \"th\"\n", + " 257: \"<256>e\"\n", + " 258: \"<257> \"\n", + " ```\n", + " \n", + "- and so forth\n", + "\n", + " \n", + "### 1.4.2 Concrete example of the decoding part (steps 3)\n", + "\n", + "- To restore the original text, we reverse the process by substituting each token ID with its corresponding pair in the reverse order they were introduced\n", + "- Start with the final compressed text: `<258>cat in <258>hat`\n", + "- Substitute `<258>` → `<257> `: `<257> cat in <257> hat` \n", + "- Substitute `<257>` → `<256>e`: `<256>e cat in <256>e hat`\n", + "- Substitute `<256>` → \"th\": `the cat in the hat`" + ] + }, + { + "cell_type": "markdown", + "id": "a2324948-ddd0-45d1-8ba8-e8eda9fc6677", + "metadata": {}, + "source": [ + " \n", + "## 2. A simple BPE implementation" + ] + }, + { + "cell_type": "markdown", + "id": "429ca709-40d7-4e3d-bf3e-4f5687a2e19b", + "metadata": {}, + "source": [ + "- Below is an implementation of this algorithm described above as a Python class that mimics the `tiktoken` Python user interface\n", + "- Note that the encoding part above describes the original training step via `train()`; however, the `encode()` method works similarly (although it looks a bit more complicated because of the special token handling):\n", + "\n", + "1. Split the input text into individual bytes\n", + "2. Repeatedly find & replace (merge) adjacent tokens (pairs) when they match any pair in the learned BPE merges (from highest to lowest \"rank,\" i.e., in the order they were learned)\n", + "3. Continue merging until no more merges can be applied\n", + "4. 
The final list of token IDs is the encoded output" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "3e4a15ec-2667-4f56-b7c1-34e8071b621d", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter, deque\n", + "from functools import lru_cache\n", + "import json\n", + "\n", + "\n", + "class BPETokenizerSimple:\n", + " def __init__(self):\n", + " # Maps token_id to token_str (e.g., {11246: \"some\"})\n", + " self.vocab = {}\n", + " # Maps token_str to token_id (e.g., {\"some\": 11246})\n", + " self.inverse_vocab = {}\n", + " # Dictionary of BPE merges: {(token_id1, token_id2): merged_token_id}\n", + " self.bpe_merges = {}\n", + "\n", + " def train(self, text, vocab_size, allowed_special={\"<|endoftext|>\"}):\n", + " \"\"\"\n", + " Train the BPE tokenizer from scratch.\n", + "\n", + " Args:\n", + " text (str): The training text.\n", + " vocab_size (int): The desired vocabulary size.\n", + " allowed_special (set): A set of special tokens to include.\n", + " \"\"\"\n", + "\n", + " # Preprocess: Replace spaces with 'Ġ'\n", + " # Note that Ġ is a particularity of the GPT-2 BPE implementation\n", + " # E.g., \"Hello world\" might be tokenized as [\"Hello\", \"Ġworld\"]\n", + " # (GPT-4 BPE would tokenize it as [\"Hello\", \" world\"])\n", + " processed_text = []\n", + " for i, char in enumerate(text):\n", + " if char == \" \" and i != 0:\n", + " processed_text.append(\"Ġ\")\n", + " if char != \" \":\n", + " processed_text.append(char)\n", + " processed_text = \"\".join(processed_text)\n", + "\n", + " # Initialize vocab with unique characters, including 'Ġ' if present\n", + " # Start with the first 256 ASCII characters\n", + " unique_chars = [chr(i) for i in range(256)]\n", + "\n", + " # Extend unique_chars with characters from processed_text that are not already included\n", + " unique_chars.extend(char for char in sorted(set(processed_text)) if char not in unique_chars)\n", + "\n", + " # Optionally, ensure 'Ġ' is included if 
it is relevant to your text processing\n", + " if 'Ġ' not in unique_chars:\n", + " unique_chars.append('Ġ')\n", + "\n", + " # Now create the vocab and inverse vocab dictionaries\n", + " self.vocab = {i: char for i, char in enumerate(unique_chars)}\n", + " self.inverse_vocab = {char: i for i, char in self.vocab.items()}\n", + "\n", + " # Add allowed special tokens\n", + " if allowed_special:\n", + " for token in allowed_special:\n", + " if token not in self.inverse_vocab:\n", + " new_id = len(self.vocab)\n", + " self.vocab[new_id] = token\n", + " self.inverse_vocab[token] = new_id\n", + "\n", + " # Tokenize the processed_text into token IDs\n", + " token_ids = [self.inverse_vocab[char] for char in processed_text]\n", + "\n", + " # BPE steps 1-3: Repeatedly find and replace frequent pairs\n", + " for new_id in range(len(self.vocab), vocab_size):\n", + " pair_id = self.find_freq_pair(token_ids, mode=\"most\")\n", + " if pair_id is None: # No more pairs to merge. Stopping training.\n", + " break\n", + " token_ids = self.replace_pair(token_ids, pair_id, new_id)\n", + " self.bpe_merges[pair_id] = new_id\n", + "\n", + " # Build the vocabulary with merged tokens\n", + " for (p0, p1), new_id in self.bpe_merges.items():\n", + " merged_token = self.vocab[p0] + self.vocab[p1]\n", + " self.vocab[new_id] = merged_token\n", + " self.inverse_vocab[merged_token] = new_id\n", + "\n", + " def load_vocab_and_merges_from_openai(self, vocab_path, bpe_merges_path):\n", + " \"\"\"\n", + " Load pre-trained vocabulary and BPE merges from OpenAI's GPT-2 files.\n", + "\n", + " Args:\n", + " vocab_path (str): Path to the vocab file (GPT-2 calls it 'encoder.json').\n", + " bpe_merges_path (str): Path to the bpe_merges file (GPT-2 calls it 'vocab.bpe').\n", + " \"\"\"\n", + " # Load vocabulary\n", + " with open(vocab_path, \"r\", encoding=\"utf-8\") as file:\n", + " loaded_vocab = json.load(file)\n", + " # loaded_vocab maps token_str to token_id\n", + " self.vocab = {int(v): k for k, v in 
loaded_vocab.items()} # token_id: token_str\n", + " self.inverse_vocab = {k: int(v) for k, v in loaded_vocab.items()} # token_str: token_id\n", + "\n", + " # Load BPE merges\n", + " with open(bpe_merges_path, \"r\", encoding=\"utf-8\") as file:\n", + " lines = file.readlines()\n", + " # Skip header line if present\n", + " if lines and lines[0].startswith(\"#\"):\n", + " lines = lines[1:]\n", + "\n", + " for rank, line in enumerate(lines):\n", + " pair = tuple(line.strip().split())\n", + " if len(pair) != 2:\n", + " print(f\"Line {rank+1} has more than 2 entries: {line.strip()}\")\n", + " continue\n", + " token1, token2 = pair\n", + " if token1 in self.inverse_vocab and token2 in self.inverse_vocab:\n", + " token_id1 = self.inverse_vocab[token1]\n", + " token_id2 = self.inverse_vocab[token2]\n", + " merged_token = token1 + token2\n", + " if merged_token in self.inverse_vocab:\n", + " merged_token_id = self.inverse_vocab[merged_token]\n", + " self.bpe_merges[(token_id1, token_id2)] = merged_token_id\n", + " # print(f\"Loaded merge: '{token1}' + '{token2}' -> '{merged_token}' (ID: {merged_token_id})\")\n", + " else:\n", + " print(f\"Merged token '{merged_token}' not found in vocab. 
Skipping.\")\n", + " else:\n", + " print(f\"Skipping pair {pair} as one of the tokens is not in the vocabulary.\")\n", + "\n", + " def encode(self, text):\n", + " \"\"\"\n", + " Encode the input text into a list of token IDs.\n", + "\n", + " Args:\n", + " text (str): The text to encode.\n", + "\n", + " Returns:\n", + " List[int]: The list of token IDs.\n", + " \"\"\"\n", + " tokens = []\n", + " # Split text into tokens, keeping newlines intact\n", + " words = text.replace(\"\\n\", \" \\n \").split() # Ensure '\\n' is treated as a separate token\n", + "\n", + " for i, word in enumerate(words):\n", + " if i > 0 and not word.startswith(\"\\n\"):\n", + " tokens.append(\"Ġ\" + word) # Add 'Ġ' to words that follow a space or newline\n", + " else:\n", + " tokens.append(word) # Handle first word or standalone '\\n'\n", + "\n", + " token_ids = []\n", + " for token in tokens:\n", + " if token in self.inverse_vocab:\n", + " # token is contained in the vocabulary as is\n", + " token_id = self.inverse_vocab[token]\n", + " token_ids.append(token_id)\n", + " else:\n", + " # Attempt to handle subword tokenization via BPE\n", + " sub_token_ids = self.tokenize_with_bpe(token)\n", + " token_ids.extend(sub_token_ids)\n", + "\n", + " return token_ids\n", + "\n", + " def tokenize_with_bpe(self, token):\n", + " \"\"\"\n", + " Tokenize a single token using BPE merges.\n", + "\n", + " Args:\n", + " token (str): The token to tokenize.\n", + "\n", + " Returns:\n", + " List[int]: The list of token IDs after applying BPE.\n", + " \"\"\"\n", + " # Tokenize the token into individual characters (as initial token IDs)\n", + " token_ids = [self.inverse_vocab.get(char, None) for char in token]\n", + " if None in token_ids:\n", + " missing_chars = [char for char, tid in zip(token, token_ids) if tid is None]\n", + " raise ValueError(f\"Characters not found in vocab: {missing_chars}\")\n", + "\n", + " can_merge = True\n", + " while can_merge and len(token_ids) > 1:\n", + " can_merge = False\n", + " 
new_tokens = []\n", + " i = 0\n", + " while i < len(token_ids) - 1:\n", + " pair = (token_ids[i], token_ids[i + 1])\n", + " if pair in self.bpe_merges:\n", + " merged_token_id = self.bpe_merges[pair]\n", + " new_tokens.append(merged_token_id)\n", + " # Uncomment for educational purposes:\n", + " # print(f\"Merged pair {pair} -> {merged_token_id} ('{self.vocab[merged_token_id]}')\")\n", + " i += 2 # Skip the next token as it's merged\n", + " can_merge = True\n", + " else:\n", + " new_tokens.append(token_ids[i])\n", + " i += 1\n", + " if i < len(token_ids):\n", + " new_tokens.append(token_ids[i])\n", + " token_ids = new_tokens\n", + "\n", + " return token_ids\n", + "\n", + " def decode(self, token_ids):\n", + " \"\"\"\n", + " Decode a list of token IDs back into a string.\n", + "\n", + " Args:\n", + " token_ids (List[int]): The list of token IDs to decode.\n", + "\n", + " Returns:\n", + " str: The decoded string.\n", + " \"\"\"\n", + " decoded_string = \"\"\n", + " for token_id in token_ids:\n", + " if token_id not in self.vocab:\n", + " raise ValueError(f\"Token ID {token_id} not found in vocab.\")\n", + " token = self.vocab[token_id]\n", + " if token.startswith(\"Ġ\"):\n", + " # Replace 'Ġ' with a space\n", + " decoded_string += \" \" + token[1:]\n", + " else:\n", + " decoded_string += token\n", + " return decoded_string\n", + "\n", + " def save_vocab_and_merges(self, vocab_path, bpe_merges_path):\n", + " \"\"\"\n", + " Save the vocabulary and BPE merges to JSON files.\n", + "\n", + " Args:\n", + " vocab_path (str): Path to save the vocabulary.\n", + " bpe_merges_path (str): Path to save the BPE merges.\n", + " \"\"\"\n", + " # Save vocabulary\n", + " with open(vocab_path, \"w\", encoding=\"utf-8\") as file:\n", + " json.dump({k: v for k, v in self.vocab.items()}, file, ensure_ascii=False, indent=2)\n", + "\n", + " # Save BPE merges as a list of dictionaries\n", + " with open(bpe_merges_path, \"w\", encoding=\"utf-8\") as file:\n", + " merges_list = [{\"pair\": 
list(pair), \"new_id\": new_id}\n", + " for pair, new_id in self.bpe_merges.items()]\n", + " json.dump(merges_list, file, ensure_ascii=False, indent=2)\n", + "\n", + " def load_vocab_and_merges(self, vocab_path, bpe_merges_path):\n", + " \"\"\"\n", + " Load the vocabulary and BPE merges from JSON files.\n", + "\n", + " Args:\n", + " vocab_path (str): Path to the vocabulary file.\n", + " bpe_merges_path (str): Path to the BPE merges file.\n", + " \"\"\"\n", + " # Load vocabulary\n", + " with open(vocab_path, \"r\", encoding=\"utf-8\") as file:\n", + " loaded_vocab = json.load(file)\n", + " self.vocab = {int(k): v for k, v in loaded_vocab.items()}\n", + " self.inverse_vocab = {v: int(k) for k, v in loaded_vocab.items()}\n", + "\n", + " # Load BPE merges\n", + " with open(bpe_merges_path, \"r\", encoding=\"utf-8\") as file:\n", + " merges_list = json.load(file)\n", + " for merge in merges_list:\n", + " pair = tuple(merge['pair'])\n", + " new_id = merge['new_id']\n", + " self.bpe_merges[pair] = new_id\n", + "\n", + " @lru_cache(maxsize=None)\n", + " def get_special_token_id(self, token):\n", + " return self.inverse_vocab.get(token, None)\n", + "\n", + " @staticmethod\n", + " def find_freq_pair(token_ids, mode=\"most\"):\n", + " pairs = Counter(zip(token_ids, token_ids[1:]))\n", + "\n", + " if mode == \"most\":\n", + " return max(pairs.items(), key=lambda x: x[1])[0]\n", + " elif mode == \"least\":\n", + " return min(pairs.items(), key=lambda x: x[1])[0]\n", + " else:\n", + " raise ValueError(\"Invalid mode. 
@staticmethod
def replace_pair(token_ids, pair_id, new_id):
    """
    Replace each occurrence of an adjacent pair with a single new ID.

    The sequence is scanned left to right; a matched pair is consumed
    whole, so overlapping occurrences are not merged twice.

    Args:
        token_ids: Iterable of token IDs to scan.
        pair_id: Tuple ``(first, second)`` to look for.
        new_id: The ID emitted in place of each matched pair.

    Returns:
        list: A new list with the replacements applied.
    """
    ids = list(token_ids)
    total = len(ids)
    merged = []
    pos = 0
    while pos < total:
        has_next = pos + 1 < total
        if has_next and (ids[pos], ids[pos + 1]) == pair_id:
            merged.append(new_id)
            pos += 2  # both members of the pair are consumed
        else:
            merged.append(ids[pos])
            pos += 1
    return merged
"id": "4d197cad-ed10-4a42-b01c-a763859781fb", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import urllib.request\n", + "\n", + "if not os.path.exists(\"the-verdict.txt\"):\n", + " url = (\"https://raw.githubusercontent.com/rasbt/\"\n", + " \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n", + " \"the-verdict.txt\")\n", + " file_path = \"the-verdict.txt\"\n", + " urllib.request.urlretrieve(url, file_path)\n", + "\n", + "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " text = f.read()" + ] + }, + { + "cell_type": "markdown", + "id": "04d1b6ac-71d3-4817-956a-9bc7e463a84a", + "metadata": {}, + "source": [ + "- Next, let's initialize and train the BPE tokenizer with a vocabulary size of 1,000\n", + "- Note that the vocabulary size is already 256 by default due to the byte values discussed earlier, so we are only \"learning\" roughly 744 vocabulary entries \n", + "- For comparison, the GPT-2 vocabulary is 50,257 tokens, the GPT-4 vocabulary is 100,256 tokens (`cl100k_base` in tiktoken), and GPT-4o uses 199,997 tokens (`o200k_base` in tiktoken); they all have much bigger training sets compared to our simple example text above" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "027348fd-d52f-4396-93dd-38eed142df9b", + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = BPETokenizerSimple()\n", + "tokenizer.train(text, vocab_size=1000, allowed_special={\"<|endoftext|>\"})" + ] + }, + { + "cell_type": "markdown", + "id": "2474ff05-5629-4f13-9e03-a47b1e713850", + "metadata": {}, + "source": [ + "- You may want to inspect the vocabulary contents (but note it will create a long list)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "f705a283-355e-4460-b940-06bbc2ae4e61", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1000\n" + ] + } + ], + "source": [ + "# print(tokenizer.vocab)\n", + "print(len(tokenizer.vocab))" + ] + }, + { + 
"cell_type": "markdown", + "id": "36c9da0f-8a18-41cd-91ea-9ccc2bb5febb", + "metadata": {}, + "source": [ + "- This vocabulary is created by merging 742 times (~ `1000 - len(range(0, 256))`)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "3da42d1c-f75c-4ba7-a6c5-4cb8543d4a44", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "742\n" + ] + } + ], + "source": [ + "print(len(tokenizer.bpe_merges))" + ] + }, + { + "cell_type": "markdown", + "id": "5dac69c9-8413-482a-8148-6b2afbf1fb89", + "metadata": {}, + "source": [ + "- This means that the first 256 entries are single-character tokens" + ] + }, + { + "cell_type": "markdown", + "id": "451a4108-7c8b-4b98-9c67-d622e9cdf250", + "metadata": {}, + "source": [ + "- Next, let's use the created merges via the `encode` method to encode some text:" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "e1db5cce-e015-412b-ad56-060b8b638078", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46]\n" + ] + } + ], + "source": [ + "input_text = \"Jack embraced beauty through art and life.\"\n", + "token_ids = tokenizer.encode(input_text)\n", + "print(token_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "1ed1b344-f7d4-4e9e-ac34-2a04b5c5b7a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of characters: 42\n", + "Number of token IDs: 20\n" + ] + } + ], + "source": [ + "print(\"Number of characters:\", len(input_text))\n", + "print(\"Number of token IDs:\", len(token_ids))" + ] + }, + { + "cell_type": "markdown", + "id": "50c1cfb9-402a-4e1e-9678-0b7547406248", + "metadata": {}, + "source": [ + "- From the lengths above, we can see that a 42-character sentence was encoded into 20 token IDs, effectively cutting the input 
length roughly in half compared to a character-byte-based encoding" + ] + }, + { + "cell_type": "markdown", + "id": "252693ee-e806-4dac-ab76-2c69086360f4", + "metadata": {}, + "source": [ + "- Note that the vocabulary itself is used in the `decode()` method, which allows us to map the token IDs back into text:" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "da0e1faf-1933-43d9-b681-916c282a8f86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46]\n" + ] + } + ], + "source": [ + "print(token_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "8b690e83-5d6b-409a-804e-321c287c24a4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Jack embraced beauty through art and life.\n" + ] + } + ], + "source": [ + "print(tokenizer.decode(token_ids))" + ] + }, + { + "cell_type": "markdown", + "id": "adea5d09-e5ef-4721-994b-b9b25662fa0a", + "metadata": {}, + "source": [ + "- Iterating over each token ID can give us a better understanding of how the token IDs are decoded via the vocabulary:" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "2b9e6289-92cb-4d88-b3c8-e836d7c8095f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "424 -> Jack\n", + "256 -> \n", + "654 -> em\n", + "531 -> br\n", + "302 -> ac\n", + "311 -> ed\n", + "256 -> \n", + "296 -> be\n", + "97 -> a\n", + "465 -> ut\n", + "121 -> y\n", + "595 -> through\n", + "841 -> ar\n", + "116 -> t\n", + "287 -> a\n", + "466 -> nd\n", + "256 -> \n", + "326 -> li\n", + "972 -> fe\n", + "46 -> .\n" + ] + } + ], + "source": [ + "for token_id in token_ids:\n", + " print(f\"{token_id} -> {tokenizer.decode([token_id])}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5ea41c6c-5538-4fd5-8b5f-195960853b71", + "metadata": 
{}, + "source": [ + "- As we can see, most token IDs represent 2-character subwords; that's because the training data text is very short with not that many repetitive words, and because we used a relatively small vocabulary size" + ] + }, + { + "cell_type": "markdown", + "id": "600055a3-7ec8-4abf-b88a-c4186fb71463", + "metadata": {}, + "source": [ + "- As a summary, calling `decode(encode())` should be able to reproduce arbitrary input texts:" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "c7056cb1-a9a3-4cf6-8364-29fb493ae240", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'This is some text.'" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.decode(tokenizer.encode(\"This is some text.\"))" + ] + }, + { + "cell_type": "markdown", + "id": "a63b42bb-55bc-4c9d-b859-457a28b76302", + "metadata": {}, + "source": [ + "### 3.2 Saving and loading the tokenizer" + ] + }, + { + "cell_type": "markdown", + "id": "86210925-06dc-4e8c-87bd-821569cd7142", + "metadata": {}, + "source": [ + "- Next, let's look at how we can save the trained tokenizer for reuse later:" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "955181cb-0910-4c6a-9c22-d8292a3ec1fc", + "metadata": {}, + "outputs": [], + "source": [ + "# Save trained tokenizer\n", + "tokenizer.save_vocab_and_merges(vocab_path=\"vocab.json\", bpe_merges_path=\"bpe_merges.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "6e5ccfe7-ac67-42f3-b727-87886a8867f1", + "metadata": {}, + "outputs": [], + "source": [ + "# Load tokenizer\n", + "tokenizer2 = BPETokenizerSimple()\n", + "tokenizer2.load_vocab_and_merges(vocab_path=\"vocab.json\", bpe_merges_path=\"bpe_merges.txt\")" + ] + }, + { + "cell_type": "markdown", + "id": "e7f9bcc2-3b27-4473-b75e-4f289d52a7cc", + "metadata": {}, + "source": [ + "- The loaded tokenizer should be able to produce the same results as before:" 
def download_file_if_absent(url, filename):
    """
    Download ``url`` to ``filename`` unless that file already exists.

    This is best-effort: any failure is reported with a printed message
    rather than raised. The destination file is only created after the
    whole response body has been read, so a failure during the transfer
    does not leave an empty file that a later call would report as
    "already exists".

    Args:
        url (str): Location to fetch.
        filename (str): Local path to write the downloaded bytes to.
    """
    if os.path.exists(filename):
        print(f"{filename} already exists")
        return

    try:
        # Read the full body first; the destination is not touched until
        # the transfer has succeeded.
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        with open(filename, 'wb') as out_file:
            out_file.write(payload)
        print(f"Downloaded {filename}")
    except Exception as e:
        print(f"Failed to download {filename}. Error: {e}")
"token_ids = tokenizer_gpt2.encode(input_text)\n", + "print(token_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "3da8d9b2-af55-4b09-95d7-fabd983e919e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is some text\n" + ] + } + ], + "source": [ + "print(tokenizer_gpt2.decode(token_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "460deb85-8de7-40c7-ba18-3c17831fa8ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1212, 318, 617, 2420]" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import tiktoken\n", + "\n", + "tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "tik_tokenizer.encode(input_text)" + ] + }, + { + "cell_type": "markdown", + "id": "b3b1e2dc-f69b-4533-87ef-549e6fb9b5a0", + "metadata": {}, + "source": [ + "- You can double-check that this produces the correct tokens using the interactive [tiktoken app](https://tiktokenizer.vercel.app/?model=gpt2) or the [tiktoken library](https://github.com/openai/tiktoken):\n", + "\n", + "\n", + "\n", + "```python\n", + "import tiktoken\n", + "\n", + "gpt2_tokenizer = tiktoken.get_encoding(\"gpt2\")\n", + "gpt2_tokenizer.encode(\"This is some text\")\n", + "# prints [1212, 318, 617, 2420]\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "3558af04-483c-4f6b-88f5-a534f37316cd", + "metadata": {}, + "source": [ + " \n", + "# 4. Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "410ed0e6-ad06-4bb3-bb39-6b8110c1caa4", + "metadata": {}, + "source": [ + "- That's it! 
That's how BPE works in a nutshell, complete with a training method for creating new tokenizers and a method for loading the GPT-2 tokenizer vocabulary and merges from the original OpenAI GPT-2 model\n", + "- I hope you found this brief tutorial useful for educational purposes; if you have any questions, please feel free to open a new Discussion [here](https://github.com/rasbt/LLMs-from-scratch/discussions/categories/q-a)\n", + "- For a performance comparison with other tokenizer implementations, please see [this notebook](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}