mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-08-30 19:41:51 +00:00

* Fix BPE bonus materials * fix bpe implementation * update * Add 'Hello, world. Is this-- a test?' test case * update link to test file * update path handling * update path handling * fix pytest paths
661 lines
16 KiB
Plaintext
661 lines
16 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c503e5ef-6bb4-45c3-ac49-0e016cedd8d0",
|
|
"metadata": {},
|
|
"source": [
|
|
"<table style=\"width:100%\">\n",
|
|
"<tr>\n",
|
|
"<td style=\"vertical-align:middle; text-align:left;\">\n",
|
|
"<font size=\"2\">\n",
|
|
"Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
|
|
"<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
|
|
"</font>\n",
|
|
"</td>\n",
|
|
"<td style=\"vertical-align:middle; text-align:left;\">\n",
|
|
"<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
|
|
"</td>\n",
|
|
"</tr>\n",
|
|
"</table>\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8a9e554f-58e3-4787-832d-d149add1b857",
|
|
"metadata": {},
|
|
"source": [
|
|
"- Install the additional package requirements for this bonus notebook by uncommenting and running the following cell:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "d70bae22-b540-4a13-ab01-e748cb9d55c9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# pip install -r requirements-extra.txt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "737c59bb-5922-46fc-a787-1369d70925b4",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Comparing Various Byte Pair Encoding (BPE) Implementations"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a9adc3bf-353c-411e-a471-0e92786e7103",
|
|
"metadata": {},
|
|
"source": [
|
|
"<br>\n",
|
|
" \n",
|
|
"\n",
|
|
"## Using BPE from `tiktoken`"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "1c490fca-a48a-47fa-a299-322d1a08ad17",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tiktoken version: 0.9.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from importlib.metadata import version\n",
|
|
"\n",
|
|
"print(\"tiktoken version:\", version(\"tiktoken\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "0952667c-ce84-4f21-87db-59f52b44cec4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import tiktoken\n",
|
|
"\n",
|
|
"tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
|
|
"\n",
|
|
"text = \"Hello, world. Is this-- a test?\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "b039c350-18ad-48fb-8e6a-085702dfc330",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
|
|
"\n",
|
|
"print(integers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Hello, world. Is this-- a test?\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"strings = tik_tokenizer.decode(integers)\n",
|
|
"\n",
|
|
"print(strings)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"50257\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(tik_tokenizer.n_vocab)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6a0b5d4f-2af9-40de-828c-063c4243e771",
|
|
"metadata": {},
|
|
"source": [
|
|
"<br>\n",
|
|
" \n",
|
|
"\n",
|
|
"## Using the original BPE implementation used in GPT-2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "0903108c-65cb-4ae1-967a-2155e25349c2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from bpe_openai_gpt2 import get_encoder, download_vocab"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s] \n",
|
|
"Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s] \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"download_vocab()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"orig_tokenizer = get_encoder(model_name=\"gpt2_model\", models_dir=\".\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "2740510c-a78a-4fba-ae18-2b156ba2dfef",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"integers = orig_tokenizer.encode(text)\n",
|
|
"\n",
|
|
"print(integers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "434d115e-990d-42ad-88dd-31323a96e10f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Hello, world. Is this-- a test?\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"strings = orig_tokenizer.decode(integers)\n",
|
|
"\n",
|
|
"print(strings)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86",
|
|
"metadata": {},
|
|
"source": [
|
|
"<br>\n",
|
|
" \n",
|
|
"\n",
|
|
"## Using the BPE via Hugging Face transformers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/Users/sebastian/Developer/LLMs-from-scratch/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
" from .autonotebook import tqdm as notebook_tqdm\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'4.49.0'"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import transformers\n",
|
|
"\n",
|
|
"transformers.__version__"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from transformers import GPT2Tokenizer\n",
|
|
"\n",
|
|
"hf_tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "222cbd69-6a3d-4868-9c1f-421ffc9d5fe1",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Encode the original input `text` (as the other tokenizers do) rather than the\n",
"# decoded output of a previous tokenizer, so this comparison stands on its own\n",
"hf_tokenizer(text)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "a6233552",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from transformers import GPT2TokenizerFast\n",
|
|
"\n",
|
|
"hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained(\"gpt2\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "fa5ca643",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Encode the original input `text` (as the other tokenizers do) rather than the\n",
"# decoded output of a previous tokenizer, so this comparison stands on its own\n",
"hf_tokenizer_fast(text)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9",
|
|
"metadata": {},
|
|
"source": [
|
|
"<br>\n",
|
|
" \n",
|
|
"\n",
|
|
"## Using my own from-scratch BPE tokenizer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import sys\n",
|
|
"import io\n",
|
|
"import nbformat\n",
|
|
"import types\n",
|
|
"\n",
|
|
"def import_from_notebook():\n",
"    \"\"\"Load the `BPETokenizerSimple` definition from the `bpe-from-scratch` notebook.\n",
"\n",
"    The notebook is looked up at `../05_bpe-from-scratch/bpe-from-scratch.ipynb`\n",
"    relative to the current working directory.\n",
"\n",
"    Returns:\n",
"        A module object whose namespace holds whatever the executed cells define.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: If the notebook file does not exist at the resolved path.\n",
"    \"\"\"\n",
"\n",
"    def import_definitions_from_notebook(fullname, names):\n",
"        \"\"\"Execute the code cells of `<fullname>.ipynb` that define any of `names`.\"\"\"\n",
"        path = os.path.normpath(\n",
"            os.path.join(os.getcwd(), \"..\", \"05_bpe-from-scratch\", fullname + \".ipynb\")\n",
"        )\n",
"\n",
"        # Fail early with the resolved path so a wrong working directory is easy to spot\n",
"        if not os.path.exists(path):\n",
"            raise FileNotFoundError(f\"Notebook file not found at: {path}\")\n",
"\n",
"        with io.open(path, \"r\", encoding=\"utf-8\") as f:\n",
"            nb = nbformat.read(f, as_version=4)\n",
"\n",
"        # Create a module to store the imported functions and classes\n",
"        mod = types.ModuleType(fullname)\n",
"        sys.modules[fullname] = mod\n",
"\n",
"        for cell in nb.cells:\n",
"            if cell.cell_type != \"code\":\n",
"                continue\n",
"            cell_code = cell.source\n",
"            # Run a matching cell exactly once, even if it defines several requested names\n",
"            if any(f\"def {name}\" in cell_code or f\"class {name}\" in cell_code for name in names):\n",
"                # NOTE: exec runs the cell source verbatim; only use with trusted notebooks\n",
"                exec(cell_code, mod.__dict__)\n",
"        return mod\n",
"\n",
"    fullname = \"bpe-from-scratch\"\n",
"    names = [\"BPETokenizerSimple\"]\n",
"\n",
"    return import_definitions_from_notebook(fullname, names)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"imported_module = import_from_notebook()\n",
|
|
"# Direct attribute access raises a clear AttributeError if the class was not found\n",
"# in the notebook, instead of binding None and failing later on the call\n",
"BPETokenizerSimple = imported_module.BPETokenizerSimple\n",
|
|
"\n",
|
|
"tokenizer_gpt2 = BPETokenizerSimple()\n",
|
|
"tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
|
|
" vocab_path=os.path.join(\"gpt2_model\", \"encoder.json\"),\n",
|
|
" bpe_merges_path=os.path.join(\"gpt2_model\", \"vocab.bpe\")\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"integers = tokenizer_gpt2.encode(text)\n",
|
|
"\n",
|
|
"print(integers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
|
|
"metadata": {},
|
|
"source": [
|
|
"<br>\n",
|
|
" \n",
|
|
"\n",
|
|
"## A quick performance benchmark"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "a61bb445-b151-4a2f-8180-d4004c503754",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
|
|
" raw_text = f.read()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Original OpenAI GPT-2 tokenizer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit orig_tokenizer.encode(raw_text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Tiktoken OpenAI GPT-2 tokenizer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "036dd628-3591-46c9-a5ce-b20b105a8062",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit tik_tokenizer.encode(raw_text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0c748de8-273e-42df-b078-3a510106da60",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Hugging Face OpenAI GPT-2 tokenizer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit hf_tokenizer(raw_text)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "d6bfc7f0",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit hf_tokenizer_fast(raw_text)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"id": "da57c95a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "91ac2876-f36e-498c-bd75-8597a39f2d4b",
|
|
"metadata": {},
|
|
"source": [
|
|
"### My own GPT-2 tokenizer (for educational purposes)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"%timeit tokenizer_gpt2.encode(raw_text)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.16"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|