From a96370df4a9f2370b90bc911209f7cd2495a3fd0 Mon Sep 17 00:00:00 2001
From: "85853890+weezymatt@users.noreply.github.com"
Date: Wed, 9 Jul 2025 21:09:35 -0600
Subject: [PATCH] Fix issue: 731 by resolving the semantic error

---
 .../bpe-from-scratch.ipynb                    | 24 ++++-----
 ch02/05_bpe-from-scratch/{tests => }/tests.py | 50 +++++++++++--------
 2 files changed, 40 insertions(+), 34 deletions(-)
 rename ch02/05_bpe-from-scratch/{tests => }/tests.py (90%)

diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
index 9853ac4..a4a39b8 100644
--- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
+++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "8c9bc9e4-120f-4bac-8fa6-6523c568d12e",
    "metadata": {},
    "outputs": [
@@ -109,7 +109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "6c586945-d459-4f9a-855d-bf73438ef0e3",
    "metadata": {},
    "outputs": [
@@ -138,7 +138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "0d5b61d9-79a0-48b4-9b3e-64ab595c5b01",
    "metadata": {},
    "outputs": [
@@ -382,7 +382,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "3e4a15ec-2667-4f56-b7c1-34e8071b621d",
    "metadata": {},
    "outputs": [],
@@ -809,7 +809,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 36,
    "id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df",
    "metadata": {},
    "outputs": [
@@ -817,7 +817,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "the-verdict.txt already exists in ./the-verdict.txt\n"
+     "the-verdict.txt already exists in ../01_main-chapter-code/the-verdict.txt\n"
     ]
    }
   ],
@@ -848,7 +848,7 @@
    "        \"the-verdict.txt\"\n",
    "    ),\n",
    "    filename=\"the-verdict.txt\",\n",
-   "    search_dirs=\".\"\n",
+   "    search_dirs=[\"../01_main-chapter-code/\", \"ch02/01_main-chapter-code/\", \".\"] \n",
    ")\n",
    "\n",
    "with open(verdict_path, \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
@@ -1293,7 +1293,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 26,
    "id": "b45b4366-2c2b-4309-9a14-febf3add8512",
    "metadata": {},
    "outputs": [
@@ -1310,7 +1310,7 @@
    "# Download files if not already present in this directory\n",
    "\n",
    "# Define the directories to search and the files to download\n",
-   "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
+   "search_directories = [\".\",\"ch02/02_bonus_bytepair-encoder/gpt2_model/\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
    "\n",
    "files_to_download = {\n",
    "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
@@ -1333,7 +1333,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "74306e6c-47d3-45a3-9e0f-93f7303ef601",
    "metadata": {},
    "outputs": [],
@@ -1459,7 +1459,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -1473,7 +1473,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.9"
   }
  },
  "nbformat": 4,
diff --git a/ch02/05_bpe-from-scratch/tests/tests.py b/ch02/05_bpe-from-scratch/tests.py
similarity index 90%
rename from ch02/05_bpe-from-scratch/tests/tests.py
rename to ch02/05_bpe-from-scratch/tests.py
index 97ee010..e82e8cd 100644
--- a/ch02/05_bpe-from-scratch/tests/tests.py
+++ b/ch02/05_bpe-from-scratch/tests.py
@@ -10,7 +10,7 @@ import tiktoken

 def import_definitions_from_notebook(fullname, names):
     """Loads function definitions from a Jupyter notebook file into a module."""
-    path = os.path.join(os.path.dirname(__file__), "..", fullname + ".ipynb")
+    path = os.path.join(os.path.dirname(__file__), fullname + ".ipynb")
     path = os.path.normpath(path)

     if not os.path.exists(path):
@@ -43,26 +43,10 @@ def imported_module():


 @pytest.fixture(scope="module")
-def gpt2_files(imported_module):
-    """Fixture to handle downloading GPT-2 files."""
+def verdict_file(imported_module):
+    """Fixture to handle downloading The Verdict file."""
     download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)

-    search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]
-    files_to_download = {
-        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
-        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
-    }
-    paths = {filename: download_file_if_absent(url, filename, search_directories)
-             for url, filename in files_to_download.items()}
-
-    return paths
-
-
-def test_tokenizer_training(imported_module, gpt2_files):
-    BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
-    download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
-
-    tokenizer = BPETokenizerSimple()
     verdict_path = download_file_if_absent(
         url=(
             "https://raw.githubusercontent.com/rasbt/"
@@ -70,10 +54,33 @@ def test_tokenizer_training(imported_module, gpt2_files):
             "the-verdict.txt"
         ),
         filename="the-verdict.txt",
-        search_dirs="."
+        search_dirs=["ch02/01_main-chapter-code/", "../01_main-chapter-code/", "."]
     )

-    with open(verdict_path, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
+    return verdict_path
+
+
+@pytest.fixture(scope="module")
+def gpt2_files(imported_module):
+    """Fixture to handle downloading GPT-2 files."""
+    download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
+
+    search_directories = ["ch02/02_bonus_bytepair-encoder/gpt2_model/", "../02_bonus_bytepair-encoder/gpt2_model/", "."]
+    files_to_download = {
+        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
+        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json",
+    }
+    paths = {filename: download_file_if_absent(url, filename, search_directories)
+             for url, filename in files_to_download.items()}
+
+    return paths
+
+
+def test_tokenizer_training(imported_module, verdict_file):
+    BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
+    tokenizer = BPETokenizerSimple()
+
+    with open(verdict_file, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
         text = f.read()

     tokenizer.train(text, vocab_size=1000, allowed_special={"<|endoftext|>"})
@@ -91,7 +98,6 @@
     tokenizer2.load_vocab_and_merges(vocab_path="vocab.json", bpe_merges_path="bpe_merges.txt")

     assert tokenizer2.decode(token_ids) == input_text, "Decoded text mismatch after reloading tokenizer."
-

 def test_gpt2_tokenizer_openai_simple(imported_module, gpt2_files):
     BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
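
Note on the fix: judging from the diff, the semantic error was that search_dirs / search_directories pointed at a single directory ("."), so the notebook and the tests only located the-verdict.txt and the GPT-2 vocab files when run from one specific working directory. The patch turns both into lists of candidate directories, moves tests.py up next to the notebook, and drops the ".." from the notebook path accordingly.

The implementation of download_file_if_absent is not part of this patch; the sketch below is only a plausible reconstruction of the behavior the changed call sites rely on. The signature and the "already exists in" message are taken from usage and the cell output above, while the download fallback and the string-tolerance check are assumptions:

    import os
    import urllib.request


    def download_file_if_absent(url, filename, search_dirs):
        # Hypothetical sketch; the real helper is defined in bpe-from-scratch.ipynb.
        if isinstance(search_dirs, str):  # tolerate the old single-string form (assumption)
            search_dirs = [search_dirs]

        # Return the first existing copy among the candidate directories.
        for directory in search_dirs:
            file_path = os.path.join(directory, filename)
            if os.path.exists(file_path):
                print(f"{filename} already exists in {file_path}")
                return file_path

        # No local copy found: download into the current directory (assumption).
        file_path = os.path.join(".", filename)
        urllib.request.urlretrieve(url, file_path)
        print(f"Downloaded {filename} to {file_path}")
        return file_path

With a helper along these lines, the relocated tests should pass both from the repository root (pytest ch02/05_bpe-from-scratch/tests.py) and from inside ch02/05_bpe-from-scratch/, since each search list in the patch contains both relative prefixes.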