From a96370df4a9f2370b90bc911209f7cd2495a3fd0 Mon Sep 17 00:00:00 2001
From: "85853890+weezymatt@users.noreply.github.com"
Date: Wed, 9 Jul 2025 21:09:35 -0600
Subject: [PATCH] Fix issue: 731 by resolving the semantic error

---
 .../bpe-from-scratch.ipynb                    | 24 ++++-----
 ch02/05_bpe-from-scratch/{tests => }/tests.py | 50 +++++++++++--------
 2 files changed, 40 insertions(+), 34 deletions(-)
 rename ch02/05_bpe-from-scratch/{tests => }/tests.py (90%)

diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
index 9853ac4..a4a39b8 100644
--- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
+++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "8c9bc9e4-120f-4bac-8fa6-6523c568d12e",
    "metadata": {},
    "outputs": [
@@ -109,7 +109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "6c586945-d459-4f9a-855d-bf73438ef0e3",
    "metadata": {},
    "outputs": [
@@ -138,7 +138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "0d5b61d9-79a0-48b4-9b3e-64ab595c5b01",
    "metadata": {},
    "outputs": [
@@ -382,7 +382,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "3e4a15ec-2667-4f56-b7c1-34e8071b621d",
    "metadata": {},
    "outputs": [],
@@ -809,7 +809,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 36,
    "id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df",
    "metadata": {},
    "outputs": [
@@ -817,7 +817,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "the-verdict.txt already exists in ./the-verdict.txt\n"
+     "the-verdict.txt already exists in ../01_main-chapter-code/the-verdict.txt\n"
     ]
    }
   ],
@@ -848,7 +848,7 @@
    "        \"the-verdict.txt\"\n",
    "    ),\n",
    "    filename=\"the-verdict.txt\",\n",
-   "    search_dirs=\".\"\n",
+   "    search_dirs=[\"../01_main-chapter-code/\", \"ch02/01_main-chapter-code/\", \".\"] \n",
    ")\n",
    "\n",
    "with open(verdict_path, \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
@@ -1293,7 +1293,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 26,
    "id": "b45b4366-2c2b-4309-9a14-febf3add8512",
    "metadata": {},
    "outputs": [
@@ -1310,7 +1310,7 @@
    "# Download files if not already present in this directory\n",
    "\n",
    "# Define the directories to search and the files to download\n",
-   "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
+   "search_directories = [\".\",\"ch02/02_bonus_bytepair-encoder/gpt2_model/\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
    "\n",
    "files_to_download = {\n",
    "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
@@ -1333,7 +1333,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "74306e6c-47d3-45a3-9e0f-93f7303ef601",
    "metadata": {},
    "outputs": [],
@@ -1459,7 +1459,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -1473,7 +1473,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.9"
   }
  },
  "nbformat": 4,
diff --git a/ch02/05_bpe-from-scratch/tests/tests.py b/ch02/05_bpe-from-scratch/tests.py
similarity index 90%
rename from ch02/05_bpe-from-scratch/tests/tests.py
rename to ch02/05_bpe-from-scratch/tests.py
index 97ee010..e82e8cd 100644
--- a/ch02/05_bpe-from-scratch/tests/tests.py
+++ b/ch02/05_bpe-from-scratch/tests.py
@@ -10,7 +10,7 @@ import tiktoken

 def import_definitions_from_notebook(fullname, names):
     """Loads function definitions from a Jupyter notebook file into a module."""
-    path = os.path.join(os.path.dirname(__file__), "..", fullname + ".ipynb")
+    path = os.path.join(os.path.dirname(__file__), fullname + ".ipynb")
     path = os.path.normpath(path)

     if not os.path.exists(path):
@@ -43,26 +43,10 @@ def imported_module():


 @pytest.fixture(scope="module")
-def gpt2_files(imported_module):
-    """Fixture to handle downloading GPT-2 files."""
+def verdict_file(imported_module):
+    """Fixture to handle downloading The Verdict file."""
     download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)

-    search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]
-    files_to_download = {
-        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
-        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
-    }
-    paths = {filename: download_file_if_absent(url, filename, search_directories)
-             for url, filename in files_to_download.items()}
-
-    return paths
-
-
-def test_tokenizer_training(imported_module, gpt2_files):
-    BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
-    download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
-
-    tokenizer = BPETokenizerSimple()
     verdict_path = download_file_if_absent(
         url=(
             "https://raw.githubusercontent.com/rasbt/"
@@ -70,10 +54,33 @@ def test_tokenizer_training(imported_module, gpt2_files):
             "the-verdict.txt"
         ),
         filename="the-verdict.txt",
-        search_dirs="."
+        search_dirs=["ch02/01_main-chapter-code/", "../01_main-chapter-code/", "."]
     )

-    with open(verdict_path, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
+    return verdict_path
+
+
+@pytest.fixture(scope="module")
+def gpt2_files(imported_module):
+    """Fixture to handle downloading GPT-2 files."""
+    download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
+
+    search_directories = ["ch02/02_bonus_bytepair-encoder/gpt2_model/", "../02_bonus_bytepair-encoder/gpt2_model/", "."]
+    files_to_download = {
+        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
+        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json",
+    }
+    paths = {filename: download_file_if_absent(url, filename, search_directories)
+             for url, filename in files_to_download.items()}
+
+    return paths
+
+
+def test_tokenizer_training(imported_module, verdict_file):
+    BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
+    tokenizer = BPETokenizerSimple()
+
+    with open(verdict_file, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
         text = f.read()

     tokenizer.train(text, vocab_size=1000, allowed_special={"<|endoftext|>"})
@@ -91,7 +98,6 @@
     tokenizer2.load_vocab_and_merges(vocab_path="vocab.json", bpe_merges_path="bpe_merges.txt")

     assert tokenizer2.decode(token_ids) == input_text, "Decoded text mismatch after reloading tokenizer."
-

 def test_gpt2_tokenizer_openai_simple(imported_module, gpt2_files):
     BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
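
Note on the fix: judging from the diff, the semantic error was that search_dirs / search_directories pointed at a single directory ("."), so the notebook and the tests only located the-verdict.txt and the GPT-2 vocab files when run from one specific working directory. The patch turns both into lists of candidate directories, moves tests.py up next to the notebook, and drops the ".." from the notebook path accordingly.

The implementation of download_file_if_absent is not part of this patch; the sketch below is only a plausible reconstruction of the behavior the changed call sites rely on. The signature and the "already exists in" message are taken from usage and the cell output above, while the download fallback and the string-tolerance check are assumptions:

    import os
    import urllib.request


    def download_file_if_absent(url, filename, search_dirs):
        # Hypothetical sketch; the real helper is defined in bpe-from-scratch.ipynb.
        if isinstance(search_dirs, str):  # tolerate the old single-string form (assumption)
            search_dirs = [search_dirs]

        # Return the first existing copy among the candidate directories.
        for directory in search_dirs:
            file_path = os.path.join(directory, filename)
            if os.path.exists(file_path):
                print(f"{filename} already exists in {file_path}")
                return file_path

        # No local copy found: download into the current directory (assumption).
        file_path = os.path.join(".", filename)
        urllib.request.urlretrieve(url, file_path)
        print(f"Downloaded {filename} to {file_path}")
        return file_path

With a helper along these lines, the relocated tests should pass both from the repository root (pytest ch02/05_bpe-from-scratch/tests.py) and from inside ch02/05_bpe-from-scratch/, since each search list in the patch contains both relative prefixes.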