Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-08-28 18:40:01 +00:00)
BPE: fixed typo (#492)
* fixed typo
* use rel path if exists
* mod gitignore and use existing vocab files

Co-authored-by: rasbt <mail@sebastianraschka.com>
This commit is contained in:
parent b17d097742
commit 3f9facbc55
.gitignore (vendored) | 7 +++++++
@@ -101,6 +101,13 @@ ch07/02_dataset-utilities/instruction-examples-modified.json
 ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
 ch07/04_preference-tuning-with-dpo/loss-plot.pdf
 
+# Tokenizer files
+ch02/05_bpe-from-scratch/bpe_merges.txt
+ch02/05_bpe-from-scratch/encoder.json
+ch02/05_bpe-from-scratch/vocab.bpe
+ch02/05_bpe-from-scratch/vocab.json
+
+
 # Other
 ch0?/0?_user_interface/.chainlit/
 ch0?/0?_user_interface/chainlit.md
ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -722,14 +722,14 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "if not os.path.exists(\"../01_main-chapter-code/the-verdict.txt\"):\n",
     "    url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
     "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
     "           \"the-verdict.txt\")\n",
-    "    file_path = \"the-verdict.txt\"\n",
+    "    file_path = \"../01_main-chapter-code/the-verdict.txt\"\n",
     "    urllib.request.urlretrieve(url, file_path)\n",
     "\n",
-    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+    "with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
     "    text = f.read()"
    ]
   },
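For readability, the JSON-escaped cell above consolidates to the following standalone Python. It reuses the existing copy of the-verdict.txt from the main chapter folder instead of downloading a duplicate; the relative path assumes the notebook runs from ch02/05_bpe-from-scratch/:

import os
import urllib.request

# Reuse the file from the main chapter folder; download it there only if absent.
file_path = "../01_main-chapter-code/the-verdict.txt"
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()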
@@ -876,7 +876,7 @@
    "id": "252693ee-e806-4dac-ab76-2c69086360f4",
    "metadata": {},
    "source": [
-    "- Note that the vocabulary itself is used in the `decoder()` method, which allows us to map the token IDs back into text:"
+    "- Note that the vocabulary itself is used in the `decode()` method, which allows us to map the token IDs back into text:"
    ]
   },
   {
@@ -1099,24 +1099,34 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "def download_file_if_absent(url, filename):\n",
-    "    if not os.path.exists(filename):\n",
-    "        try:\n",
-    "            with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:\n",
-    "                out_file.write(response.read())\n",
-    "            print(f\"Downloaded {filename}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"Failed to download {filename}. Error: {e}\")\n",
-    "    else:\n",
-    "        print(f\"{filename} already exists\")\n",
+    "def download_file_if_absent(url, filename, search_dirs):\n",
+    "    for directory in search_dirs:\n",
+    "        file_path = os.path.join(directory, filename)\n",
+    "        if os.path.exists(file_path):\n",
+    "            print(f\"{filename} already exists in {file_path}\")\n",
+    "            return file_path\n",
+    "\n",
+    "    target_path = os.path.join(search_dirs[0], filename)\n",
+    "    try:\n",
+    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
+    "            out_file.write(response.read())\n",
+    "        print(f\"Downloaded {filename} to {target_path}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "    return target_path\n",
     "\n",
+    "# Define the directories to search and the files to download\n",
+    "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
+    "\n",
     "files_to_download = {\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json\": \"encoder.json\"\n",
     "}\n",
     "\n",
+    "# Ensure directories exist and download files if needed\n",
+    "paths = {}\n",
     "for url, filename in files_to_download.items():\n",
-    "    download_file_if_absent(url, filename)"
+    "    paths[filename] = download_file_if_absent(url, filename, search_directories)"
    ]
   },
   {
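Unescaped, the new download helper from the hunk above reads as follows: it first searches the given directories for an existing copy (here the bonus byte-pair-encoder folder) and only downloads into the first directory when no copy is found:

import os
import urllib.request

def download_file_if_absent(url, filename, search_dirs):
    # Return the path of an existing copy, if one is found in any search dir.
    for directory in search_dirs:
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            print(f"{filename} already exists in {file_path}")
            return file_path

    # No copy found: download into the first (preferred) directory.
    target_path = os.path.join(search_dirs[0], filename)
    try:
        with urllib.request.urlopen(url) as response, open(target_path, "wb") as out_file:
            out_file.write(response.read())
        print(f"Downloaded {filename} to {target_path}")
    except Exception as e:
        print(f"Failed to download {filename}. Error: {e}")
    return target_path

search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]
files_to_download = {
    "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
    "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
}

paths = {}
for url, filename in files_to_download.items():
    paths[filename] = download_file_if_absent(url, filename, search_directories)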
@@ -1136,7 +1146,7 @@
    "source": [
     "tokenizer_gpt2 = BPETokenizerSimple()\n",
     "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
-    "    vocab_path=\"encoder.json\", bpe_merges_path=\"vocab.bpe\"\n",
+    "    vocab_path=paths[\"encoder.json\"], bpe_merges_path=paths[\"vocab.bpe\"]\n",
     ")"
    ]
   },
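With the paths dict in place, the loader picks up whichever copy was found or downloaded, so the cell works both from a fresh checkout and when the bonus folder already holds the vocab files. A short usage sketch; BPETokenizerSimple and load_vocab_and_merges_from_openai() come from the notebook itself, while the encode() call is an assumption based on the decode() method referenced above:

tokenizer_gpt2 = BPETokenizerSimple()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=paths["encoder.json"], bpe_merges_path=paths["vocab.bpe"]
)

# Round trip: encode a sample string, then map the token IDs back to text.
ids = tokenizer_gpt2.encode("Hello, world!")  # encode() assumed per the notebook's API
print(tokenizer_gpt2.decode(ids))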