Fix issue #731 by resolving the semantic error

This commit is contained in:
85853890+weezymatt@users.noreply.github.com 2025-07-09 21:09:35 -06:00
parent c8c6e7814a
commit a96370df4a
2 changed files with 40 additions and 34 deletions

View File

@ -81,7 +81,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"id": "8c9bc9e4-120f-4bac-8fa6-6523c568d12e",
"metadata": {},
"outputs": [
@ -109,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"id": "6c586945-d459-4f9a-855d-bf73438ef0e3",
"metadata": {},
"outputs": [
@ -138,7 +138,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "0d5b61d9-79a0-48b4-9b3e-64ab595c5b01",
"metadata": {},
"outputs": [
@ -382,7 +382,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "3e4a15ec-2667-4f56-b7c1-34e8071b621d",
"metadata": {},
"outputs": [],
@ -809,7 +809,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 36,
"id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df",
"metadata": {},
"outputs": [
@ -817,7 +817,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"the-verdict.txt already exists in ./the-verdict.txt\n"
"the-verdict.txt already exists in ../01_main-chapter-code/the-verdict.txt\n"
]
}
],
@ -848,7 +848,7 @@
" \"the-verdict.txt\"\n",
" ),\n",
" filename=\"the-verdict.txt\",\n",
" search_dirs=\".\"\n",
" search_dirs=[\"../01_main-chapter-code/\", \"ch02/01_main-chapter-code/\", \".\"] \n",
")\n",
"\n",
"with open(verdict_path, \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
@ -1293,7 +1293,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 26,
"id": "b45b4366-2c2b-4309-9a14-febf3add8512",
"metadata": {},
"outputs": [
@ -1310,7 +1310,7 @@
"# Download files if not already present in this directory\n",
"\n",
"# Define the directories to search and the files to download\n",
"search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
"search_directories = [\".\",\"ch02/02_bonus_bytepair-encoder/gpt2_model/\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
"\n",
"files_to_download = {\n",
" \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
@ -1333,7 +1333,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"id": "74306e6c-47d3-45a3-9e0f-93f7303ef601",
"metadata": {},
"outputs": [],
@ -1459,7 +1459,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "base",
"language": "python",
"name": "python3"
},
@ -1473,7 +1473,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
"version": "3.12.9"
}
},
"nbformat": 4,

View File

@ -10,7 +10,7 @@ import tiktoken
def import_definitions_from_notebook(fullname, names):
"""Loads function definitions from a Jupyter notebook file into a module."""
path = os.path.join(os.path.dirname(__file__), "..", fullname + ".ipynb")
path = os.path.join(os.path.dirname(__file__), fullname + ".ipynb")
path = os.path.normpath(path)
if not os.path.exists(path):
@ -43,26 +43,10 @@ def imported_module():
@pytest.fixture(scope="module")
def gpt2_files(imported_module):
"""Fixture to handle downloading GPT-2 files."""
def verdict_file(imported_module):
"""Fixture to handle downloading The Verdict file."""
download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]
files_to_download = {
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
}
paths = {filename: download_file_if_absent(url, filename, search_directories)
for url, filename in files_to_download.items()}
return paths
def test_tokenizer_training(imported_module, gpt2_files):
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
tokenizer = BPETokenizerSimple()
verdict_path = download_file_if_absent(
url=(
"https://raw.githubusercontent.com/rasbt/"
@ -70,10 +54,33 @@ def test_tokenizer_training(imported_module, gpt2_files):
"the-verdict.txt"
),
filename="the-verdict.txt",
search_dirs="."
search_dirs=["ch02/01_main-chapter-code/", "../01_main-chapter-code/", "."]
)
with open(verdict_path, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
return verdict_path
@pytest.fixture(scope="module")
def gpt2_files(imported_module):
"""Fixture to handle downloading GPT-2 files."""
download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
search_directories = ["ch02/02_bonus_bytepair-encoder/gpt2_model/", "../02_bonus_bytepair-encoder/gpt2_model/", "."]
files_to_download = {
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json",
}
paths = {filename: download_file_if_absent(url, filename, search_directories)
for url, filename in files_to_download.items()}
return paths
def test_tokenizer_training(imported_module, verdict_file):
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
tokenizer = BPETokenizerSimple()
with open(verdict_file, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
text = f.read()
tokenizer.train(text, vocab_size=1000, allowed_special={"<|endoftext|>"})
@ -91,7 +98,6 @@ def test_tokenizer_training(imported_module, gpt2_files):
tokenizer2.load_vocab_and_merges(vocab_path="vocab.json", bpe_merges_path="bpe_merges.txt")
assert tokenizer2.decode(token_ids) == input_text, "Decoded text mismatch after reloading tokenizer."
def test_gpt2_tokenizer_openai_simple(imported_module, gpt2_files):
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)