mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-09-01 20:38:11 +00:00
Fix issue: 731 by resolving semantic error (#738)
* fix issue 731 * update test path --------- Co-authored-by: rasbt <mail@sebastianraschka.com>
This commit is contained in:
parent
a200698698
commit
3d43a27ca9
2
.github/workflows/basic-tests-linux-uv.yml
vendored
2
.github/workflows/basic-tests-linux-uv.yml
vendored
@ -66,7 +66,7 @@ jobs:
|
|||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
pytest ch02/05_bpe-from-scratch/tests/tests.py
|
pytest ch02/05_bpe-from-scratch/tests.py
|
||||||
|
|
||||||
- name: Test Selected Bonus Materials
|
- name: Test Selected Bonus Materials
|
||||||
shell: bash
|
shell: bash
|
||||||
|
@ -81,7 +81,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 39,
|
||||||
"id": "8c9bc9e4-120f-4bac-8fa6-6523c568d12e",
|
"id": "8c9bc9e4-120f-4bac-8fa6-6523c568d12e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -109,7 +109,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 40,
|
||||||
"id": "6c586945-d459-4f9a-855d-bf73438ef0e3",
|
"id": "6c586945-d459-4f9a-855d-bf73438ef0e3",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -138,7 +138,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 41,
|
||||||
"id": "0d5b61d9-79a0-48b4-9b3e-64ab595c5b01",
|
"id": "0d5b61d9-79a0-48b4-9b3e-64ab595c5b01",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -382,7 +382,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 42,
|
||||||
"id": "3e4a15ec-2667-4f56-b7c1-34e8071b621d",
|
"id": "3e4a15ec-2667-4f56-b7c1-34e8071b621d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -809,7 +809,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 71,
|
||||||
"id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df",
|
"id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -817,7 +817,7 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"the-verdict.txt already exists in ./the-verdict.txt\n"
|
"the-verdict.txt already exists in ../01_main-chapter-code/the-verdict.txt\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -848,7 +848,7 @@
|
|||||||
" \"the-verdict.txt\"\n",
|
" \"the-verdict.txt\"\n",
|
||||||
" ),\n",
|
" ),\n",
|
||||||
" filename=\"the-verdict.txt\",\n",
|
" filename=\"the-verdict.txt\",\n",
|
||||||
" search_dirs=\".\"\n",
|
" search_dirs=[\"ch02/01_main-chapter-code/\", \"../01_main-chapter-code/\", \".\"]\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"with open(verdict_path, \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
|
"with open(verdict_path, \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
|
||||||
@ -867,7 +867,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 46,
|
||||||
"id": "027348fd-d52f-4396-93dd-38eed142df9b",
|
"id": "027348fd-d52f-4396-93dd-38eed142df9b",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -886,7 +886,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 47,
|
||||||
"id": "f705a283-355e-4460-b940-06bbc2ae4e61",
|
"id": "f705a283-355e-4460-b940-06bbc2ae4e61",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -913,7 +913,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 48,
|
||||||
"id": "3da42d1c-f75c-4ba7-a6c5-4cb8543d4a44",
|
"id": "3da42d1c-f75c-4ba7-a6c5-4cb8543d4a44",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -947,7 +947,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 49,
|
||||||
"id": "e1db5cce-e015-412b-ad56-060b8b638078",
|
"id": "e1db5cce-e015-412b-ad56-060b8b638078",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -967,7 +967,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 50,
|
||||||
"id": "78249752-38d7-47b9-b259-912bcc093dc4",
|
"id": "78249752-38d7-47b9-b259-912bcc093dc4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -987,7 +987,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 51,
|
||||||
"id": "0331d37d-49a3-44f7-9aa9-9834e0938741",
|
"id": "0331d37d-49a3-44f7-9aa9-9834e0938741",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1007,7 +1007,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 52,
|
||||||
"id": "1ed1b344-f7d4-4e9e-ac34-2a04b5c5b7a8",
|
"id": "1ed1b344-f7d4-4e9e-ac34-2a04b5c5b7a8",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1043,7 +1043,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": 53,
|
||||||
"id": "da0e1faf-1933-43d9-b681-916c282a8f86",
|
"id": "da0e1faf-1933-43d9-b681-916c282a8f86",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1061,7 +1061,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 54,
|
||||||
"id": "8b690e83-5d6b-409a-804e-321c287c24a4",
|
"id": "8b690e83-5d6b-409a-804e-321c287c24a4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1087,7 +1087,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 55,
|
||||||
"id": "2b9e6289-92cb-4d88-b3c8-e836d7c8095f",
|
"id": "2b9e6289-92cb-4d88-b3c8-e836d7c8095f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1142,7 +1142,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 56,
|
||||||
"id": "c7056cb1-a9a3-4cf6-8364-29fb493ae240",
|
"id": "c7056cb1-a9a3-4cf6-8364-29fb493ae240",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1152,7 +1152,7 @@
|
|||||||
"'This is some text.'"
|
"'This is some text.'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 16,
|
"execution_count": 56,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -1165,7 +1165,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": 57,
|
||||||
"id": "37bc6753-8f35-4ec7-b23e-df4a12103cb4",
|
"id": "37bc6753-8f35-4ec7-b23e-df4a12103cb4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1175,7 +1175,7 @@
|
|||||||
"'This is some text with \\n newline characters.'"
|
"'This is some text with \\n newline characters.'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 17,
|
"execution_count": 57,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -1204,7 +1204,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 58,
|
||||||
"id": "955181cb-0910-4c6a-9c22-d8292a3ec1fc",
|
"id": "955181cb-0910-4c6a-9c22-d8292a3ec1fc",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -1215,7 +1215,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": 59,
|
||||||
"id": "6e5ccfe7-ac67-42f3-b727-87886a8867f1",
|
"id": "6e5ccfe7-ac67-42f3-b727-87886a8867f1",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -1235,7 +1235,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": 60,
|
||||||
"id": "00d9bf8f-756f-48bf-81b8-b890e2c2ef13",
|
"id": "00d9bf8f-756f-48bf-81b8-b890e2c2ef13",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1253,7 +1253,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 21,
|
"execution_count": 61,
|
||||||
"id": "e7addb64-2892-4e1c-85dd-4f5152740099",
|
"id": "e7addb64-2892-4e1c-85dd-4f5152740099",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1263,7 +1263,7 @@
|
|||||||
"'This is some text with \\n newline characters.'"
|
"'This is some text with \\n newline characters.'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 21,
|
"execution_count": 61,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -1293,7 +1293,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": 72,
|
||||||
"id": "b45b4366-2c2b-4309-9a14-febf3add8512",
|
"id": "b45b4366-2c2b-4309-9a14-febf3add8512",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -1310,7 +1310,7 @@
|
|||||||
"# Download files if not already present in this directory\n",
|
"# Download files if not already present in this directory\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Define the directories to search and the files to download\n",
|
"# Define the directories to search and the files to download\n",
|
||||||
"search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
|
"search_directories = [\"ch02/02_bonus_bytepair-encoder/gpt2_model/\", \"../02_bonus_bytepair-encoder/gpt2_model/\", \".\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"files_to_download = {\n",
|
"files_to_download = {\n",
|
||||||
" \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
|
" \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
|
||||||
|
@ -10,7 +10,7 @@ import tiktoken
|
|||||||
|
|
||||||
def import_definitions_from_notebook(fullname, names):
|
def import_definitions_from_notebook(fullname, names):
|
||||||
"""Loads function definitions from a Jupyter notebook file into a module."""
|
"""Loads function definitions from a Jupyter notebook file into a module."""
|
||||||
path = os.path.join(os.path.dirname(__file__), "..", fullname + ".ipynb")
|
path = os.path.join(os.path.dirname(__file__), fullname + ".ipynb")
|
||||||
path = os.path.normpath(path)
|
path = os.path.normpath(path)
|
||||||
|
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
@ -42,12 +42,30 @@ def imported_module():
|
|||||||
return import_definitions_from_notebook(fullname, names)
|
return import_definitions_from_notebook(fullname, names)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def verdict_file(imported_module):
|
||||||
|
"""Fixture to handle downloading The Verdict file."""
|
||||||
|
download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
|
||||||
|
|
||||||
|
verdict_path = download_file_if_absent(
|
||||||
|
url=(
|
||||||
|
"https://raw.githubusercontent.com/rasbt/"
|
||||||
|
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
|
||||||
|
"the-verdict.txt"
|
||||||
|
),
|
||||||
|
filename="the-verdict.txt",
|
||||||
|
search_dirs=["ch02/01_main-chapter-code/", "../01_main-chapter-code/", "."]
|
||||||
|
)
|
||||||
|
|
||||||
|
return verdict_path
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def gpt2_files(imported_module):
|
def gpt2_files(imported_module):
|
||||||
"""Fixture to handle downloading GPT-2 files."""
|
"""Fixture to handle downloading GPT-2 files."""
|
||||||
download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
|
download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
|
||||||
|
|
||||||
search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]
|
search_directories = ["ch02/02_bonus_bytepair-encoder/gpt2_model/", "../02_bonus_bytepair-encoder/gpt2_model/", "."]
|
||||||
files_to_download = {
|
files_to_download = {
|
||||||
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
|
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
|
||||||
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
|
"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
|
||||||
@ -58,22 +76,11 @@ def gpt2_files(imported_module):
|
|||||||
return paths
|
return paths
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_training(imported_module):
|
def test_tokenizer_training(imported_module, verdict_file):
|
||||||
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
|
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
|
||||||
download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
|
|
||||||
|
|
||||||
tokenizer = BPETokenizerSimple()
|
tokenizer = BPETokenizerSimple()
|
||||||
verdict_path = download_file_if_absent(
|
|
||||||
url=(
|
|
||||||
"https://raw.githubusercontent.com/rasbt/"
|
|
||||||
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
|
|
||||||
"the-verdict.txt"
|
|
||||||
),
|
|
||||||
filename="the-verdict.txt",
|
|
||||||
search_dirs="."
|
|
||||||
)
|
|
||||||
|
|
||||||
with open(verdict_path, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
|
with open(verdict_file, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
tokenizer.train(text, vocab_size=1000, allowed_special={"<|endoftext|>"})
|
tokenizer.train(text, vocab_size=1000, allowed_special={"<|endoftext|>"})
|
Loading…
x
Reference in New Issue
Block a user